OILS / data_lang / j8.py View on Github | oils.pub

1331 lines, 691 significant
1#!/usr/bin/env python2
"""
j8.py: J8 Notation, a superset of JSON

Later:

- Unify with ASDL pretty printing - NIL8
  - {} [] are identical
  - () is for statically typed ASDL data
    (command.Simple blame_tok:(...) words:[ ])
    although we are also using [] for typed ASDL arrays, not just JSON
  - object IDs
    - @ x123 can create an ID
    - ! x123 can reference an ID
  - <> can be for non-J8 data types?  For the = operator
  - 'hi \(name)' interpolation is useful for code

- Common between JSON8 and NIL8 - for writing by hand
  - comments - # line or // line (JSON5 uses // line, following JS)
  - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
  - commas
    - JSON8 could have trailing commas rule
    - NIL8 at least has no commas for [1 2 "hi"]
"""
25
26import math
27
28from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
29from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
30from _devbuild.gen.runtime_asdl import error_code_e
31from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
32
33from core import bash_impl
34from core import error
35from data_lang import pyj8
36# dependency issue: consts.py pulls in frontend/option_def.py
37from frontend import consts
38from frontend import match
39from mycpp import mops
40from mycpp import mylib
41from mycpp.mylib import tagswitch, iteritems, NewDict, log
42
43import fastfunc
44
45_ = log
46
47from typing import cast, Dict, List, Tuple, Optional
48
49
50# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is value_str() of the value's tag, without the dotted prefix.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
56
57
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies a value object.

        This Python version uses id(), which returns the address and can take
        up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
68
69
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for them.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)

    return heap_id
91
92
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value for display.  Used by pp value (42) and = 42

    Returns:
      ' 0x<lowercase hex>' with a leading space, or '' when ValueId() says
      the value has no ID.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
101
102
def Utf8Encode(code):
    # type: (int) -> str
    """Return the UTF-8 encoded bytes for a Unicode code point.

    The code point is not validated here.  What about the check
    code <= 0x10FFFF ?
    - it happens in statically parsed $'' u''
    - but not dynamically parsed echo -e / printf, following bash/zsh
    So everything above 0xFFFF is encoded as 4 bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    # How many continuation bytes follow the leading byte
    num_cont_bytes = 0
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Continuation bytes carry 6 bits each.  They're produced from the low
    # bits upward, so the list is reversed at the end.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: length marker bits, then the leftover high bits of the
    # code point.  The marker has extra high bits that are masked off below.
    marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    bytes_.append(marker | (code & payload_mask))
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = []  # type: List[str]
    for byte in bytes_:
        chars.append(chr(byte & 0xFF))
    return ''.join(chars)
136
137
# Bit flags for the 'options' argument of the printing code below.  They are
# combined with | and tested with &.
SHOW_CYCLES = 1 << 1  # show as [...] or {...} or (...), with object ID
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON

# Hack until we fully translate
# pyj8 has its own LOSSY_JSON; the two values must agree because the same
# options integer is passed to pyj8.WriteString().
assert pyj8.LOSSY_JSON == LOSSY_JSON
145
146
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print a value to buf, using a new InstancePrinter.

    Args:
      val: the value to print
      buf: where the output goes
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES | SHOW_NON_DATA; 0 means none
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
155
156
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as J8.  For json8 write (x) and toJson8()

    No option flags are set.

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
164
165
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as JSON.  For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
174
175
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)

    line = line_buf.getvalue()
    f.write(line)
    f.write('\n')
187
188
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Unused.  Return the value printed on one line.

        This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        out = mylib.BufWriter()
        _Print(val, out, -1, options=SHOW_CYCLES | SHOW_NON_DATA)
        return out.getvalue()
200
201
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to buf as J8.  For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output goes
      unquoted_ok: if True, s is written as-is when
                   fastfunc.CanOmitQuotes(s) allows it
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
211
212
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return a string encoded as J8.  For write --json8 $s and compexport"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
223
224
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return a string encoded as JSON (LOSSY_JSON).  For write --json"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b
    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
234
235
class InstancePrinter(object):
    """Print a value tree as J8/JSON."""

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces per nesting level, or -1 to put everything
                  on one line
          options: bit flags; tested against SHOW_CYCLES, SHOW_NON_DATA and
                   INF_NAN_ARE_NULL here, and passed to pyj8.WriteString()
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Containers that are currently being printed, for cycle detection.
        # Key is vm.HeapValueId(val)
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless we're printing on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless we're printing on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print [] or a comma-separated [ ... ] with one item per line."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for index, elem in enumerate(val.items):
            if index != 0:  # separator goes BEFORE every item but the first
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(elem, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key-value pairs between the given left and right delimiters.

        Shared by Dict, which uses {}, and Obj, which uses ().
        """
        self.buf.write(left)
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()
        num_printed = 0
        for key, item in iteritems(d):
            if num_printed != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

            num_printed += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as { ... }"""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as ( ... ), followed by its prototype chain, if any."""

        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Print the start of the wrapper for bash arrays:

            { "type": <type_str> "data":

        type_str is written as-is: the caller passes it already quoted, and
        with the trailing comma.
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  "SparseArray", or "BashAssoc",
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Print the closing } of the wrapper started by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapped { "index": "string", ... }"""

        self._PrintBashPrefix('"SparseArray",', level)

        if bash_impl.SparseArray_Count(
                val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            num_printed = 0
            for index in bash_impl.SparseArray_GetKeys(val):
                if num_printed != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # The integer index is printed as a quoted string key
                pyj8.WriteString(mops.ToStr(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                item, error_code = bash_impl.SparseArray_GetElement(
                    val, index)
                assert error_code == error_code_e.OK, error_code
                pyj8.WriteString(item, self.options, self.buf)

                num_printed += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapped { "index": "string", ... }

        Entries that are None are skipped.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if bash_impl.BashArray_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use the index to decide on the separator, because None
            # entries are skipped
            need_comma = False
            for index, item in enumerate(bash_impl.BashArray_GetValues(val)):
                if item is None:
                    continue

                if need_comma:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(item, self.options, self.buf)

                need_comma = True

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapped { "key": "string", ... }"""

        self._PrintBashPrefix('"BashAssoc",', level)

        if bash_impl.BashAssoc_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            num_printed = 0
            for key, item in iteritems(bash_impl.BashAssoc_GetDict(val)):
                if num_printed != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(key, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(item, self.options, self.buf)

                num_printed += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value and everything it contains to self.buf.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: on an object cycle when SHOW_CYCLES isn't set, and on
            an Obj or other non-data value when SHOW_NON_DATA isn't set
        """
        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    elif fl < 0:
                        s = '-INFINITY'
                    else:
                        s = 'INFINITY'
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')
                    return
                else:
                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    self.buf.write('{...}')
                    return
                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    self.buf.write('(...)')
                    return
                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Obj%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
620
621
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except that each token comes with an
    optional decoded string, which is set for string tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the entire input
          is_j8: if False, J8-only syntax is rejected by Next()
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error for the span self.pos to end_pos."""

        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Lex one token, and update self.pos.

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None, except for the J8_String tokens produced by _DecodeString().

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if tok_id == Id.Left_DoubleQuote:
            return self._DecodeString(tok_id, end_pos)

        if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
            if not self.is_j8:
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Comment and not self.is_j8:
            raise self._Error(
                "Comments aren't part of JSON; you may want 'json8 read'",
                end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines.

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Lex and decode the rest of a string, and update self.pos.

        Args:
          left_id: the token that opened the string.  It determines which
                   lexer is used for the string body.
          str_pos: position right after the opening token

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode
        """
        is_double_quoted = left_id in (Id.Left_DoubleQuote,
                                       Id.Left_JDoubleQuote)

        while True:
            if is_double_quoted:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Tokens that end the string, either with an error or normally
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                escaped_ch = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped_ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip 3 chars on the left and 1 on the right to get the hex
                # digits
                braced_hex = self.s[str_pos + 3:str_end - 1]
                braced_code = int(braced_hex, 16)

                # Same checks in osh/word_compile.py
                if braced_code > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= braced_code and braced_code < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % braced_hex, str_end)

                part = Utf8Encode(braced_code)

            elif tok_id == Id.Char_YHex:  # J8 only
                byte_hex = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % byte_hex,
                        str_end)

                part = chr(int(byte_hex, 16))

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2 char prefix
                high_hex = self.s[str_pos + 2:str_pos + 6]
                low_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                high = int(high_hex, 16) - 0xD800  # high surrogate
                low = int(low_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (high << 10) + low

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                unit_hex = self.s[str_pos + 2:str_end]
                part = Utf8Encode(int(unit_hex, 16))

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
816
817
class _Parser(object):
    """Base class for the parsers below: a lexer, plus the current token."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the entire input
          is_j8: whether J8 syntax is accepted; also selects "J8" or "JSON"
                 as the language name in error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token, and its span in self.s
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # This isn't the start of a J8_Bool token, it's the END of the
            # token before it
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance past it.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
868
869
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that turns the input into a value_t tree.
    ParseValue() is the entry point.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value

        Returns:
          The decoded key, and the value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key appears more than once, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        On return, the current token is the one after the value.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, or on an
            integer that mops.FromStr2() can't convert
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char of the token tells us if it's true or false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the span of the
            # current token and the lexer's current line, so after _Next()
            # the error would point at the token FOLLOWING the integer.
            ok, big = mops.FromStr2(part)
            if not ok:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on a lexing or parsing error, or if anything other
            than ignored tokens follows the value
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token should start at the end of the
        # input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1008
1009
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces an nvalue_t tree.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """Return the ID of the next significant token, without consuming it.

            Don't need this right now
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse a parenthesized record into an nvalue.List of its items.

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # For (), the loop body never runs.  At EOF, _ParseNil8() raises.
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # For [], the loop body never runs.  At EOF, _ParseNil8() raises.
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator and operand, if any.

        If an operator, colon or comma follows the value, the result is the
        list (op value operand2), where operand2 is parsed recursively.

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char of the token tells us if it's true or false
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(text))

        elif self.tok_id == Id.J8_Float:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(text))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(text)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #next_id = self._LookAhead()
            #print('NEXT %s' % Id_str(next_id))

            #raise AssertionError()
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on a lexing or parsing error, or if another token
            follows the value
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1192
1193
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # J8 Lines always accepts J8 syntax
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token and its span, prefixed by s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line, and advance past the token that ends it.

        May append a line to 'out': empty lines append nothing.

        Raises:
          error.Decode: if something follows a quoted string on the line
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            line_start = self.start_pos

            # Remember the token before the current one, for stripping
            # whitespace
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            while self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                last_id = self.tok_id
                last_start = self.start_pos
                self._NextForLines()

            if last_id == Id.WS_Space:
                line_end = last_start  # remove trailing whitespace
            else:
                line_end = self.start_pos

            out.append(self.s[line_start:line_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, and return the list of decoded lines.

        Raises:
          error.Decode
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1313
1314
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.  Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    return lines_parser.Parse()
1329
1330
1331# vim: sw=4