OILS / data_lang / j8.py View on Github | oils.pub

1335 lines, 703 significant
1#!/usr/bin/env python2
2"""j8.py - J8 Notation, a superset of JSON
3
Later:

- Unify with ASDL pretty printing - NIL8
  - {} [] are identical
  - () is for statically typed ASDL data
    (command.Simple blame_tok:(...) words:[ ])
    although we are also using [] for typed ASDL arrays, not just JSON
  - object IDs
    - @ x123 can create an ID
    - ! x123 can reference an ID
  - <> can be for non-J8 data types? For the = operator
  - 'hi \(name)' interpolation is useful for code

- Common between JSON8 and NIL8 - for writing by hand
  - comments - # line or // line (JSON5 uses // line, following JS)
  - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
  - commas
    - JSON8 could have trailing commas rule
    - NIL8 at least has no commas for [1 2 "hi"]
23"""
24
25from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
26from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
27from _devbuild.gen.runtime_asdl import error_code_e
28from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
29
30from core import bash_impl
31from core import error
32from data_lang import pyj8
33# dependency issue: consts.py pulls in frontend/option_def.py
34from frontend import consts
35from frontend import match
36from mycpp import mops
37from mycpp import mylib
38from mycpp.mylib import tagswitch, iteritems, NewDict, log, isinf_, isnan_
39
40import fastfunc
41
42_ = log
43
44from typing import cast, Dict, List, Tuple, Optional
45
46
# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of 'val', for displaying type errors in the UI."""
    tag = val.tag()
    return value_str(tag, dot=False)
53
54
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """
        Return an integer that identifies the object 'val'.

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
65
66
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for an object, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            heap_id = -1
        else:
            heap_id = HeapValueId(val)

    return heap_id
88
89
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of 'val' as ' 0x<hex>', or '' if it has no ID.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id == -1:  # value without identity, e.g. Int
        return ''

    return ' 0x%s' % mylib.hex_lower(heap_id)
98
99
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a unicode code point as a string of utf-8 bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII: a single byte

    # Decide how many continuation bytes follow the leading byte
    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_cont = 3

    # Build the sequence back to front: the low 6 bits go in the last byte
    encoded = []  # type: List[int]
    for _ in xrange(n_cont):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    # Leading byte: length marker in the high bits, remaining payload below
    lead = (0x1E << (6 - n_cont)) | (code & (0x3F >> n_cont))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(x & 0xFF) for x in encoded]
    return ''.join(chars)
133
134
# Bit flags that are OR'd together to form the 'options' argument of _Print()
# and InstancePrinter.

# Show a cycle as [...] or {...} or (...) instead of raising error.Encode
SHOW_CYCLES = 1 << 1
LOSSY_JSON_STRINGS = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # another lossy json issue: inf and NaN print as null

# How to print values that aren't data.  NON_DATA_IS_ERROR is checked first.
NON_DATA_IS_NULL = 1 << 6  # print them as null
NON_DATA_IS_ERROR = 1 << 7  # raise error.Encode
# Otherwise, non-data objects like Eggex will be printed as <Eggex>

# Hack until we fully translate
assert pyj8.LOSSY_JSON_STRINGS == LOSSY_JSON_STRINGS
145
146
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, OR'd together
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
155
156
def PrintMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """ For json8 write (x) and toJson8()

    With type_errors, non-data values are an error; otherwise they are
    printed as null.

    Caller must handle error.Encode
    """
    if type_errors:
        options = NON_DATA_IS_ERROR
    else:
        options = NON_DATA_IS_NULL
    _Print(val, buf, indent, options=options)
169
170
def PrintJsonMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """ For json write (x) and toJson()

    With type_errors, non-data values are an error; otherwise they are
    printed as null.

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    if type_errors:
        non_data = NON_DATA_IS_ERROR
    else:
        non_data = NON_DATA_IS_NULL

    options = LOSSY_JSON_STRINGS | INF_NAN_ARE_NULL | non_data
    _Print(val, buf, indent, options=options)
184
185
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """ For pp test_ (x)

    Writes 'val' on a single line to 'f', followed by a newline.
    """
    # error.Encode should be impossible - we show cycles and non-data
    line = mylib.BufWriter()
    _Print(val, line, -1, options=SHOW_CYCLES)

    f.write(line.getvalue())
    f.write('\n')
197
198
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """ For pp proc, etc.

    Writes 's' to 'buf' -- verbatim when unquoted_ok is set and
    fastfunc.CanOmitQuotes(s) allows it, otherwise as an encoded string on
    one line.
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
208
209
def MaybeEncodeString(s):
    # type: (str) -> str
    """ For write --json8 $s and compexport

    Returns 's' printed as a value.Str on one line, with default options.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
220
221
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """ For write --json

    Returns 's' printed as a value.Str on one line, with LOSSY_JSON_STRINGS.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON_STRINGS)
    encoded = out.getvalue()
    return encoded
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to 'buf'.  Containers that are currently being printed
    are tracked in self.visiting, so that cycles can be detected.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces per nesting level, or -1 for everything on
                  one line
          options: bit flags like SHOW_CYCLES, OR'd together
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is vm.HeapValueId(val)
        # Value is True while that container is being printed
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indent for an item inside a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indent for the closing bracket of a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [ ] with comma-separated items."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Write the entries of 'd' between the 'left' and 'right' delimiters.

        Keys are written with pyj8.WriteString(), and values with Print().
        """
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(left)
            self.buf.write(right)
        else:
            self.buf.write(left)
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as { }."""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Write an Obj as ( ), followed by ' --> ' and its prototype, if any."""

        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            # The prototype chain is printed at the same level
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening { and the "type" and "data" keys of a bash value.

        type_str is written verbatim, so the caller includes the quotes and
        the trailing comma.  The caller writes the data, and then calls
        _PrintBashSuffix().
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(
            type_str)  # "InternalStringArray", "BashArray", or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing } that matches _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as {"type": "BashArray", "data": {index: str}}."""

        self._PrintBashPrefix('"BashArray",', level)

        if bash_impl.BashArray_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k in bash_impl.BashArray_GetKeys(val):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                # The data is nested one level deeper than "type" and "data"
                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                v, error_code = bash_impl.BashArray_GetElement(val, k)
                assert error_code == error_code_e.OK, error_code
                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintInternalStringArray(self, val, level):
        # type: (value.InternalStringArray, int) -> None
        """Write an InternalStringArray like a BashArray, skipping None entries."""

        self._PrintBashPrefix('"InternalStringArray",', level)

        if bash_impl.InternalStringArray_Count(
                val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't test i != 0 for the comma, because entries may be skipped
            first = True
            for i, s in enumerate(
                    bash_impl.InternalStringArray_GetValues(val)):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as {"type": "BashAssoc", "data": {key: str}}."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if bash_impl.BashAssoc_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(bash_impl.BashAssoc_GetDict(val)):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write 'val' to self.buf, recursing into containers.

        Args:
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for an object cycle without SHOW_CYCLES, or for a
                        non-data value with NON_DATA_IS_ERROR
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if isinf_(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif isnan_(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing, but
                        # the problem is we'd have to show it TWICE to make it
                        # meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                else:
                    # Mark the list as visited only while its items are printed
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                # Obj is printed only when neither NON_DATA_* option is set
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't encode value of type Obj")
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                    return

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('(...)')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.InternalStringArray):
                val = cast(value.InternalStringArray, UP_val)
                self._PrintInternalStringArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                # Every other type of value is non-data
                pass  # mycpp workaround
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                else:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
624
625
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects single quoted strings, comments,
                 and the j"" prefix
          lang_str: name of the language, used in error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode that ends at end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        The result is (token ID, end position, decoded string).  The decoded
        string is None unless the token is a string.

        Raises error.Decode, e.g. for J8-only syntax when is_j8 is False.
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises error.Decode for unquoted text that isn't valid UTF-8, and for
        unescaped ASCII control chars.
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token that opened the string, which determines the
                   lexer used for its contents
          str_pos: position right after the opening token

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises error.Decode for EOF inside the string, bad escapes, invalid
        UTF-8, and unescaped ASCII control chars.
        """

        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes like \uXXXX\uXXXX, with 4 hex digits each
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Hex digits after \u
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            # Continue lexing after this part
            str_pos = str_end
820
821
class _Parser(object):
    """Base class for parsers: holds the input, the lexer, and the current token."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8

        # Name of the language, for error messages
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID, its span, and its value if it's a string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""

        # start_pos is always the END of the token before the one being lexed,
        # so when the loop exits, it's where the current token begins.
        skipping = True
        while skipping:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                   Id.Ignored_Comment):
                skipping = False
        # TODO: add Ignored_Newline to count lines, and show line numbers
        # in errors messages.  The position of the last newline and a token
        # can be used to calculate a column number.

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Consume the current token, which must be tok_id, or raise an error."""

        if self.tok_id == tok_id:
            self._Next()
            return

        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Advance one token with the J8 Lines lexer, without skipping anything."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error.Decode at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
872
873
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser: each _Parse* method expects the current token
    to be the first token of its construct, and leaves the current token right
    after the construct.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse string ':' value, and return the key and value."""

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            # A repeated key overwrites the earlier value
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises error.Decode on EOF, or on a token that can't start a value.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char distinguishes the two Bool literals
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            # Take the text of the token before advancing past it
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            ok, big = mops.FromStr2(part)
            if not ok:
                # NOTE(review): _Next() was already called, so this error's
                # span looks like the token AFTER the integer -- confirm
                raise self._ParseError('Integer is too big')
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """ Parse the whole input as a single value.

        Raises error.Decode.
        """
        self._Next()
        obj = self._ParseValue()

        # The token after the value must start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1012
1013
1014class Nil8Parser(_Parser):
1015 """
1016 Tokens not in JSON8:
1017 LParen RParen Symbol
1018
1019 Tokens not in JSON, but in JSON8 and NIL8:
1020 Identifier (unquoted keys)
1021 Ignored_Comment
1022 """
1023
1024 def __init__(self, s, is_j8):
1025 # type: (str, bool) -> None
1026 _Parser.__init__(self, s, is_j8)
1027
1028 if 0:
1029
1030 def _LookAhead(self):
1031 # type: () -> Id_t
1032 """
1033 Don't need this right now
1034 """
1035 end_pos = self.end_pos # look ahead from last token
1036 while True:
1037 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1038 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1039 Id.Ignored_Comment):
1040 break
1041 return tok_id
1042
1043 def _ParseRecord(self):
1044 # type: () -> nvalue_t
1045 """
1046 Yaks
1047 (self->Next) => (-> self Next)
1048 (self->Next obj.field) => ((-> self Next) (. obj field))
1049
1050 Similar to
1051 ((identity identity) 42) => 42 in Clojure
1052
1053 ASDL
1054 (Node left:(. x4beef2))
1055 (Node left !x4beef2)
1056
1057 # Ambiguous because value can be identifier.
1058 # We have to look ahead to and see if there's a colon :
1059 field =
1060 Identifier ':' value
1061 | value
1062
1063 record = '(' head field* ')'
1064
1065 - Identifier | Symbol are treated the same, it's a side effect of
1066 the lexing style
1067 - do positional args come before named args
1068 - () is invalid? Use [] for empty list
1069 """
1070 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1071
1072 items = [] # type: List[nvalue_t]
1073
1074 self._Next()
1075 if self.tok_id == Id.J8_RParen:
1076 self._Next()
1077 return nvalue.List(items)
1078
1079 #log('TOK %s', Id_str(self.tok_id))
1080 while self.tok_id != Id.J8_RParen:
1081 items.append(self._ParseNil8())
1082 #log('TOK 2 %s', Id_str(self.tok_id))
1083
1084 self._Eat(Id.J8_RParen)
1085
1086 return nvalue.List(items)
1087
1088 def _ParseList8(self):
1089 # type: () -> nvalue_t
1090 """
1091 List8 = '[' value* ']'
1092
1093 No commas, not even optional ones for now.
1094 """
1095 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1096
1097 items = [] # type: List[nvalue_t]
1098
1099 self._Next()
1100 if self.tok_id == Id.J8_RBracket:
1101 self._Next()
1102 return nvalue.List(items)
1103
1104 #log('TOK %s', Id_str(self.tok_id))
1105 while self.tok_id != Id.J8_RBracket:
1106 items.append(self._ParseNil8())
1107 #log('TOK 2 %s', Id_str(self.tok_id))
1108
1109 self._Eat(Id.J8_RBracket)
1110
1111 return nvalue.List(items)
1112
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value that begins at the current token.

    The current token selects the kind of value:

    - '(' and '[' are handed to _ParseRecord() and _ParseList8()
    - null, bool, int, float and string tokens become the matching nvalue
      primitive (same approach as the J8 primitives)
    - identifiers, operators, ':' and ',' become an nvalue.Symbol that
      holds the raw token text

    After the value, if the parser is sitting on an operator, ':' or ','
    token, the right-hand side is parsed recursively and the whole thing
    is returned in prefix form:

        key: "value"  ->  (: key "value")

    Raises the error built by self._ParseError() at end of input, or when
    the current token cannot begin a value.
    """
    start_id = self.tok_id

    # Compound values; the sub-parsers do their own token handling.
    if start_id == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif start_id == Id.J8_LBracket:
        left = self._ParseList8()

    # Primitives.
    # TODO: We also want hex literals.
    elif start_id == Id.J8_Null:
        left = nvalue.Null
        self._Next()

    elif start_id == Id.J8_Bool:
        # Inspect the token's first character BEFORE advancing, because
        # _Next() is what moves the parser on to the following token.
        left = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif start_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(text))

    elif start_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(text))

    elif start_id == Id.J8_String:
        # self.decoded is read before _Next(), like the positions above
        left = nvalue.Str(self.decoded)
        self._Next()

    elif start_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                      Id.J8_Comma):
        # An unquoted "word" such as <- is kept as a symbol
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Symbol(text)

    elif start_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # anything else cannot start a value
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(start_id)))

    # No infix operator follows: the value stands alone.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # Infix form: the operator token becomes the head of a 3-item list.
    text = self.s[self.start_pos:self.end_pos]
    op_sym = nvalue.Symbol(text)

    self._Next()
    right = self._ParseNil8()
    folded = nvalue.List([op_sym, left, right])  # type: nvalue_t
    return folded
1185
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one NIL8 value.

    Raises error.Decode.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    # The single value must be followed directly by the end of input.
    if self.tok_id == Id.Eol_Tok:
        return result
    raise self._ParseError('Unexpected trailing input')
1196
1197
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

        where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not a decoded string, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # The meaning of the second argument is defined by _Parser.__init__
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log a label, the current token ID and its span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, including its end token.

        May append a line to 'out':

        - empty line (optional whitespace only): nothing is appended
        - j8_line: the decoded string is appended
        - unquoted_line: the raw text is appended, minus trailing whitespace

        Raises the error built by self._ParseError() when anything other
        than whitespace follows a quoted string on the same line.
        """
        #self._Show('1')

        # Leading whitespace is never part of the result
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # up to the end token

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # A line began with a token that none of the cases above handle
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Decode the whole input and return the list of lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]

        # The loop can only exit once the current token is Id.Eol_Tok, so
        # there is nothing left to check for trailing input afterwards.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1317
1318
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode the string 's' as J8 Lines and return the resulting lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1333
1334
1335# vim: sw=4