OILS / data_lang / j8.py View on Github | oilshell.org

1326 lines, 686 significant
#!/usr/bin/env python2
"""
j8.py: J8 Notation, a superset of JSON

Later:

- Unify with ASDL pretty printing - NIL8
  - {} [] are identical
  - () is for statically typed ASDL data
    (command.Simple blame_tok:(...) words:[ ])
    although we are also using [] for typed ASDL arrays, not just JSON
  - object IDs
    - @ x123 can create an ID
    - ! x123 can reference an ID
  - <> can be for non-J8 data types? For the = operator
  - 'hi \(name)' interpolation is useful for code

- Common between JSON8 and NIL8 - for writing by hand
  - comments - # line or // line (JSON5 uses // line, following JS)
  - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
  - commas
    - JSON8 could have trailing commas rule
    - NIL8 at least has no commas for [1 2 "hi"]
"""
25
26import math
27
28from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
29from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
30from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
31
32from core import error
33from data_lang import pyj8
34# dependency issue: consts.py pulls in frontend/option_def.py
35from frontend import consts
36from frontend import match
37from mycpp import mops
38from mycpp import mylib
39from mycpp.mylib import tagswitch, iteritems, NewDict, log
40
41import fastfunc
42
43_ = log
44
45from typing import cast, Dict, List, Tuple, Optional
46
47
# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
54
55
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies this value object.

        This Python definition uses id(), which returns the address and can
        be up to 64 bits wide.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
66
67
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same object, e.g. for List, Dict,
       Func, Proc, etc.
    2. Helping to detect object cycles

    Null, Bool, Int, Float and Str get -1: they are immutable values that are
    copied and compared by value, so identity isn't meaningful for them.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is treated conservatively: once there is a small string
            # optimization, some strings will be values, so assume all are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
89
90
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID as ' 0x<hex>', or '' when ValueId() gives -1.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
99
100
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a single code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Anything above 0xFFFF gets a 4-byte sequence;
            bits that don't fit in 4 bytes are masked off.
    """
    num_cont = 0  # how many continuation bytes follow the lead byte

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        num_cont = 1
    elif code <= 0xFFFF:
        num_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont = 3

    # Peel off 6 payload bits at a time, lowest bits first, so the list is
    # built back to front.
    rest = code
    rev_bytes = []  # type: List[int]
    n = 0
    while n < num_cont:
        rev_bytes.append(0x80 | (rest & 0x3F))
        rest >>= 6
        n += 1

    # The lead byte combines the length marker with the remaining high bits.
    lead_marker = 0x1E << (6 - num_cont)
    payload_mask = 0x3F >> num_cont
    rev_bytes.append(lead_marker | (rest & payload_mask))
    rev_bytes.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in rev_bytes]
    return ''.join(chars)
134
135
# Bit flags that can be OR'd together into the 'options' int of _Print().

# Show a cycle as [...] or {...} or (...) rather than raising
SHOW_CYCLES = 1 << 1
# Non-data objects like Eggex can be shown as <Eggex>
SHOW_NON_DATA = 1 << 2
# JSON may lose data about strings
LOSSY_JSON = 1 << 3
# For JSON: print Inf and NaN as null
INF_NAN_ARE_NULL = 1 << 4

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
143
144
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write val to buf using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output goes
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags, e.g. SHOW_CYCLES | SHOW_NON_DATA
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
153
154
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
162
163
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
172
173
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    # Print into a temporary buffer, then copy it to the writer.
    tmp = mylib.BufWriter()
    _Print(val, tmp, -1, options=line_options)

    f.write(tmp.getvalue())
    f.write('\n')
185
186
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line form of val as a string.

        Unused.  This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        repr_options = SHOW_CYCLES | SHOW_NON_DATA
        tmp = mylib.BufWriter()
        _Print(val, tmp, -1, options=repr_options)
        return tmp.getvalue()
198
199
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write string s to buf, quoted unless the quotes may be left off.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output goes
      unquoted_ok: if True, and fastfunc.CanOmitQuotes(s) says so, s is
                   written as is
    """
    write_bare = unquoted_ok and fastfunc.CanOmitQuotes(s)
    if write_bare:
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
209
210
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s printed as a value.Str on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
221
222
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s printed as a value.Str on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
232
233
class InstancePrinter(object):
    """Writes one value tree to a BufWriter as J8/JSON text.

    An instance holds the output buffer, the indent width (-1 means no
    newlines or indentation are written), the option bit flags, and a table
    of the containers currently being printed, which is how cycles are found.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # HeapValueId(val) -> True while that container is being printed
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested one step inside the given level."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for a closing bracket at the given level."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [ item, item, ... ]"""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        first = True
        for item in val.items:
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key: value pairs between the 'left' and 'right' delimiters."""
        self.buf.write(left)

        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()

        first = True
        for k, v in iteritems(d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict with { } delimiters."""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj with ( ) delimiters, then ' --> ' and its prototype."""
        self._PrintMapping(val.d, '(', ')', level)

        proto = val.prototype
        if proto:
            self.buf.write(' --> ')
            self._PrintObj(proto, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening of the {"type": ..., "data": ...} wrapper.

        type_str is written as is, so it must already be quoted and carry
        its trailing comma, e.g. '"BashArray",' or '"BashAssoc",'
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing brace of the wrapper begun by _PrintBashPrefix."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashEntry(self, key, s, level, is_first):
        # type: (str, str, int, bool) -> None
        """Write one "key": "string" entry of a bash container's data."""
        if not is_first:
            self.buf.write(',')
            self._MaybeNewline()

        self._ItemIndent(level + 1)
        pyj8.WriteString(key, self.options, self.buf)

        self.buf.write(':')
        self._MaybeSpace()

        pyj8.WriteString(s, self.options, self.buf)

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray; the data keys are the indices as strings."""
        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for k, v in iteritems(val.d):
                self._PrintBashEntry(mops.ToStr(k), v, level, first)
                first = False

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray; entries that are None are left out."""
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                self._PrintBashEntry(str(i), s, level, first)
                first = False

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc; both keys and values are strings."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for k2, v2 in iteritems(val.d):
                self._PrintBashEntry(k2, v2, level, first)
                first = False

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to the buffer, dispatching on its tag.

        Args:
          val: the value to print
          level: nesting depth, which controls indentation

        Raises:
          error.Encode: for a cycle when SHOW_CYCLES isn't set, and for an
                        Obj or any other non-data value when SHOW_NON_DATA
                        isn't set
        """
        # self.indent == -1 is the special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                int_str = mops.ToStr(val.i)
                self.buf.write(int_str)

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                elif math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    elif fl < 0:
                        s = '-INFINITY'
                    else:
                        s = 'INFINITY'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')
                    return
                else:
                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    self.buf.write('{...}')
                    return
                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    self.buf.write('(...)')
                    return
                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Obj%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
615
616
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string.  Each Next*() call returns (token id, end position, decoded
    string); the last element is None unless the token is a J8_String.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects the J8-only tokens it checks for
          lang_str: language name that is substituted into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Make an error.Decode spanning self.pos to end_pos."""
        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Tokens that are only valid in J8 get a specific message in JSON mode
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodePart(self, tok_id, left_id, str_pos, str_end):
        # type: (Id_t, Id_t, int, int) -> str
        """Decode one token from inside a string into the bytes it denotes.

        Args:
          tok_id: kind of token found between str_pos and str_end
          left_id: the opening quote token, which decides if \\yHH is allowed
          str_pos: where the token starts in self.s
          str_end: where the token ends in self.s
        """
        if tok_id == Id.Lit_Chars:  # JSON and J8
            if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    str_end)
            return self.s[str_pos:str_end]

        # TODO: would be nice to avoid allocation in all these cases.
        # But LookupCharC() would have to change.

        if tok_id == Id.Char_OneChar:  # JSON and J8
            ch = self.s[str_pos + 1]  # the char after the backslash
            return consts.LookupCharC(ch)

        if tok_id == Id.Char_UBraced:  # J8 only
            # The hex digits between \u{ and }
            h = self.s[str_pos + 3:str_end - 1]
            i = int(h, 16)

            # Same checks in osh/word_compile.py
            if i > 0x10ffff:
                raise self._Error("Code point can't be greater than U+10ffff",
                                  str_end)
            if 0xD800 <= i and i < 0xE000:
                raise self._Error(
                    r"\u{%s} escape is illegal because it's in the surrogate range"
                    % h, str_end)

            return Utf8Encode(i)

        if tok_id == Id.Char_YHex:  # J8 only
            h = self.s[str_pos + 2:str_end]  # the hex digits after \y

            # Same check in osh/word_parse.py
            if left_id != Id.Left_BSingleQuote:
                assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                raise self._Error(
                    r"\y%s escapes not allowed in u'' strings" % h, str_end)

            i = int(h, 16)
            return chr(i)

        if tok_id == Id.Char_SurrogatePair:
            # Two 4-digit \u escapes back to back
            h1 = self.s[str_pos + 2:str_pos + 6]
            h2 = self.s[str_pos + 8:str_pos + 12]

            # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
            i1 = int(h1, 16) - 0xD800  # high surrogate
            i2 = int(h2, 16) - 0xDC00  # low surrogate
            code_point = 0x10000 + (i1 << 10) + i2

            return Utf8Encode(code_point)

        if tok_id == Id.Char_Unicode4:  # JSON only, unpaired
            h = self.s[str_pos + 2:str_end]
            i = int(h, 16)
            return Utf8Encode(i)

        # Should never happen
        raise AssertionError(Id_str(tok_id))

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which picks the string lexer
          str_pos: position just past the opening quote
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: hand back what was accumulated
                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)

            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)

            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Anything else contributes bytes to the decoded string
            part = self._DecodePart(tok_id, left_id, str_pos, str_end)

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
811
812
class _Parser(object):
    """State and token helpers shared by the parsers in this module.

    It owns a LexerDecoder and remembers the current token: its Id, its
    start and end positions, and the decoded string the lexer returned.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""
        while True:
            # The current token starts where the one before it ended
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given Id.

        Raises:
          error.Decode: if the current token has a different Id
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Make an error.Decode that spans the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
863
864
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens from _Parser._Next().  The
    entry point is ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key appears more than once, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        On return, the current token is the first one after the value.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
                        mops.FromStr2() can't convert
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells 'true' apart from 'false'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() reports the span of the
            # current token, and it should be this integer, not what follows.
            ok, big = mops.FromStr2(part)
            if not ok:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # Take the decoded string before _Next() overwrites it
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on any lexing or parsing error, including input left
                        over after the value
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token must start at the end of input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1003
1004
class Nil8Parser(_Parser):
    """
    Parser for NIL8.  The entry point is ParseNil8().

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Return the Id of the next token that isn't ignored, without
            consuming anything.

            Don't need this right now
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Parse '(' ... ')' into an nvalue.List of its items.

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero iterations for (), which gives an empty list
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero iterations for [], which gives an empty list
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator after it if present.

        When the token after the value is an operator, colon, or comma, the
        result is the list (op value operand2), where operand2 is parsed by a
        recursive call.

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Read the first char of the token before advancing
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(text))

        elif self.tok_id == Id.J8_Float:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(text))

        elif self.tok_id == Id.J8_String:
            # Take the decoded string before _Next() overwrites it
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(text)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            # No infix operator follows
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on any lexing or parsing error, including input left
                        over after the value
        """
        self._Next()
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1187
1188
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, with s as a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Nothing is appended for an empty line.

        Raises:
          error.Decode: if something other than whitespace follows a quoted
                        string, or if the lexer rejects a token
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                self._NextForLines()
                return

            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            line_start = self.start_pos

            at_end = False
            while not at_end:
                # Remember the last token on the line, for stripping
                # whitespace
                last_id = self.tok_id
                last_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                at_end = self.tok_id in (Id.J8_Newline, Id.Eol_Tok)

            if last_id == Id.WS_Space:
                line_end = last_start  # remove trailing whitespace
            else:
                line_end = self.start_pos

            out.append(self.s[line_start:line_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the decoded lines.

        Raises:
          error.Decode: on any lexing or parsing error
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # The loop above only ends on Eol_Tok, so this can't fire; it is kept
        # as a check of that invariant.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1308
1309
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into lines with J8LinesParser.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1324
1325
1326# vim: sw=4