OILS / data_lang / j8.py View on Github | oilshell.org

1386 lines, 697 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies a heap-allocated value.

        This Python definition uses id(), which returns the address and can
        be up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Helps detect object cycles.

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for Null, Bool, Int, Float and Str.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)

    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID for display; used by pp value (42) and = 42.

    Returns:
      ' 0x<lowercase hex>' (note the leading space) for values that have an
      ID, or the empty string when ValueId() returns -1.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return the UTF-8 encoded bytes for a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: The code point.  It is NOT range checked here; anything above
            0xFFFF gets the 4-byte form.

    Returns:
      A string of 1 to 4 bytes.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the LOW bits upward, so the list
    # is built backwards and reversed at the end.
    bytes_ = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        low_six = code & 0x3F
        bytes_.append(0x80 | low_six)
        code >>= 6

    # Lead byte: length marker bits, then whatever bits of the code remain
    lead_marker = 0x1E << (6 - num_cont_bytes)
    lead_payload = code & (0x3F >> num_cont_bytes)
    bytes_.append(lead_marker | lead_payload)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = []  # type: List[str]
    for b in bytes_:
        chars.append(chr(b & 0xFF))
    return ''.join(chars)
140
141
# Option bit flags for _Print() and InstancePrinter.  Combine them with |.

# Print a container that's in a cycle as [...] or {...} or (...), instead of
# raising error.Encode
SHOW_CYCLES = 1 << 1
# Print values that have no J8 encoding as <TypeName> (and Obj as a mapping
# in parens), instead of raising error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: infinities and NaN are printed as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two must agree because self.options is passed to pyj8.WriteString()
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write val to buf with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the flags SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
               and INF_NAN_ARE_NULL
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON8; for json8 write (x) and toJson8().

    No option flags are set, so the caller must handle error.Encode.
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON; for json write (x) and toJson().

    Caller must handle error.Encode().
    Doesn't decay to b'' strings - will use Unicode replacement char.
    Infinities and NaN are printed as null.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x).
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    buf = mylib.BufWriter()
    _Print(val, buf, -1, options=line_options)  # -1 means no line breaks

    encoded = buf.getvalue()
    f.write(encoded)
    f.write('\n')
191
192
if 0:  # Disabled: Repr() is currently unused

    def Repr(val):
        # type: (value_t) -> str
        """Return val encoded on one line, like Python's repr().

        Unused.
        """
        # error.Encode should be impossible - we show cycles and non-data
        repr_options = SHOW_CYCLES | SHOW_NON_DATA

        buf = mylib.BufWriter()
        _Print(val, buf, -1, options=repr_options)
        return buf.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write string s to buf, quoted unless that can be avoided.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output is written
      unquoted_ok: if True, s is written verbatim when
                   fastfunc.CanOmitQuotes(s) says that's OK
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a J8 string on one line.

    For write --json8 $s and compexport.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a JSON string on one line, with LOSSY_JSON set.

    For write --json.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to self.buf.  self.indent is the number of spaces per nesting
    level, or -1 to put everything on one line with no optional whitespace.
    self.options is a bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    and INF_NAN_ARE_NULL.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Containers currently being printed, for cycle detection.
        # Key is vm.HeapValueId(val)
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent an item inside a container that is at the given level."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent the closing bracket of a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless we're printing on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless we're printing on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [ item, ... ], recursing through Print()."""
        items = val.items
        if len(items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        first = True
        for item in items:
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key-value pairs between the given left and right brackets.

        Keys are written with pyj8.WriteString(), and values with Print().
        """
        self.buf.write(left)
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()

        first = True
        for key, item in iteritems(d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as { "key": value, ... }"""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as ( "key": value, ... ), then its prototype chain.

        Each prototype is printed the same way, after a ' --> ' separator.
        """
        self._PrintMapping(val.d, '(', ')', level)

        proto = val.prototype
        if proto:
            self.buf.write(' --> ')
            self._PrintObj(proto, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the {"type": ..., "data": wrapper used for bash-style arrays.

        Args:
          type_str: Already quoted, with a trailing comma - e.g.
                    '"BashArray",'.  It's written verbatim.
        """
        buf = self.buf

        buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        buf.write('"type":')
        self._MaybeSpace()
        buf.write(type_str)  # "BashArray", or "BashAssoc",
        self._MaybeNewline()

        self._ItemIndent(level)
        buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a typed wrapper around { "index": "str" }

        Indices are converted with mops.ToStr() and written as strings.
        """
        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for index, s in iteritems(val.d):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a typed wrapper around { "index": "str" }

        Entries that are None are skipped, so only the remaining indices
        appear in the output.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            num_printed = 0
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if num_printed != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                num_printed += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a typed wrapper around { "key": "str" }"""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for key, s in iteritems(val.d):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(key, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print val to self.buf, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth of val, which determines indentation

        Raises:
          error.Encode: for a List, Dict or Obj that is in a cycle when
            SHOW_CYCLES isn't set, and for Obj and any value type not handled
            below when SHOW_NON_DATA isn't set.
        """
        # self.indent == -1 is the special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    elif fl < 0:
                        s = '-INFINITY'
                    else:
                        s = 'INFINITY'
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')
                    return
                else:
                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    self.buf.write('{...}')
                    return
                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False
                elif self.options & SHOW_CYCLES:
                    self.buf.write('(...)')
                    return
                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Obj%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
621
622
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing uses it yet
        """
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # Intended for a first pass that maps object ID -> number of times
        # it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput detected for stdout.

        The buf argument is currently ignored.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
675
676
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string.  Next() and NextForLines() return a tuple of

        (token ID, end position, decoded string or None)

    The decoded string is set only for Id.J8_String tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the string to lex
          is_j8: whether J8-only syntax is accepted by Next()
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0  # where the next token starts

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when self.is_j8 is False, and for
            errors inside string literals.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Pure JSON: reject the syntax that only J8 has
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or ASCII control chars in an
            unquoted line, and for errors inside string literals.
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Returns a string token and updates self.pos

        Args:
          left_id: the token for the opening quote, which selects the lexer
                   for the string body
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF, a bad backslash escape, an unescaped control
            char, invalid UTF-8, or an illegal escape.
        """
        # \u and \U escapes use the JSON-style lexer for double quoted
        # strings, and the J8 one for b'' and u''
        is_double_quoted = left_id in (Id.Left_DoubleQuote,
                                       Id.Left_JDoubleQuote)

        while True:
            if is_double_quoted:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: we're done
                self.pos = str_end

                decoded_str = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, decoded_str

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # The hex digits of \u{123456}
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # The hex digits of \yff
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes back to back, like \ud83e\udd26
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # The hex digits of \u1234
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
871
872
class _Parser(object):
    """Token state and helpers shared by the parsers in this file.

    After _Next() or _NextForLines(), the current token is described by

      self.tok_id     its Id
      self.start_pos  where it starts in self.s
      self.end_pos    where it ends in self.s
      self.decoded    the decoded string returned by the lexer, or None
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline or comment."""

        while True:
            # The end of the previous token is where this one starts.  It's
            # assigned on every iteration, so that start_pos doesn't include
            # the ignored tokens we skip.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break

        # Note: the lexer counts lines as it sees Ignored_Newline, and
        # _ParseError() reports that line number.
        # TODO: The position of the last newline and a token could be used to
        # calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token has the given ID, then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id != tok_id:
            #log('position %r %d-%d %r', self.s, self.start_pos,
            #    self.end_pos, self.s[self.start_pos:self.end_pos])
            expected = Id_str(tok_id)
            actual = Id_str(self.tok_id)
            raise self._ParseError("Expected %s, got %s" % (expected, actual))
        self._Next()

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, this doesn't skip any tokens.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
923
924
class Parser(_Parser):
    """JSON and JSON8 Parser.

    This is a recursive descent parser.  Every _Parse* method expects
    self.tok_id to be the first token of the construct, and leaves it on the
    first token after the construct.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value

        Raises:
          error.Decode: if the key isn't a string, or the colon is missing
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        A key that appears more than once keeps the value of its last pair.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr2() can't convert.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char distinguishes 'true' from 'false'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]

            # Convert BEFORE advancing.  _ParseError() reports start_pos,
            # end_pos and the lexer's line number, and _Next() would move all
            # of them past the integer.
            ok, big = mops.FromStr2(part)
            if not ok:
                raise self._ParseError('Integer is too big')

            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input string as a single value.

        Raises:
          error.Decode: on a lexing or parsing error, or if anything other
            than ignored tokens follows the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token starts at the end of the input only if nothing
        # but ignored tokens followed the value
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1063
1064
1065class Nil8Parser(_Parser):
1066 """
1067 Tokens not in JSON8:
1068 LParen RParen Symbol
1069
1070 Tokens not in JSON, but in JSON8 and NIL8:
1071 Identifier (unquoted keys)
1072 Ignored_Comment
1073 """
1074
1075 def __init__(self, s, is_j8):
1076 # type: (str, bool) -> None
1077 _Parser.__init__(self, s, is_j8)
1078
1079 if 0:
1080
1081 def _LookAhead(self):
1082 # type: () -> Id_t
1083 """
1084 Don't need this right now
1085 """
1086 end_pos = self.end_pos # look ahead from last token
1087 while True:
1088 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1089 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1090 Id.Ignored_Comment):
1091 break
1092 return tok_id
1093
1094 def _ParseRecord(self):
1095 # type: () -> nvalue_t
1096 """
1097 Yaks
1098 (self->Next) => (-> self Next)
1099 (self->Next obj.field) => ((-> self Next) (. obj field))
1100
1101 Similar to
1102 ((identity identity) 42) => 42 in Clojure
1103
1104 ASDL
1105 (Node left:(. x4beef2))
1106 (Node left !x4beef2)
1107
1108 # Ambiguous because value can be identifier.
1109 # We have to look ahead to and see if there's a colon :
1110 field =
1111 Identifier ':' value
1112 | value
1113
1114 record = '(' head field* ')'
1115
1116 - Identifier | Symbol are treated the same, it's a side effect of
1117 the lexing style
1118 - do positional args come before named args
1119 - () is invalid? Use [] for empty list
1120 """
1121 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1122
1123 items = [] # type: List[nvalue_t]
1124
1125 self._Next()
1126 if self.tok_id == Id.J8_RParen:
1127 self._Next()
1128 return nvalue.List(items)
1129
1130 #log('TOK %s', Id_str(self.tok_id))
1131 while self.tok_id != Id.J8_RParen:
1132 items.append(self._ParseNil8())
1133 #log('TOK 2 %s', Id_str(self.tok_id))
1134
1135 self._Eat(Id.J8_RParen)
1136
1137 return nvalue.List(items)
1138
1139 def _ParseList8(self):
1140 # type: () -> nvalue_t
1141 """
1142 List8 = '[' value* ']'
1143
1144 No commas, not even optional ones for now.
1145 """
1146 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1147
1148 items = [] # type: List[nvalue_t]
1149
1150 self._Next()
1151 if self.tok_id == Id.J8_RBracket:
1152 self._Next()
1153 return nvalue.List(items)
1154
1155 #log('TOK %s', Id_str(self.tok_id))
1156 while self.tok_id != Id.J8_RBracket:
1157 items.append(self._ParseNil8())
1158 #log('TOK 2 %s', Id_str(self.tok_id))
1159
1160 self._Eat(Id.J8_RBracket)
1161
1162 return nvalue.List(items)
1163
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one Nil8 value, plus an optional infix operator after it.

    Dispatches on the current token:

    - Id.J8_LParen   -> self._ParseRecord()
    - Id.J8_LBracket -> self._ParseList8()
    - null, bool, int, float and string tokens -> the matching nvalue
    - identifier, operator, colon and comma tokens -> nvalue.Symbol

    If the token that follows the value is an operator, colon or comma,
    the right-hand operand is parsed recursively and the result is the
    prefix form nvalue.List([op, left, right]):

        key: "value"  ->  (: key "value")

    Raises the error built by self._ParseError() on end of input, or on
    any token not listed above.
    """
    tok_id = self.tok_id  # fixed for the whole dispatch chain below

    if tok_id == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        left = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        self._Next()
        left = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # True iff the token text starts with 't'; look before advancing
        is_true = self.s[self.start_pos] == 't'
        self._Next()
        left = nvalue.Bool(is_true)

    elif tok_id == Id.J8_Int:
        int_str = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(int_str))

    elif tok_id == Id.J8_Float:
        float_str = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(float_str))

    elif tok_id == Id.J8_String:
        # Grab the decoded payload before moving to the next token
        decoded = self.decoded
        self._Next()
        left = nvalue.Str(decoded)

    # <- etc.
    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # unquoted "word" treated like a string
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Symbol(word)

    elif tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # any token kind not handled above, e.g. Id.J8_RParen
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands alone
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

    self._Next()
    right = self._ParseNil8()
    infix = nvalue.List([op, left, right])  # type: nvalue_t
    return infix
1236
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one Nil8 value.

    Raises error.Decode.
    """
    self._Next()  # _ParseNil8() dispatches on the current token
    result = self._ParseNil8()

    # Only end of input may follow the value
    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1247
1248
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Create a parser over the string 's'."""
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__, which isn't visible here -- confirm.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token and its span, labeled 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line, starting at the current token.

        May append a line to 'out': the decoded string for a j8_line, or
        the raw text without trailing whitespace for an unquoted_line.
        Nothing is appended for an empty_line.

        In every non-error case, the token that ends the line is also
        stepped over before returning.

        Raises the error built by self._ParseError() when anything other
        than whitespace follows a quoted string on the same line.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # for stripping whitespace: remember the token just before
                # the one that ends the line
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                # the line's text runs right up to the end token
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # NOTE(review): presumably _NextForLines() never leaves any other
        # token kind as the current token -- confirm.
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, returning one string per non-empty line.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]

        # This loop has no 'break', so it can only finish once the current
        # token is Id.Eol_Tok.  A trailing-input check after it could never
        # fire, so there isn't one.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1368
1369
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into J8 Lines.  Used by @(echo split command sub)

    Thin wrapper: builds a J8LinesParser over 's' and returns the result
    of its Parse() method.

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1384
1385
1386# vim: sw=4