OILS / data_lang / j8.py View on Github | oilshell.org

1387 lines, 698 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from the generated value_str() helper, called with
    dot=False on the value's tag.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the object behind `val`.

        This Python definition uses id(), which returns the address and can
        be up to 64 bits wide.  In C++ we can use the GC ID instead, which
        fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)

    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' if it has no ID.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # -1 means the value has no identity
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Code points up to 0x7F become one byte; larger ones become a lead byte
    followed by 1, 2 or 3 continuation bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Each continuation byte carries 6 bits; whatever remains above them goes
    # into the lead byte.
    shift = 6 * num_cont_bytes
    lead = (0x1E << (6 - num_cont_bytes)) | ((code >> shift) &
                                             (0x3F >> num_cont_bytes))

    # Mask with 0xFF because Python ints don't wrap around!
    parts = [chr(lead & 0xFF)]

    # Emit continuation bytes from the most significant group down to the
    # least significant one.
    while shift > 0:
        shift -= 6
        parts.append(chr(0x80 | ((code >> shift) & 0x3F)))

    return ''.join(parts)
140
141
# Bit flags for the `options` argument of the printer; combine them with |
SHOW_CYCLES = 1 << 1  # print a cycle as [...] or {...} instead of raising
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex are shown as <Eggex>
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf and nan floats print as null

# Hack until we fully translate: pyj8 has its own LOSSY_JSON, and the two
# values must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print `val` to `buf` with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags, e.g. SHOW_CYCLES | SHOW_NON_DATA
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to `f` on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    buf = mylib.BufWriter()
    _Print(val, buf, -1, options=debug_options)  # -1: all on one line

    f.write(buf.getvalue())
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line printed form of a value as a string.

        Unused.  This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        debug_options = SHOW_CYCLES | SHOW_NON_DATA

        buf = mylib.BufWriter()
        _Print(val, buf, -1, options=debug_options)
        return buf.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to `buf`, quoted unless quotes can be omitted.

    For pp proc, etc.

    Args:
      unquoted_ok: if true and fastfunc.CanOmitQuotes(s) agrees, the string
        is written as-is
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the string as printed by the J8 printer with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the string as printed with the LOSSY_JSON option.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to the BufWriter given to the constructor.  `indent` is the
    number of spaces per nesting level, or -1 to put everything on one line.
    `options` is a bit set of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON and
    INF_NAN_ARE_NULL.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Containers currently being printed, for cycle detection.
        # Key is vm.HeapValueId(val)
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at `level`."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at `level`."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...], recursing into each item."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        is_first = True
        for item in val.items:
            if not is_first:
                self.buf.write(',')
                self._MaybeNewline()
            is_first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, level):
        # type: (Dict[str, value_t], int) -> None
        """Print a str -> value mapping as {key: value, ...}."""
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        is_first = True
        for key, item in iteritems(d):
            if not is_first:
                self.buf.write(',')
                self._MaybeNewline()
            is_first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the mapping held by a Dict."""
        self._PrintMapping(val.d, level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj's mapping, then ' ==> ' and each prototype in turn."""
        self._PrintMapping(val.d, level)

        if not val.prototype:
            return

        self.buf.write(' ==> ')
        self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the {"type": ..., "data": wrapper used for bash-style values.

        Args:
          type_str: already-quoted type name with a trailing comma, e.g.
            '"BashArray",' or '"BashAssoc",'
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapped {"index": "string", ...} mapping."""
        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for index, s in iteritems(val.d):
                if not is_first:
                    self.buf.write(',')
                    self._MaybeNewline()
                is_first = False

                self._ItemIndent(level + 1)
                # Keys are printed as strings
                pyj8.WriteString(mops.ToStr(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapped {"index": "string", ...} mapping.

        Entries that are None are skipped.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            num_printed = 0
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if num_printed != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                num_printed += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapped {"key": "string", ...} mapping."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for k2, v2 in iteritems(val.d):
                if not is_first:
                    self.buf.write(',')
                    self._MaybeNewline()
                is_first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print one value at nesting depth `level`, recursing into containers.

        Raises:
          error.Encode: for an object cycle when SHOW_CYCLES is off, or for a
            non-data value (including Obj) when SHOW_NON_DATA is off
        """
        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    elif fl < 0:
                        s = '-INFINITY'
                    else:
                        s = 'INFINITY'
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

                elif self.options & SHOW_CYCLES:
                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')

                else:
                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

                elif self.options & SHOW_CYCLES:
                    self.buf.write('{...}')

                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if not self.visiting.get(heap_id, False):
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

                elif self.options & SHOW_CYCLES:
                    self.buf.write('{...}')

                else:
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Obj%s in object cycle" %
                                       ValueIdString(val))

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))

                # Similar to = operator, ui.DebugPrint()
                # TODO: that prints value.Range in a special way
                ysh_type = ValType(val)
                # Don't show ID in 'pp test_'
                #id_str = ValueIdString(val)
                self.buf.write('<%s>' % ysh_type)
621
622
class PrettyPrinter(object):
    """Placeholder for a pretty printer that could enhance the = operator.

    Unused right now.  Output would go to a polymorphic ColorOutput.

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set or C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For a first pass: object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output detected from stdout.

        The `buf` argument is currently not used.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
675
676
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string: every token is a (token ID, end position, decoded string or None)
    tuple.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # language name used in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Constructs that are rejected when lexing pure JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any remaining left quote starts a string, which is decoded here
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token ID of the opening quote
          str_pos: position just after the opening quote
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Closing quote: hand back everything accumulated so far
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            # Tokens that can't appear inside a string
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                esc_char = self.s[str_pos + 1]
                part = consts.LookupCharC(esc_char)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                hex_str = self.s[str_pos + 3:str_end - 1]
                code = int(hex_str, 16)

                # Same checks in osh/word_compile.py
                if code > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code and code < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_str, str_end)

                part = Utf8Encode(code)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                hex_str = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % hex_str,
                        str_end)

                code = int(hex_str, 16)
                part = chr(code)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits: \uHHHH\uHHHH
                hi_hex = self.s[str_pos + 2:str_pos + 6]
                lo_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                hi = int(hi_hex, 16) - 0xD800  # high surrogate
                lo = int(lo_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (hi << 10) + lo

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                hex_str = self.s[str_pos + 2:str_end]
                code = int(hex_str, 16)
                part = Utf8Encode(code)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
871
872
class _Parser(object):
    """Base class for the parsers: holds the lexer and the current token.

    The current token is described by tok_id, start_pos, end_pos, and - for
    string tokens - the decoded string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline or comment."""

        # start_pos isn't necessarily the start of the token itself (e.g. a
        # J8_Bool), it's the END of the token before it
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is `tok_id`, then advance.

        Raises:
          error.Decode: if the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) error.Decode located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
923
924
class Parser(_Parser):
    """JSON and JSON8 Parser.

    ParseValue() is the entry point; it raises error.Decode on bad input.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        key = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        # One pair, then another after each comma
        while True:
            k, v = self._ParsePair()
            d[k] = v
            #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        # One value, then another after each comma
        while True:
            items.append(self._ParseValue())

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer for which
            mops.FromStr() raises ValueError
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        if self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # First char is 't' for true
            bool_val = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return bool_val

        if self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            try:
                big = mops.FromStr(int_str)
            except ValueError:
                raise self._ParseError('Integer is too big')
            return value.Int(big)

        if self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(float_str))

        # UString, BString too
        if self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input string as one value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        obj = self._ParseValue()

        extra = len(self.s) - self.start_pos
        if extra != 0:
            #log('n %d pos %d', len(self.s), self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1064
1065
1066class Nil8Parser(_Parser):
1067 """
1068 Tokens not in JSON8:
1069 LParen RParen Symbol
1070
1071 Tokens not in JSON, but in JSON8 and NIL8:
1072 Identifier (unquoted keys)
1073 Ignored_Comment
1074 """
1075
1076 def __init__(self, s, is_j8):
1077 # type: (str, bool) -> None
1078 _Parser.__init__(self, s, is_j8)
1079
1080 if 0:
1081
1082 def _LookAhead(self):
1083 # type: () -> Id_t
1084 """
1085 Don't need this right now
1086 """
1087 end_pos = self.end_pos # look ahead from last token
1088 while True:
1089 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1090 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1091 Id.Ignored_Comment):
1092 break
1093 return tok_id
1094
1095 def _ParseRecord(self):
1096 # type: () -> nvalue_t
1097 """
1098 Yaks
1099 (self->Next) => (-> self Next)
1100 (self->Next obj.field) => ((-> self Next) (. obj field))
1101
1102 Similar to
1103 ((identity identity) 42) => 42 in Clojure
1104
1105 ASDL
1106 (Node left:(. x4beef2))
1107 (Node left !x4beef2)
1108
1109 # Ambiguous because value can be identifier.
1110 # We have to look ahead to and see if there's a colon :
1111 field =
1112 Identifier ':' value
1113 | value
1114
1115 record = '(' head field* ')'
1116
1117 - Identifier | Symbol are treated the same, it's a side effect of
1118 the lexing style
1119 - do positional args come before named args
1120 - () is invalid? Use [] for empty list
1121 """
1122 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1123
1124 items = [] # type: List[nvalue_t]
1125
1126 self._Next()
1127 if self.tok_id == Id.J8_RParen:
1128 self._Next()
1129 return nvalue.List(items)
1130
1131 #log('TOK %s', Id_str(self.tok_id))
1132 while self.tok_id != Id.J8_RParen:
1133 items.append(self._ParseNil8())
1134 #log('TOK 2 %s', Id_str(self.tok_id))
1135
1136 self._Eat(Id.J8_RParen)
1137
1138 return nvalue.List(items)
1139
1140 def _ParseList8(self):
1141 # type: () -> nvalue_t
1142 """
1143 List8 = '[' value* ']'
1144
1145 No commas, not even optional ones for now.
1146 """
1147 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1148
1149 items = [] # type: List[nvalue_t]
1150
1151 self._Next()
1152 if self.tok_id == Id.J8_RBracket:
1153 self._Next()
1154 return nvalue.List(items)
1155
1156 #log('TOK %s', Id_str(self.tok_id))
1157 while self.tok_id != Id.J8_RBracket:
1158 items.append(self._ParseNil8())
1159 #log('TOK 2 %s', Id_str(self.tok_id))
1160
1161 self._Eat(Id.J8_RBracket)
1162
1163 return nvalue.List(items)
1164
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, plus an optional infix operator after it.

    Dispatches on the current token:

    - '(' and '[' are delegated to _ParseRecord() and _ParseList8()
    - null, bool, int, float and string tokens become the matching nvalue
    - an identifier, operator, colon or comma in value position becomes an
      nvalue.Symbol holding the token's source text

    If the token right after that value is an operator, colon or comma, the
    result is instead the 3-element list [op, value, rhs], where rhs comes
    from a recursive call.  For example:

        key: "value"  ->  (: key "value")

    Raises:
      The error built by self._ParseError(), on end of input or on any other
      token that cannot start a value.
    """
    tok = self.tok_id

    if tok == Id.J8_LParen:
        node = self._ParseRecord()  # type: nvalue_t

    elif tok == Id.J8_LBracket:
        node = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok == Id.J8_Null:
        node = nvalue.Null
        self._Next()

    elif tok == Id.J8_Bool:
        # Look at the first character of the token before advancing past it
        node = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Int(int(text))

    elif tok == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Float(float(text))

    elif tok == Id.J8_String:
        # self.decoded must be read before moving to the next token
        node = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif tok in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                 Id.J8_Comma):
        # unquoted "word" treated like a string
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Symbol(text)

    elif tok == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # anything else, e.g. a closing ')' or ']'
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # Common case: the value is not followed by an infix operator
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return node

    # Infix form:  key: "value" -> (: key "value")
    op_text = self.s[self.start_pos:self.end_pos]
    op = nvalue.Symbol(op_text)

    self._Next()
    rhs = self._ParseNil8()

    infix_list = nvalue.List([op, node, rhs])  # type: nvalue_t
    return infix_list
1237
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the whole input as a single NIL8 value.

    Reads the first token, parses one value with _ParseNil8(), and then
    requires that the input is exhausted.

    Raises error.Decode.
    """
    self._Next()  # load the first token

    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    # Something other than end-of-input follows the value
    raise self._ParseError('Unexpected trailing input')
1248
1249
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

        where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not a decoded string, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Set up a parser over the string 's'."""
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and span, labeled 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line of input; may append a line to 'out'.

        - An empty line (optional whitespace, then newline or end of input)
          appends nothing.
        - A line starting with a J8 string appends the decoded string.  Only
          optional whitespace may follow it on the line.
        - A line starting with literal characters appends the raw text from
          the first literal up to the end of the line, with trailing
          whitespace removed.

        Raises:
          The error built by self._ParseError(), if anything other than
          whitespace follows a J8 string on its line.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # optional leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                # The line's text runs right up to the end token
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # None of the line forms in the grammar matched this token
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        lines = []  # type: List[str]

        # The loop only exits once the current token is Id.Eol_Tok, so no
        # separate check for trailing input is needed afterward.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1369
1370
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode the string 's' as J8 Lines and return the list of lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    decoded_lines = lines_parser.Parse()
    return decoded_lines
1385
1386
1387# vim: sw=4