OILS / data_lang / j8.py View on Github | oilshell.org

1416 lines, 710 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for showing type errors in the UI.

    The name is value_str() of the value's tag, rendered with dot=False.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying this particular object.

        This definition only exists when mylib.PYTHON is true.  Python's id()
        returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        object_id = id(val)
        return object_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; HeapValueId(val) for every other
      kind of value.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            # Everything else is identified by its heap object
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has no ID.

    Used by pp value (42) and = 42.  Note the leading space in the non-empty
    result, so callers can append it directly after a type name.
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # ValueId() returned -1: there is no ID to show
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F produce 1 byte, up to 0x7FF 2 bytes, up to 0xFFFF
    3 bytes, and everything larger 4 bytes.  There is no range check here:

    What about the check code <= 0x10FFFF ?
    - it happens in statically parsed $'' u''
    - but not dynamically parsed echo -e / printf, following bash/zsh
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    # How many continuation bytes (0b10xxxxxx) follow the lead byte
    n_cont = 3
    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2

    # Emit the low 6-bit groups first; the list is reversed below
    encoded = []  # type: List[int]
    remaining = code
    for _ in xrange(n_cont):
        encoded.append(0x80 | (remaining & 0x3F))
        remaining >>= 6

    # Lead byte: length marker bits, then whatever payload bits are left
    payload_mask = 0x3F >> n_cont
    lead = (0x1E << (6 - n_cont)) | (remaining & payload_mask)
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
140
141
# Bit flags for the 'options' argument of _Print() / InstancePrinter.
# Combine them with |

# A List/Dict/Obj that is part of a cycle is printed as '[ -->ID ]' or
# '{ -->ID }' rather than raising error.Encode
SHOW_CYCLES = 1 << 1
# Non-data objects like Eggex are printed as <Eggex> rather than raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf, -inf and NaN are printed as null

# Hack until we fully translate
# pyj8 has its own copy of this flag; make sure the two agree
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
               INF_NAN_ARE_NULL
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to 'f' on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=debug_options)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line form of a value as a string.

        Unused - disabled by the 'if 0' above.
        This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        debug_options = SHOW_CYCLES | SHOW_NON_DATA

        out = mylib.BufWriter()
        _Print(val, out, -1, options=debug_options)
        return out.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to 'buf', encoded on one line with no options.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output is written
      unquoted_ok: if true, and fastfunc.CanOmitQuotes(s) says so, the string
                   is written as is, without quotes
    """
    write_bare = unquoted_ok and fastfunc.CanOmitQuotes(s)
    if write_bare:
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the one-line encoding of a string, printed with no options.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    str_val = value.Str(s)
    out = mylib.BufWriter()
    _Print(str_val, out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the one-line encoding of a string, printed with LOSSY_JSON.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    str_val = value.Str(s)
    out = mylib.BufWriter()
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # not in the dict yet; this is the default for lookups
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # fully printed once; it may be printed again
244
245
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to self.buf.  When self.indent is -1, everything is
    written on one line; otherwise nested items are indented by self.indent
    spaces per level.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces to indent, or -1 for everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
                   INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state, EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], printing each item recursively."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, level):
        # type: (Dict[str, value_t], int) -> None
        """Write a mapping as {key: value, ...}, printing values recursively."""
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write the entries of a Dict."""
        self._PrintMapping(val.d, level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Write the properties of an Obj, then ' ==> ' and its prototype.

        The prototype chain is followed recursively until there is none.
        """

        self._PrintMapping(val.d, level)

        if val.prototype:
            self.buf.write(' ==> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the start of the {"type": ..., "data": ...} wrapper.

        Args:
          type_str: written verbatim after "type":, so the caller passes both
                    the quotes and the trailing comma, e.g. '"BashArray",'
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing brace of the wrapper from _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Write a SparseArray as a wrapped mapping of index to string.

        The indices are converted with mops.ToStr() and written as strings.
        """

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a wrapped mapping of index to string.

        Entries that are None are left out, so the indices may have gaps.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # We can't test i != 0 for the comma, because entries are skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as a wrapped mapping of string to string."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write one value to self.buf, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List/Dict/Obj in a cycle when SHOW_CYCLES is not
            set, and for an Obj or any other non-data value when
            SHOW_NON_DATA is not set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintObj(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))

                # TODO: cycle detection is a bit wrong, I think because the
                # properties are a Dict[str, value_t], not something with an
                # identity
                #
                # This is only used for pp test_, because SHOW_NON_DATA.
                self.visited[heap_id] = EXPLORING
                self._PrintObj(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
650
651
class PrettyPrinter(object):
    """Placeholder for a pretty printer of values.

    Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing reads it yet
        """
        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output detected from stdout.

        The 'buf' argument is not used.
        """

        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
704
705
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a tuple (token ID, end position, decoded
    string).  The third item is None, except for Id.J8_String tokens, where
    it's the decoded contents of the string literal.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the text to lex
          is_j8: if false, Next() rejects the tokens that aren't part of JSON
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for tokens that aren't allowed in JSON when is_j8 is
            false, and for the errors found by _DecodeString()
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the whole string literal as one token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Tokens come from match.MatchJ8LinesToken(), and there's no is_j8
        check: all 4 kinds of opening quote are decoded.

        Raises:
          error.Decode: for unquoted text that isn't valid UTF-8, for an
            unescaped ASCII control char, and for the errors found by
            _DecodeString()
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which selects the string lexer
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, a code point that's
            too big or in the surrogate range, or a \\y escape in a string
            that isn't b''
        """

        # Each iteration lexes one part of the string, and appends its decoded
        # form to self.decoded, until the closing quote is found
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: return everything accumulated so far

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the result
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip 3 chars at the start and 1 at the end to get the hex
                # digits
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip 2 chars at the start to get the hex digits
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                # A \y escape becomes a single byte
                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2 char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
900
901
class _Parser(object):
    """Token state and helpers shared by Parser and Nil8Parser.

    Holds a LexerDecoder, the current token ID, its span in self.s, and the
    decoded string when the current token is a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8

        # Name of the language, for error messages
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token, filled in by _Next() and _NextForLines()
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # start_pos is the end of the token lexed just before this one,
            # ignored tokens included
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                # TODO: add Ignored_Newline to count lines, and show line
                # numbers in errors messages.  The position of the last
                # newline and a token can be used to calculate a column
                # number.
                continue
            break

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, this takes exactly one token; nothing is skipped.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
952
953
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser.  Every _Parse* method expects self.tok_id to
    be the first token of what it parses, and leaves self.tok_id on the first
    token after it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the text to parse
          is_j8: whether to parse J8 rather than JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one key-value pair of a Dict.

        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse any value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, and on
            an integer that mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is either true or false, so the first char is enough
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the span of the
            # current token, so this makes the error point at the integer
            # itself, not at whatever token follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as exactly one value.

        Raises error.Decode, including when there's input left over after the
        value.
        """
        self._Next()
        obj = self._ParseValue()

        # After a complete parse, the token after the value starts at the end
        # of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1093
1094
1095class Nil8Parser(_Parser):
1096 """
1097 Tokens not in JSON8:
1098 LParen RParen Symbol
1099
1100 Tokens not in JSON, but in JSON8 and NIL8:
1101 Identifier (unquoted keys)
1102 Ignored_Comment
1103 """
1104
    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the text to parse
          is_j8: passed through to _Parser.__init__()
        """
        _Parser.__init__(self, s, is_j8)
1108
    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """Return the ID of the next non-ignored token, without consuming it.

            Don't need this right now - disabled by the 'if 0' above.
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                # Skip spaces, newlines and comments, like _Next() does
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id
1123
1124 def _ParseRecord(self):
1125 # type: () -> nvalue_t
1126 """
1127 Yaks
1128 (self->Next) => (-> self Next)
1129 (self->Next obj.field) => ((-> self Next) (. obj field))
1130
1131 Similar to
1132 ((identity identity) 42) => 42 in Clojure
1133
1134 ASDL
1135 (Node left:(. x4beef2))
1136 (Node left !x4beef2)
1137
1138 # Ambiguous because value can be identifier.
1139 # We have to look ahead to and see if there's a colon :
1140 field =
1141 Identifier ':' value
1142 | value
1143
1144 record = '(' head field* ')'
1145
1146 - Identifier | Symbol are treated the same, it's a side effect of
1147 the lexing style
1148 - do positional args come before named args
1149 - () is invalid? Use [] for empty list
1150 """
1151 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1152
1153 items = [] # type: List[nvalue_t]
1154
1155 self._Next()
1156 if self.tok_id == Id.J8_RParen:
1157 self._Next()
1158 return nvalue.List(items)
1159
1160 #log('TOK %s', Id_str(self.tok_id))
1161 while self.tok_id != Id.J8_RParen:
1162 items.append(self._ParseNil8())
1163 #log('TOK 2 %s', Id_str(self.tok_id))
1164
1165 self._Eat(Id.J8_RParen)
1166
1167 return nvalue.List(items)
1168
def _ParseList8(self):
    # type: () -> nvalue_t
    """Parse a bracketed list and return its elements as an nvalue.List.

    List8 = '[' value* ']'

    Values are separated by whitespace only: there are no commas, not even
    optional ones, for now.

    Precondition: the current token is Id.J8_LBracket.
    """
    assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

    self._Next()  # move past '['

    elements = []  # type: List[nvalue_t]
    if self.tok_id == Id.J8_RBracket:
        # Nothing between the brackets: skip ']' and we're done
        self._Next()
        return nvalue.List(elements)

    # At least one element is present, so parse first and test afterward
    while True:
        elements.append(self._ParseNil8())
        if self.tok_id == Id.J8_RBracket:
            break

    self._Eat(Id.J8_RBracket)
    return nvalue.List(elements)
1193
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, starting at the current token.

    Dispatch on self.tok_id:

    - '(' and '[' are handed to _ParseRecord() and _ParseList8()
    - null, bool, int, float and string tokens become the matching nvalue
    - identifier, operator, ':' and ',' tokens become an nvalue.Symbol that
      holds the token's source text

    If the token right after the value is an operator, ':' or ',', the
    value is the left operand of an infix expression.  The result is then
    the prefix-form list [op, left, right]:

        key: "value"  ->  (: key "value")

    The right operand is parsed by a recursive call, so a chain of
    operators nests to the right.

    Raises the error built by self._ParseError() at end of input, or on a
    token that can't begin a value.
    """
    if self.tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    if self.tok_id == Id.J8_LParen:
        obj = self._ParseRecord()  # type: nvalue_t

    elif self.tok_id == Id.J8_LBracket:
        obj = self._ParseList8()

    # Primitives follow the J8 parser.
    # TODO: We also want hex literals.
    elif self.tok_id == Id.J8_Null:
        self._Next()
        obj = nvalue.Null

    elif self.tok_id == Id.J8_Bool:
        # Only the first character is inspected: 't' means true
        obj = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif self.tok_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Int(int(text))

    elif self.tok_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Float(float(text))

    elif self.tok_id == Id.J8_String:
        # Read self.decoded before advancing to the next token
        obj = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                         Id.J8_Comma):
        # An unquoted "word" is kept as its source text
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Symbol(text)

    else:  # any token that can't start a value, e.g. a closing paren
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # A plain value: nothing infix follows it
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return obj

    # Infix form.  key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()
    right = self._ParseNil8()
    return nvalue.List([op, obj, right])
1266
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as a single NIL8 value.

    Anything left over after that value is an error.

    Raises error.Decode.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()
    if self.tok_id == Id.Eol_Tok:
        return result
    raise self._ParseError('Unexpected trailing input')
1277
1278
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not a decoded string, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Create a parser over the string s."""
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__, which isn't shown here -- presumably it selects
        # J8 rather than plain JSON; confirm there.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and span, labeled s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line, starting at the current token.

        May append a line to 'out': a quoted or unquoted line appends
        exactly one string, and an empty line appends nothing.  On return,
        the token that ended the line has been consumed.

        Raises the error built by self._ParseError() when anything other
        than whitespace follows a quoted string on the same line.
        """
        # Leading whitespace is never part of the line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Note (1) in the class docstring: only one string per line
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # trailing whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # up to the end token

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # No other token is expected at the start of a line
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Decode all lines in the input and return them in order.

        Empty lines don't contribute an entry.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        # This loop only finishes once the current token is Id.Eol_Tok, so
        # there can't be trailing input left to report after it.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1398
1399
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode the string s as J8 Lines and return the list of lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    There are 3 kinds of errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    return parser.Parse()
1414
1415
1416# vim: sw=4