OILS / data_lang / j8.py View on Github | oilshell.org

1388 lines, 699 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from value_str() with dot=False.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object `val`.

        In Python this is id(), i.e. the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; HeapValueId(val) for every
      other value type.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            # Everything else is identified by its heap object
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has no ID.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)

    # -1 means the value has no heap identity
    if heap_id == -1:
        return ''

    hex_digits = mylib.hex_lower(heap_id)
    return ' 0x%s' % hex_digits
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Anything above 0xFFFF is encoded with 4 bytes;
            there is no upper bound check in this function.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    num_cont_bytes = 0
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits up, so the list is in
    # reverse order until the end.
    bytes_ = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        low_six = code & 0x3F
        bytes_.append(0x80 | low_six)
        code >>= 6

    # The leading byte has a length marker in its high bits, and the
    # remaining high bits of the code point in its low bits.
    marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    bytes_.append(marker | (code & payload_mask))
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    tmp = [chr(b & 0xFF) for b in bytes_]
    return ''.join(tmp)
140
141
# Bit flags that are OR'd together to form the `options` argument of _Print()
# and InstancePrinter.

# Print a cycle as [...] or {...} or (...), rather than raising error.Encode
SHOW_CYCLES = 1 << 1
# Print non-data objects like Eggex as <Eggex>, rather than raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: print inf, -inf, and NaN as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two must stay in sync.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize `val` into `buf` with a new InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES | LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print `val` as JSON8.  For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    # No option flags: cycles and non-data values are errors
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print `val` as JSON.  For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line, followed by a newline.  For pp line (x)"""

    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=debug_options)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return `val` as a one-line string, like Python's repr().

        Unused.
        """
        # error.Encode should be impossible - we show cycles and non-data
        debug_options = SHOW_CYCLES | SHOW_NON_DATA

        out = mylib.BufWriter()
        _Print(val, out, -1, options=debug_options)
        return out.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string `s` to `buf`, quoted if necessary.  For pp proc, etc.

    Args:
      unquoted_ok: if true, a string that fastfunc.CanOmitQuotes() accepts is
                   written as is
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return `s` encoded as a J8 string.  For write --json8 $s and compexport"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return `s` encoded as a JSON string, which may be lossy.

    For write --json
    """

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to self.buf.  If self.indent is -1, everything is printed on
    one line; otherwise each nesting level is indented by self.indent spaces.

    self.options is a set of bit flags: SHOW_CYCLES, SHOW_NON_DATA,
    LOSSY_JSON, INF_NAN_ARE_NULL.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val).  The value is True while that List, Dict,
        # or Obj is being printed, which is how Print() detects cycles.
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless we're printing on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless we're printing on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...]"""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key-value pairs between the `left` and `right` delimiters.

        Shared by Dict, which uses {}, and Obj, which uses ().
        """
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(left)
            self.buf.write(right)
        else:
            self.buf.write(left)
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as (k: v, ...), followed by its prototype chain."""

        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Print the opening of {"type": ..., "data": ...}, up to the data.

        Args:
          type_str: an already quoted type name with a trailing comma, like
                    '"BashArray",'.  It's written as is.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Print the closing brace that matches _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a tagged object.

        The data is an object that maps each index, written as a string, to a
        string.
        """

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a tagged object.

        The data is an object that maps each index, written as a string, to a
        string.  Entries that are None are omitted.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # We can't test i != 0 for the comma, because None entries are
            # skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a tagged object.

        The data is an object that maps strings to strings.
        """

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print `val` to self.buf, recursing into containers.

        Args:
          level: the nesting depth, which determines indentation

        Raises:
          error.Encode: for a List, Dict, or Obj in a cycle, unless
            SHOW_CYCLES is set; and for an Obj or any other non-data type,
            unless SHOW_NON_DATA is set.
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing, but
                        # the problem is we'd have to show it TWICE to make it
                        # meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('(...)')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
622
623
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set, or a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput detected for stdout.

        `buf` isn't used yet.
        """

        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
676
677
class LexerDecoder(object):
    """J8 lexer and string decoder.

    The interface is similar to SimpleLexer, except that each token comes
    with an optional decoded string.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # used in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create an error for the span from self.pos to end_pos."""
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        The third item is the decoded string for J8_String tokens, and None
        for all other tokens.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Reject J8-only syntax when lexing pure JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token for the opening quote, which determines the
                   lexer mode
          str_pos: the position right after the opening quote
        """
        # "" and j"" strings use the JSON string lexer; the rest use J8
        is_json_str = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if is_json_str:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: return what we accumulated
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # The hex digits inside \u{...}
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # The hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two 4-digit hex numbers, each after a 2 char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            self.decoded.write(part)
            str_pos = str_end
872
873
class _Parser(object):
    """Token state shared by the JSON/JSON8, NIL8, and J8 Lines parsers."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token, and its span in self.s
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline, or comment."""

        # start_pos is the END of the token before this one.  Ignored tokens
        # are consumed in this loop, so it ends up at the start of the token
        # we stop on.
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is `tok_id`, then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id != tok_id:
            msg = "Expected %s, got %s" % (Id_str(tok_id),
                                           Id_str(self.tok_id))
            raise self._ParseError(msg)
        self._Next()

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create an error that points at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
924
925
class Parser(_Parser):
    """JSON and JSON8 Parser.

    Each _Parse* method expects self.tok_id to be the first token of what it
    parses, and leaves it at the first token after.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is 'true' or 'false', so the first char is enough.
            # Read it BEFORE _Next() changes start_pos.
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() uses the current token
            # span and line number, so the error points at the integer
            # itself, rather than at whatever follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input string as a single value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token should start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1065
1066
class Nil8Parser(_Parser):
    """
    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Return the next non-ignored token, without consuming it.

            Don't need this right now
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
                    continue
                break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list

        Right now this returns an nvalue.List of everything between the
        parens.
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        children = []  # type: List[nvalue_t]

        self._Next()
        # For an immediate ')', the loop is skipped, and _Eat() consumes it
        while self.tok_id != Id.J8_RParen:
            children.append(self._ParseNil8())

        self._Eat(Id.J8_RParen)

        return nvalue.List(children)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        children = []  # type: List[nvalue_t]

        self._Next()
        # For an immediate ']', the loop is skipped, and _Eat() consumes it
        while self.tok_id != Id.J8_RBracket:
            children.append(self._ParseNil8())

        self._Eat(Id.J8_RBracket)

        return nvalue.List(children)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, and an infix operator if one follows it.

        `a OP b`, where OP is an operator, colon, or comma token, becomes the
        list (OP a b).  The right operand is parsed recursively.
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Read the first char BEFORE _Next() changes start_pos
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_str))

        elif self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_str))

        elif self.tok_id == Id.J8_String:
            # self.decoded belongs to the current token, so use it before
            # advancing
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        # No infix operator follows: we're done
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            return obj

        # key: "value" -> (: key "value")
        op_str = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_str)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input string as a single NIL8 value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseNil8()
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1249
1250
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

    "json" "json2"
    "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Create a parser over the string 's'."""
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__, which isn't shown here - confirm there.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and span, labeled by 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, including its end token.

        May append a line to 'out':

        - empty_line appends nothing
        - j8_line appends the decoded string
        - unquoted_line appends the raw text, without the leading and
          trailing Id.WS_Space tokens

        Raises the error built by self._ParseError() when a quoted string is
        followed by anything other than optional whitespace and an end token.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # for stripping whitespace: remember the token that comes
                # right before the end token
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # text runs up to the end token

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # No other token ID is expected at the start of a line
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Decode every line of the input.

        Returns the decoded lines in order; empty lines are not included.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # The loop has no 'break', so it only exits once the current token is
        # Id.Eol_Tok.  There can't be trailing input left to report here.
        return lines
1370
1371
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1386
1387
1388# vim: sw=4