data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1387 lines, 698 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of `val`, for displaying type errors in the UI.

    The name is value_str() applied to the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the object `val`.

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        # This definition only exists when mylib.PYTHON is true.
        # NOTE(review): presumably the translated build supplies its own
        # HeapValueId() -- confirm.
        return id(val)
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; HeapValueId(val) for every other
      type of value.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of `val` for display.  Used by pp value (42) and = 42

    Returns:
      ' 0x<lowercase hex>' (with a leading space) when ValueId() gives an ID,
      or the empty string when ValueId() returns -1.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F give 1 byte, up to 0x7FF 2 bytes, up to 0xFFFF
    3 bytes, and anything larger 4 bytes.  There is no upper bound check
    here; see the comment below.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    # Number of continuation bytes that follow the lead byte
    if code <= 0x7FF:
        n_trailing = 1
    elif code <= 0xFFFF:
        n_trailing = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_trailing = 3

    # Continuation bytes are produced from the low 6 bits upward, so the list
    # is built back to front and reversed at the end.
    encoded = []  # type: List[int]
    for _ in xrange(n_trailing):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    lead = (0x1E << (6 - n_trailing)) | (code & (0x3F >> n_trailing))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    tmp = [chr(b & 0xFF) for b in encoded]
    return ''.join(tmp)
140
141
# Bit flags for the `options` argument of _Print() and InstancePrinter.

# When a List / Dict / Obj is reached again while it's still being printed,
# show it as [...] or {...} instead of raising error.Encode
SHOW_CYCLES = 1 << 1
# Non-data objects like Eggex are shown as <Eggex> instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf, -inf and nan are printed as null

# Hack until we fully translate: pyj8 keeps its own copy of this flag, and the
# two values must agree because self.options is passed to pyj8.WriteString()
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print `val` to `buf` with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the flags SHOW_CYCLES, SHOW_NON_DATA,
               LOSSY_JSON, INF_NAN_ARE_NULL
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print `val` to `buf` with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print `val` to `buf` with the LOSSY_JSON and INF_NAN_ARE_NULL flags.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line, followed by a newline, to `f`.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    flags = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=flags)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line representation of `val` as a string.

        Unused.  This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        flags = SHOW_CYCLES | SHOW_NON_DATA
        out = mylib.BufWriter()
        _Print(val, out, -1, options=flags)
        return out.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string `s` to `buf`, quoted unless that can be skipped.

    For pp proc, etc.

    Args:
      unquoted_ok: if true and fastfunc.CanOmitQuotes(s) says so, `s` is
                   written as is, without quotes
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return `s` printed as a one-line string value, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return `s` printed as a one-line string value, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to `buf`.  `indent` is the number of spaces per nesting level,
    or -1 to put everything on one line with no optional whitespace.
    `options` is a bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON and
    INF_NAN_ARE_NULL.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is vm.HeapValueId(val).  The value is True while that container
        # is being printed, which is how Print() notices cycles.
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item that is inside a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items between [ ], separated by commas."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, level):
        # type: (Dict[str, value_t], int) -> None
        """Print key: value pairs between { }, separated by commas."""
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        self._PrintMapping(val.d, level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print the Obj's own mapping, then ' ==> ' and each prototype."""

        self._PrintMapping(val.d, level)

        if val.prototype:
            self.buf.write(' ==> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper {"type": ..., "data": for Bash containers.

        Args:
          type_str: already quoted, and INCLUDING the trailing comma, e.g.
                    '"BashArray",'
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintStrPair(self, key, s, level):
        # type: (str, str, int) -> None
        """Write one "key": "string" entry of a Bash container's data.

        The entry is nested one level deeper than the wrapper object, hence
        level + 1.
        """
        self._ItemIndent(level + 1)
        pyj8.WriteString(key, self.options, self.buf)

        self.buf.write(':')
        self._MaybeSpace()

        pyj8.WriteString(s, self.options, self.buf)

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print as {"type": "SparseArray", "data": {index: string ...}}."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                # Keys are converted with mops.ToStr() and written as strings
                self._PrintStrPair(mops.ToStr(k), v, level)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print as {"type": "BashArray", "data": {index: string ...}}.

        Entries that are None are skipped, so the indices printed may have
        gaps.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use the index to decide on the comma, because of the
            # skipped entries
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._PrintStrPair(str(i), s, level)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print as {"type": "BashAssoc", "data": {key: string ...}}."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._PrintStrPair(k2, v2, level)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print `val`, which is nested `level` containers deep.

        Raises:
          error.Encode: for a List / Dict / Obj that's part of a cycle when
            SHOW_CYCLES isn't set, and for an Obj or any other non-data value
            when SHOW_NON_DATA isn't set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing, but
                        # the problem is we'd have to show it TWICE to make it
                        # meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
621
622
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced

        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet: does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output detected from stdout.

        Note that `buf` isn't used yet.
        """

        # Or print to stderr?
        f = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, f)

        # Then print those with ASDL
        pass
675
676
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a tuple (token ID, end position,
    decoded string).  The decoded string is None unless the token is
    Id.J8_String.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the whole input to lex
          is_j8: if False, Next() rejects b'' u'' strings, comments, and the
                 j"" prefix
          lang_str: name of the language, used in error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode ending at `end_pos`."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for constructs that aren't allowed when is_j8 is
            False, and for invalid string literals.
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # A left quote means we lex and decode the whole string here, and
        # return it as a single Id.J8_String token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char in an
            unquoted line, and for invalid string literals.
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        # Unlike Next(), all four kinds of left quote are accepted,
        # regardless of self.is_j8
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the left quote token, which selects the string lexer
          str_pos: position right after the left quote

        Returns:
          (Id.J8_String, position after the right quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal code
            point / byte escape.
        """

        # Each iteration lexes one part of the string, and appends its decoded
        # form to self.decoded.  self.pos isn't updated until the right quote,
        # so _Error() reports spans that start at the beginning of the token.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # Skip the backslash to get the escape char
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the 3 chars \u{ at the start, and the } at the end
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the 2 chars \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two \uXXXX escapes in a row: the hex digits are at offsets
                # 2-5 and 8-11
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Skip the 2 chars \u
                # NOTE: no surrogate range check here, unlike Char_UBraced
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
871
872
class _Parser(object):
    """Base class for the parsers in this file.

    It owns the LexerDecoder, and keeps track of the current token: its ID,
    its span in the input, and its decoded value if it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline, or comment."""

        while True:
            # The new token starts where the previous one ended.  This is done
            # for ignored tokens too, so start_pos doesn't include them.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
        # Note: the lexer counts lines in self.lexer.cur_line_num, which
        # _ParseError() uses.
        # TODO: The position of the last newline and a token can be used to
        # calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token has another ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error.Decode at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
923
924
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser.  Each _Parse* method expects the current token
    to be the first token of what it parses, and leaves the current token
    right after it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char is enough to tell true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the span of the
            # current token, so this makes the error point at the integer
            # itself, rather than at whatever token follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # Must read self.decoded before _Next() overwrites it
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input left after the
        value.
        """
        self._Next()
        obj = self._ParseValue()

        # The token after the value should start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1064
1065
1066	class Nil8Parser(_Parser):
1067	"""
1068	Tokens not in JSON8:
1069	LParen RParen Symbol
1070
1071	Tokens not in JSON, but in JSON8 and NIL8:
1072	Identifier (unquoted keys)
1073	Ignored_Comment
1074	"""
1075
1076	def __init__(self, s, is_j8):
1077	# type: (str, bool) -> None
1078	_Parser.__init__(self, s, is_j8)
1079
1080	if 0:
1081
1082	def _LookAhead(self):
1083	# type: () -> Id_t
1084	"""
1085	Don't need this right now
1086	"""
1087	end_pos = self.end_pos # look ahead from last token
1088	while True:
1089	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1090	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1091	Id.Ignored_Comment):
1092	break
1093	return tok_id
1094
1095	def _ParseRecord(self):
1096	# type: () -> nvalue_t
1097	"""
1098	Yaks
1099	(self->Next) => (-> self Next)
1100	(self->Next obj.field) => ((-> self Next) (. obj field))
1101
1102	Similar to
1103	((identity identity) 42) => 42 in Clojure
1104
1105	ASDL
1106	(Node left:(. x4beef2))
1107	(Node left !x4beef2)
1108
1109	# Ambiguous because value can be identifier.
1110	# We have to look ahead to and see if there's a colon :
1111	field =
1112	Identifier ':' value
1113	\| value
1114
1115	record = '(' head field* ')'
1116
1117	- Identifier \| Symbol are treated the same, it's a side effect of
1118	the lexing style
1119	- do positional args come before named args
1120	- () is invalid? Use [] for empty list
1121	"""
1122	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1123
1124	items = [] # type: List[nvalue_t]
1125
1126	self._Next()
1127	if self.tok_id == Id.J8_RParen:
1128	self._Next()
1129	return nvalue.List(items)
1130
1131	#log('TOK %s', Id_str(self.tok_id))
1132	while self.tok_id != Id.J8_RParen:
1133	items.append(self._ParseNil8())
1134	#log('TOK 2 %s', Id_str(self.tok_id))
1135
1136	self._Eat(Id.J8_RParen)
1137
1138	return nvalue.List(items)
1139
1140	def _ParseList8(self):
1141	# type: () -> nvalue_t
1142	"""
1143	List8 = '[' value* ']'
1144
1145	No commas, not even optional ones for now.
1146	"""
1147	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1148
1149	items = [] # type: List[nvalue_t]
1150
1151	self._Next()
1152	if self.tok_id == Id.J8_RBracket:
1153	self._Next()
1154	return nvalue.List(items)
1155
1156	#log('TOK %s', Id_str(self.tok_id))
1157	while self.tok_id != Id.J8_RBracket:
1158	items.append(self._ParseNil8())
1159	#log('TOK 2 %s', Id_str(self.tok_id))
1160
1161	self._Eat(Id.J8_RBracket)
1162
1163	return nvalue.List(items)
1164
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value beginning at the current token.

    A value is a record '(...)', a list '[...]', a primitive (null, bool,
    int, float, string), or an unquoted word, which becomes an
    nvalue.Symbol.

    If the value is immediately followed by an operator, colon, or comma
    token, the pair is rewritten to prefix form:

        key: "value"  ->  (: key "value")

    The right operand is parsed by a recursive call, so a chain like
    a : b : c nests to the right.

    Raises:
      The error built by self._ParseError() on end of input or on a token
      that cannot start a value.
    """
    tok_id = self.tok_id

    # Running out of input gets its own message.
    if tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    # Compound values
    if tok_id == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        left = self._ParseList8()

    # Primitives, copied from the J8 parser.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        self._Next()
        left = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # Only the first character is inspected: 't' means true.  Read it
        # before _Next() moves the position.
        is_true = self.s[self.start_pos] == 't'
        self._Next()
        left = nvalue.Bool(is_true)

    elif tok_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(text))

    elif tok_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(text))

    elif tok_id == Id.J8_String:
        # Grab the decoded payload before advancing to the next token.
        decoded = self.decoded
        self._Next()
        left = nvalue.Str(decoded)

    # <- etc.
    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # An unquoted "word" is kept verbatim as a symbol
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Symbol(text)

    else:
        # Anything else, e.g. Id.Unknown_Tok or a stray closing delimiter
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # Must look at self.tok_id again here: the branches above advanced it.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # Infix form: key: "value" -> (: key "value")
    op_text = self.s[self.start_pos:self.end_pos]
    op_sym = nvalue.Symbol(op_text)

    self._Next()
    right = self._ParseNil8()

    prefix_form = nvalue.List([op_sym, left, right])  # type: nvalue_t
    return prefix_form
1237
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one NIL8 value.

    Raises error.Decode.
    """
    self._Next()  # load the first token

    result = self._ParseNil8()

    # The value must be followed by end of input and nothing else.
    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1248
1249
class J8LinesParser(_Parser):
    """Split a newline-separated string into lines, decoding J8 strings.

    The format is specified with a grammar, in order to preserve location
    info and to reduce allocations.  (Note that unquoted_line is really more
    of a LOOP than a grammatical rule.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) More than one string on a line is disallowed, e.g.

          "json" "json2"
          "json" unquoted

    (2) Quotes may appear in the interior of an unquoted line.  In

          foo "" u''

        the "" and u'' are not decoded strings, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__, which is not shown here -- confirm there.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log the current token and its span, tagged with s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseQuotedLine(self, out):
        # type: (List[str]) -> None
        """j8_line, positioned on the J8_String token.  Always appends."""
        # The line is appended before the rest of the line is validated.
        out.append(self.decoded)
        self._NextForLines()

        # Optional trailing whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Nothing but the line terminator may follow the string
        if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        self._NextForLines()  # past the terminator

    def _ParseUnquotedLine(self, out):
        # type: (List[str]) -> None
        """unquoted_line, positioned on the first Lit_Chars.  Always appends.

        Every token up to the terminator is taken literally, e.g.

            ' unquoted "" text on line '

        yields the text with surrounding whitespace removed.
        """
        line_start = self.start_pos

        while True:
            # Remember the token we're leaving, so that trailing whitespace
            # can be stripped once the terminator is found.
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        # By default the line runs up to the terminator; if the final token
        # was whitespace, cut the line off where that whitespace began.
        line_end = self.start_pos
        if last_id == Id.WS_Space:
            line_end = last_start

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past the terminator

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line.  Appends to 'out' unless the line is empty."""
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        tok_id = self.tok_id

        if tok_id in (Id.J8_Newline, Id.Eol_Tok):
            # empty_line contributes nothing
            self._NextForLines()
            return

        if tok_id == Id.J8_String:
            self._ParseQuotedLine(out)
            return

        if tok_id == Id.Lit_Chars:
            self._ParseUnquotedLine(out)
            return

        # No other token kind is handled by the grammar above.
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return its non-empty lines, in order.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive: the loop above only exits on Id.Eol_Tok, so this check
        # is not expected to fire.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1369
1370
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.  Used by @(echo split command sub)

    This is a thin wrapper around J8LinesParser(s).Parse().

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1385
1386
1387	# vim: sw=4