data_lang/j8.py

OILS / data_lang / j8.py View on Github | oils.pub

1331 lines, 691 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- Unify with ASDL pretty printing - NIL8
8	- {} [] are identical
9	- () is for statically typed ASDL data
10	(command.Simple blame_tok:(...) words:[ ])
11	although we are also using [] for typed ASDL arrays, not just JSON
12	- object IDs
13	- @ x123 can create an ID
14	- ! x123 can reference an ID
15	- <> can be for non-J8 data types? For the = operator
16	- 'hi \(name)' interpolation is useful for code
17
18	- Common between JSON8 and NIL8 - for writing by hand
19	- comments - # line or // line (JSON5 uses // line, following JS)
20	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
21	- commas
22	- JSON8 could have trailing commas rule
23	- NIL8 at least has no commas for [1 2 "hi"]
24	"""
25
26	import math
27
28	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
29	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
30	from _devbuild.gen.runtime_asdl import error_code_e
31	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
32
33	from core import bash_impl
34	from core import error
35	from data_lang import pyj8
36	# dependency issue: consts.py pulls in frontend/option_def.py
37	from frontend import consts
38	from frontend import match
39	from mycpp import mops
40	from mycpp import mylib
41	from mycpp.mylib import tagswitch, iteritems, NewDict, log
42
43	import fastfunc
44
45	_ = log
46
47	from typing import cast, Dict, List, Tuple, Optional
48
49
50	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of 'val', for type errors shown in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
56
57
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object 'val'.

        This Python version uses id(), i.e. the object's address, which can
        be up to 64 bits.  In C++ we can use the GC ID instead, which fits
        within 32 bits.
        """
        address = id(val)
        return address
68
69
def ValueId(val):
    # type: (value_t) -> int
    """Return an identity for 'val', or -1 if it has no notion of identity.

    The integer ID:

    1. Tells you whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helps detect object cycles.

    Null, Bool, Int, Float and Str are immutable values that are copied and
    compared by value, so they get -1.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is treated conservatively: once there is a small string
            # optimization, some strings will be values, so assume all are.
            result = -1
        else:
            result = HeapValueId(val)
    return result
91
92
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the identity of 'val' as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    object_id = ValueId(val)
    if object_id != -1:
        return ' 0x%s' % mylib.hex_lower(object_id)

    # -1 means the value has no identity to show
    return ''
101
102
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    There is no check for code <= 0x10FFFF here:
    - that check happens in statically parsed $'' u''
    - but not in dynamically parsed echo -e / printf, following bash/zsh
    Anything above 0xFFFF is emitted as a 4-byte sequence.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    # How many continuation bytes (10xxxxxx) follow the lead byte
    num_cont_bytes = 0
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Collect bytes from least significant to most significant, then flip.
    encoded = []  # type: List[int]
    remaining = code
    for _ in xrange(num_cont_bytes):
        low_six = remaining & 0x3F
        encoded.append(0x80 | low_six)
        remaining >>= 6

    # Lead byte: length marker in the high bits, leftover payload in the low
    # bits.  The marker may spill past 8 bits; it's masked below.
    marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    encoded.append(marker | (remaining & payload_mask))
    encoded.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
136
137
# Bit flags for the 'options' argument of _Print() and InstancePrinter.

# Print a container that is already being visited as [...] or {...} or (...)
# instead of raising error.Encode.
SHOW_CYCLES = 1 << 1
# Print non-data values as <TypeName>, and Obj as (...), instead of raising
# error.Encode.
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf, -inf and NaN are printed as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# options int is passed straight through to pyj8.WriteString(), so the two
# definitions must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
145
146
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize 'val' into 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitmask of the SHOW_* / LOSSY_JSON / INF_NAN_ARE_NULL flags
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
155
156
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write 'val' to 'buf' as JSON8, with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
164
165
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write 'val' to 'buf' as plain JSON.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
174
175
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write 'val' on a single line, followed by a newline.

    For pp line (x)
    """
    # With both flags set, cycles and non-data values are displayed rather
    # than rejected, so error.Encode should be impossible.
    display_options = SHOW_CYCLES | SHOW_NON_DATA

    line = mylib.BufWriter()
    _Print(val, line, -1, options=display_options)

    f.write(line.getvalue())
    f.write('\n')
187
188
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return 'val' printed on one line, like Python's repr.

        Unused (disabled by the 'if 0' above).
        """
        # With both flags set, cycles and non-data values are displayed
        # rather than rejected, so error.Encode should be impossible.
        display_options = SHOW_CYCLES | SHOW_NON_DATA

        out = mylib.BufWriter()
        _Print(val, out, -1, options=display_options)
        return out.getvalue()
200
201
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string 's' to 'buf', quoted unless the caller allows bare.

    For pp proc, etc.

    Args:
      unquoted_ok: if true and fastfunc.CanOmitQuotes(s) agrees, 's' is
                   written as is, without quotes
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        quoted = value.Str(s)
        _Print(quoted, buf, -1)
211
212
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' printed as a J8 string on one line.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    as_value = value.Str(s)
    _Print(as_value, out, -1)
    return out.getvalue()
223
224
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' printed as a JSON string on one line (LOSSY_JSON set).

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    as_value = value.Str(s)
    _Print(as_value, out, -1, options=LOSSY_JSON)
    return out.getvalue()
234
235
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is appended to 'buf'.  'indent' is the number of spaces per
    nesting level, or -1 to print everything on one line.  'options' is a
    bitmask of the module-level flags SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    and INF_NAN_ARE_NULL; it is also passed through to pyj8.WriteString().

    Print() is the entry point.  It raises error.Encode for values that can't
    be represented with the given options.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is vm.HeapValueId(val)
        # The value is True while that List/Dict/Obj is being printed, and is
        # set back to False once it's done.  Used for cycle detection.
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [] or [ item, item, ... ]."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                # The comma goes BEFORE every item except the first, so
                # there's no trailing comma
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Write key-value pairs between the 'left' and 'right' delimiters.

        Shared by Dict, which uses { }, and Obj, which uses ( ).
        """
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(left)
            self.buf.write(right)
        else:
            self.buf.write(left)
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict with { } delimiters."""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Write an Obj with ( ) delimiters, followed by its prototype chain.

        Each prototype is written after ' --> ', at the same level.
        """

        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            # NOTE(review): this recursion doesn't go through Print(), so the
            # prototype chain isn't covered by the 'visiting' cycle check
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening of a bash value: {"type": <type_str> "data":

        The caller writes the data itself, then calls _PrintBashSuffix().
        type_str is written as is, so it must already be quoted and include
        the trailing comma.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray", "SparseArray", or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing } of the wrapper begun by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Write a SparseArray as {"type": "SparseArray", "data": {...}}.

        The data maps each index, written as a string, to its element.
        """

        self._PrintBashPrefix('"SparseArray",', level)

        if bash_impl.SparseArray_Count(
                val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k in bash_impl.SparseArray_GetKeys(val):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                # The data is nested one level deeper than the wrapper
                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                # k came from GetKeys(), so the lookup is expected to succeed
                v, error_code = bash_impl.SparseArray_GetElement(val, k)
                assert error_code == error_code_e.OK, error_code
                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as {"type": "BashArray", "data": {...}}.

        The data maps each index, written as a string, to its element.
        Entries that are None are omitted.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if bash_impl.BashArray_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't test i != 0 for the comma, because entries may be skipped
            first = True
            for i, s in enumerate(bash_impl.BashArray_GetValues(val)):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as {"type": "BashAssoc", "data": {...}}.

        Both keys and values are written as strings.
        """

        self._PrintBashPrefix('"BashAssoc",', level)

        if bash_impl.BashAssoc_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(bash_impl.BashAssoc_GetDict(val)):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write 'val' to self.buf, recursing into containers.

        Args:
          level: nesting depth of 'val', used for indentation

        Raises:
          error.Encode: if a List, Dict or Obj contains itself and
            SHOW_CYCLES isn't set, or if the value is an Obj or another
            non-data type and SHOW_NON_DATA isn't set
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing, but
                        # the problem is we'd have to show it TWICE to make it
                        # meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                else:
                    # Mark the list only while its items are printed, so the
                    # same list appearing twice without a cycle is still
                    # printed twice
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                # Obj is only printed for display, never as data
                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('(...)')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

            # The bash values below contain only strings, so there's no cycle
            # detection for them.

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
620
621
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a tuple (token ID, end position, decoded
    string).  The last item is None, except for Id.J8_String tokens, where
    it's the decoded contents of the string literal.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the string to lex
          is_j8: if False, Next() rejects the tokens that aren't part of JSON
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode ending at end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises error.Decode for a token that isn't allowed in JSON when
        is_j8 is False, and for errors inside string literals.
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the rest of the literal, and return it as a
        # single J8_String token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        It uses match.MatchJ8LinesToken(), accepts all four kinds of opening
        quote regardless of is_j8, and counts lines with Id.J8_Newline.
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which selects the sub-lexer
          str_pos: where the string contents start, i.e. the end of the
                   opening quote token

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF, a bad backslash escape, an unescaped ASCII
            control char, invalid UTF-8, or an illegal code point
        """

        # Each iteration lexes one part of the string, and appends its decoded
        # form to self.decoded, until the closing quote.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the 3 leading chars \u{ and the trailing }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the 2 leading chars \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two \uXXXX escapes in a row: the hex digits are at offsets
                # 2-6 and 8-12
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                # NOTE(review): there's no surrogate range check here, unlike
                # the Char_UBraced case above -- presumably so that unpaired
                # surrogates in JSON are accepted; confirm
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
816
817
class _Parser(object):
    """Shared state for the parsers below: the input, a lexer, and the
    current token.

    The current token is described by tok_id, start_pos, end_pos, and the
    decoded string, which is only set for string tokens.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        lang_str = "JSON"
        if is_j8:
            lang_str = "J8"

        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.lexer = LexerDecoder(s, is_j8, lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        # Line numbers are counted by the lexer as it sees newlines.
        # TODO: The position of the last newline and a token can be used to
        # calculate a column number for error messages.
        ignored = True
        while ignored:
            # A token starts where the one before it ENDED
            self.start_pos = self.end_pos

            tok_id, end_pos, decoded = self.lexer.Next()
            self.tok_id = tok_id
            self.end_pos = end_pos
            self.decoded = decoded

            ignored = tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                                 Id.Ignored_Comment)

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance past it.

        Raises error.Decode if it's a different token.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer, and don't skip anything."""
        self.start_pos = self.end_pos

        tok_id, end_pos, decoded = self.lexer.NextForLines()
        self.tok_id = tok_id
        self.end_pos = end_pos
        self.decoded = decoded

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
868
869
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens from LexerDecoder.  Each
    _Parse* method expects self.tok_id to be the first token of what it
    parses, and leaves it at the first token after.

    ParseValue() is the entry point.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value

        Returns the decoded key and the value.
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key appears more than once, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """
        value = Dict | List | null | bool | int | float | string

        Raises error.Decode on EOF, on any other token, or if an integer is
        too big.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is either 'true' or 'false', so the first char is
            # enough to tell them apart
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            ok, big = mops.FromStr2(part)
            if not ok:
                # Raise BEFORE advancing, so that the position and line
                # number in the error are those of the integer, not of the
                # token after it.
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # Make the value before _Next() overwrites self.decoded
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode if the input is malformed, or if there's
        anything besides ignored tokens after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token should now be EOF, which starts at the end of
        # the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1008
1009
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces an nvalue_t tree.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment

    ParseNil8() is the entry point.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now

            Returns the ID of the next token that isn't ignored, without
            changing the parser state.
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        What's implemented now is

            record = '(' value* ')'

        which returns an nvalue.List of the items.  The rest of this
        docstring is design notes.

        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
            Identifier ':' value
          | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
          (For now, () is accepted, and gives an empty nvalue.List)
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # On EOF, _ParseNil8() raises, so this loop terminates
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.

        Returns an nvalue.List of the items.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # On EOF, _ParseNil8() raises, so this loop terminates
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one value, which may be followed by an infix operator.

        If the token after the value is an operator, colon or comma, then
        another value is parsed, and the result is the list
        (operator value value2).

        Raises error.Decode on EOF, or on a token that can't start a value.
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            # NOTE(review): Parser._ParseValue() converts with
            # mops.FromStr2() and reports 'Integer is too big', but there's
            # no such check here -- confirm what int() does with a huge
            # literal in the translated code
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # any other token, e.g. Id.Unknown_Tok or a stray Id.J8_RParen
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            # The right operand is parsed recursively, so a chain of
            # operators nests to the right
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """ Parse the whole input as a single NIL8 value.

        Raises error.Decode if the input is malformed, or if there's
        anything besides ignored tokens after the value.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1192
1193
class J8LinesParser(_Parser):
    """Split a newline-separated string into decoded lines.

    The format is described by a grammar so that location info is kept and
    allocations stay low.  (unquoted_line behaves more like a LOOP than a
    grammatical rule, though.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on one line are disallowed, e.g.

        "json" "json2"
        "json" unquoted

    (2) Quotes in the middle of an unquoted line are allowed.  In this line:

        foo "" u''

    the "" and u'' are NOT decoded as strings, because the line began with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the meaning of the second argument is defined by
        # _Parser.__init__, which is not visible in this part of the file.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log the current token ID and its span, tagged 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _AtLineEnd(self):
        # type: () -> bool
        """True when the current token is the 'end' rule of the grammar."""
        return self.tok_id == Id.J8_Newline or self.tok_id == Id.Eol_Tok

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line; append its value to 'out' unless it is empty.

        Raises a parse error (via _ParseError) when a quoted J8 string is
        followed by anything other than optional whitespace and a line end.
        """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # empty_line: nothing to append
        if self._AtLineEnd():
            self._NextForLines()
            return

        # j8_line: a single quoted string
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            # Optional trailing whitespace
            if self.tok_id == Id.WS_Space:
                self._NextForLines()

            if not self._AtLineEnd():
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the line end
            return

        # Anything else must begin an unquoted line
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # unquoted_line, e.g. ' unquoted "" text on line '
        # Read every token until the line end, remembering the token just
        # before it so that trailing whitespace can be dropped.
        line_begin = self.start_pos
        last_id = self.tok_id
        last_start = self.start_pos
        self._NextForLines()

        # It would be nicer if "middle" Id.WS_Space tokens didn't include \r,
        # but this follows the JSON spec definition of whitespace.  (Another
        # data point: CPython on Unix accepts \r in the middle of
        # expressions, treating it as whitespace.)
        while not self._AtLineEnd():
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

        # The line's text stops where the end token starts, unless the token
        # before it was whitespace; in that case stop where the whitespace
        # starts.
        line_end = self.start_pos
        if last_id == Id.WS_Space:
            line_end = last_start

        out.append(self.s[line_begin:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Return the list of lines in the input.

        Raises error.Decode.
        """
        result = []  # type: List[str]

        self._NextForLines()
        while True:
            if self.tok_id == Id.Eol_Tok:
                break
            self._ParseLine(result)

        # Defensive check: the loop above only exits once tok_id is
        # Id.Eol_Tok, so this is not expected to fire.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1313
1314
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into J8 Lines.  Used by @(echo split command sub)

    This is a thin wrapper: it builds a J8LinesParser over 's' and returns
    the result of its Parse() method.

    Raises:
      error.Decode

    There are 3 kinds of errors:
    - a J8 string syntax error inside quotes
    - extra input on a line
    - an unquoted line that isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1329
1330
1331	# vim: sw=4