data_lang/j8.py

OILS / data_lang / j8.py View on Github | oils.pub

1335 lines, 703 significant

1	#!/usr/bin/env python2
2	"""j8.py - J8 Notation, a superset of JSON
3
4	Later:
5
6	- Unify with ASDL pretty printing - NIL8
7	- {} [] are identical
8	- () is for statically typed ASDL data
9	(command.Simple blame_tok:(...) words:[ ])
10	although we are also using [] for typed ASDL arrays, not just JSON
11	- object IDs
12	- @ x123 can create an ID
13	- ! x123 can reference an ID
14	- <> can be for non-J8 data types? For the = operator
15	- 'hi \(name)' interpolation is useful for code
16
17	- Common between JSON8 and NIL8 - for writing by hand
18	- comments - # line or // line (JSON5 uses // line, following JS)
19	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
20	- commas
21	- JSON8 could have trailing commas rule
22	- NIL8 at least has no commas for [1 2 "hi"]
23	"""
24
25	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
26	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
27	from _devbuild.gen.runtime_asdl import error_code_e
28	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
29
30	from core import bash_impl
31	from core import error
32	from data_lang import pyj8
33	# dependency issue: consts.py pulls in frontend/option_def.py
34	from frontend import consts
35	from frontend import match
36	from mycpp import mops
37	from mycpp import mylib
38	from mycpp.mylib import tagswitch, iteritems, NewDict, log, isinf_, isnan_
39
40	import fastfunc
41
42	_ = log
43
44	from typing import cast, Dict, List, Tuple, Optional
45
46
47	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the name of a value's type, for type errors shown in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
53
54
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a value (Python-only definition).

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
65
66
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Null, Bool, Int, Float and Str yield -1.  They are immutable values
    that are copied and compared by value, so they have no such notion.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # of them are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
88
89
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID as ' 0x<hex>', or '' when it has no ID.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # -1 means "no identity"
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
98
99
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Values up to 0x7F produce 1 byte, up to 0x7FF
            2 bytes, up to 0xFFFF 3 bytes, and anything larger 4 bytes.

    Returns:
      A string with one character per encoded byte.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced lowest-order first, so the list is
    # built backwards and reversed at the end.
    rev_bytes = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        rev_bytes.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: a length marker in the high bits, and whatever payload
    # bits are left over in the low bits.
    lead_marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    rev_bytes.append(lead_marker | (code & payload_mask))
    rev_bytes.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    return ''.join([chr(b & 0xFF) for b in rev_bytes])
133
134
# Option bit flags, OR'd together into the 'options' int that is passed to
# _Print() / InstancePrinter and on to pyj8.WriteString().

# Print a container that is already being visited as [...] or {...} or (...)
# instead of raising error.Encode
SHOW_CYCLES = 1 << 1
LOSSY_JSON_STRINGS = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # another lossy json issue: inf and nan become null

# How to print values that aren't data.  NON_DATA_IS_ERROR is checked first
# and raises error.Encode; NON_DATA_IS_NULL prints null.
NON_DATA_IS_NULL = 1 << 6
NON_DATA_IS_ERROR = 1 << 7
# Otherwise, non-data objects like Eggex will be printed as <Eggex>

# Hack until we fully translate: the flag is defined in both modules, so make
# sure the two definitions agree.
assert pyj8.LOSSY_JSON_STRINGS == LOSSY_JSON_STRINGS
145
146
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print a value to 'buf' with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES or LOSSY_JSON_STRINGS
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
155
156
def PrintMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """Print a value as JSON8.  For json8 write (x) and toJson8()

    Non-data values raise error.Encode when type_errors is true, and are
    printed as null otherwise.

    Caller must handle error.Encode
    """
    options = NON_DATA_IS_ERROR if type_errors else NON_DATA_IS_NULL
    _Print(val, buf, indent, options=options)
169
170
def PrintJsonMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """Print a value as JSON.  For json write (x) and toJson()

    Non-data values raise error.Encode when type_errors is true, and are
    printed as null otherwise.

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    if type_errors:
        non_data = NON_DATA_IS_ERROR
    else:
        non_data = NON_DATA_IS_NULL

    json_options = LOSSY_JSON_STRINGS | INF_NAN_ARE_NULL | non_data
    _Print(val, buf, indent, options=json_options)
184
185
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to 'f' on a single line, then a newline.

    For pp test_ (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=SHOW_CYCLES)

    encoded = line_buf.getvalue()
    f.write(encoded)
    f.write('\n')
197
198
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to 'buf', quoting it unless that can be skipped.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output is written
      unquoted_ok: if true, 's' is written as-is when
                   fastfunc.CanOmitQuotes() accepts it
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
208
209
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' encoded as a one-line J8 string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
220
221
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' encoded as a one-line JSON string (LOSSY_JSON_STRINGS).

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON_STRINGS)
    return out.getvalue()
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to a BufWriter.  'indent' is the number of spaces per nesting
    level, or -1 to put everything on one line.  'options' is a set of bit
    flags such as SHOW_CYCLES or NON_DATA_IS_NULL.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Containers that are being printed right now.
        # Key is vm.HeapValueId(val)
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [] or [ item, item, ... ]"""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        first = True
        for item in val.items:
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key: value pairs between the 'left' and 'right' delimiters."""
        self.buf.write(left)
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()

        first = True
        for key, item in iteritems(d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict inside { }"""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj inside ( ), followed by ' --> ' and its prototype."""
        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the { "type": ..., "data": wrapper used for bash values.

        Args:
          type_str: already quoted and followed by a comma, e.g.
                    '"InternalStringArray",' or '"BashArray",' or
                    '"BashAssoc",'
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a mapping from index to string."""
        self._PrintBashPrefix('"BashArray",', level)

        if bash_impl.BashArray_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for k in bash_impl.BashArray_GetKeys(val):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                elem, error_code = bash_impl.BashArray_GetElement(val, k)
                assert error_code == error_code_e.OK, error_code
                pyj8.WriteString(elem, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintInternalStringArray(self, val, level):
        # type: (value.InternalStringArray, int) -> None
        """Print an InternalStringArray as a mapping from index to string.

        Entries that are None are left out.
        """
        self._PrintBashPrefix('"InternalStringArray",', level)

        if bash_impl.InternalStringArray_Count(val) == 0:
            # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            num_written = 0
            for i, s in enumerate(
                    bash_impl.InternalStringArray_GetValues(val)):
                if s is None:
                    continue

                if num_written != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                num_written += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a mapping from string to string."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if bash_impl.BashAssoc_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for k2, v2 in iteritems(bash_impl.BashAssoc_GetDict(val)):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print any value, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a container in an object cycle without
            SHOW_CYCLES, or a non-data value with NON_DATA_IS_ERROR
        """
        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if isinf_(fl) or isnan_(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # (negative infinity is null too).
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    elif isnan_(fl):
                        s = 'NAN'
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing,
                        # but the problem is we'd have to show it TWICE to
                        # make it meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return

                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

                self.visiting[heap_id] = True
                self._PrintList(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return

                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

                self.visiting[heap_id] = True
                self._PrintDict(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                # Obj is not data, so the NON_DATA_* flags apply first
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't encode value of type Obj")
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                    return

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('(...)')
                        return

                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Obj%s in object cycle" %
                                       ValueIdString(val))

                self.visiting[heap_id] = True
                self._PrintObj(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.InternalStringArray):
                val = cast(value.InternalStringArray, UP_val)
                self._PrintInternalStringArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                else:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
624
625
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # used in error messages

        self.pos = 0  # where the next token starts

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) an error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless _DecodeString() produced the token.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if self.is_j8:
            # J8 accepts all 4 kinds of string
            if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                          Id.Left_BSingleQuote, Id.Left_USingleQuote):
                return self._DecodeString(tok_id, end_pos)
        else:
            # Pure JSON only has "" strings, and no comments
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            if tok_id == Id.Left_DoubleQuote:
                return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines"""

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        is_string_start = tok_id in (Id.Left_DoubleQuote,
                                     Id.Left_JDoubleQuote,
                                     Id.Left_BSingleQuote,
                                     Id.Left_USingleQuote)
        if is_string_start:
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Returns a string token and updates self.pos

        Args:
          left_id: the token that opened the string
          str_pos: position just past that opening token

        Returns:
          (Id.J8_String, position past the closing quote, decoded string)
        """
        # "" and j"" strings use the JSON string lexer; b'' and u'' use the
        # J8 string lexer.
        is_json_str = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if is_json_str:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Tokens that are errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # The closing quote: hand back everything we accumulated
            #

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                escaped = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes like \uXXXX\uYYYY: 4 hex digits each
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
820
821
class _Parser(object):
    """State and helpers shared by the parsers in this module.

    Holds the current token: its ID, its start and end position in the input,
    and its decoded value if it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"  # for error messages

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline, or comment."""
        while True:
            # The start of this token is the END of the token before it.
            # Ignored tokens go through here too, so start_pos ends up being
            # the start of the token we stop on.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            return
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in error messages.  The position of the last newline and a
            # token can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token has the given ID, then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) an error.Decode located at the current token."""
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            self.lexer.cur_line_num)
872
873
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser: each _Parse* method expects the current token
    to be the first token of its construct, and leaves the current token just
    past the construct.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse a key: value pair.

            pair = string ':' value

        Raises:
          error.Decode: if the key isn't a string, or the colon is missing
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key appears more than once, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr2() can't convert
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token is either true or false, so the first char is enough
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses self.start_pos
            # and self.end_pos, so after _Next() the error would be located
            # at the following token, rather than at the integer.  And if
            # the following token doesn't lex, that error would be reported
            # instead of this one.
            ok, big = mops.FromStr2(part)
            if not ok:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input left over after
        the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token is now the one after the value.  If it doesn't
        # start at the end of the input, something follows the value.
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1012
1013
1014	class Nil8Parser(_Parser):
1015	"""
1016	Tokens not in JSON8:
1017	LParen RParen Symbol
1018
1019	Tokens not in JSON, but in JSON8 and NIL8:
1020	Identifier (unquoted keys)
1021	Ignored_Comment
1022	"""
1023
1024	def __init__(self, s, is_j8):
1025	# type: (str, bool) -> None
1026	_Parser.__init__(self, s, is_j8)
1027
1028	if 0:
1029
1030	def _LookAhead(self):
1031	# type: () -> Id_t
1032	"""
1033	Don't need this right now
1034	"""
1035	end_pos = self.end_pos # look ahead from last token
1036	while True:
1037	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1038	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1039	Id.Ignored_Comment):
1040	break
1041	return tok_id
1042
1043	def _ParseRecord(self):
1044	# type: () -> nvalue_t
1045	"""
1046	Yaks
1047	(self->Next) => (-> self Next)
1048	(self->Next obj.field) => ((-> self Next) (. obj field))
1049
1050	Similar to
1051	((identity identity) 42) => 42 in Clojure
1052
1053	ASDL
1054	(Node left:(. x4beef2))
1055	(Node left !x4beef2)
1056
1057	# Ambiguous because value can be identifier.
1058	# We have to look ahead to and see if there's a colon :
1059	field =
1060	Identifier ':' value
1061	\| value
1062
1063	record = '(' head field* ')'
1064
1065	- Identifier \| Symbol are treated the same, it's a side effect of
1066	the lexing style
1067	- do positional args come before named args
1068	- () is invalid? Use [] for empty list
1069	"""
1070	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1071
1072	items = [] # type: List[nvalue_t]
1073
1074	self._Next()
1075	if self.tok_id == Id.J8_RParen:
1076	self._Next()
1077	return nvalue.List(items)
1078
1079	#log('TOK %s', Id_str(self.tok_id))
1080	while self.tok_id != Id.J8_RParen:
1081	items.append(self._ParseNil8())
1082	#log('TOK 2 %s', Id_str(self.tok_id))
1083
1084	self._Eat(Id.J8_RParen)
1085
1086	return nvalue.List(items)
1087
1088	def _ParseList8(self):
1089	# type: () -> nvalue_t
1090	"""
1091	List8 = '[' value* ']'
1092
1093	No commas, not even optional ones for now.
1094	"""
1095	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1096
1097	items = [] # type: List[nvalue_t]
1098
1099	self._Next()
1100	if self.tok_id == Id.J8_RBracket:
1101	self._Next()
1102	return nvalue.List(items)
1103
1104	#log('TOK %s', Id_str(self.tok_id))
1105	while self.tok_id != Id.J8_RBracket:
1106	items.append(self._ParseNil8())
1107	#log('TOK 2 %s', Id_str(self.tok_id))
1108
1109	self._Eat(Id.J8_RBracket)
1110
1111	return nvalue.List(items)
1112
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, then an optional infix operator and operand.

    The current token selects the form:

    - J8_LParen    -> self._ParseRecord()
    - J8_LBracket  -> self._ParseList8()
    - J8_Null, J8_Bool, J8_Int, J8_Float, J8_String primitives
    - J8_Identifier, J8_Operator, J8_Colon, J8_Comma -> nvalue.Symbol holding
      the raw token text

    If the token that follows the value is an Operator, Colon or Comma, the
    3-element list [op, value, operand2] is returned instead, so that

        key: "value"  ->  (: key "value")

    operand2 comes from a recursive call, so chained operators nest to the
    right.

    Raises:
      The error built by self._ParseError(), on end of input or on a token
      that cannot start a value.
    """
    # Exactly one branch below runs, and the token only advances inside that
    # branch, so a snapshot is equivalent to re-reading self.tok_id.
    tok_id = self.tok_id

    if tok_id == Id.J8_LParen:
        node = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        node = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        self._Next()
        node = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # Only the first character of the token is inspected.  Read it
        # before advancing, because _Next() moves start_pos.
        is_true = self.s[self.start_pos] == 't'
        self._Next()
        node = nvalue.Bool(is_true)

    elif tok_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Int(int(text))

    elif tok_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Float(float(text))

    elif tok_id == Id.J8_String:
        # Capture the decoded payload of this token before advancing.
        decoded = self.decoded
        self._Next()
        node = nvalue.Str(decoded)

    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # An unquoted "word" (including <- and friends) is kept as a symbol
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Symbol(text)

    elif tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # e.g. Id.Unknown_Tok, or a closing delimiter
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands on its own.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return node

    # key: "value" -> (: key "value")
    op_sym = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()
    rhs = self._ParseNil8()

    infix_node = nvalue.List([op_sym, node, rhs])  # type: nvalue_t
    return infix_node
1185
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the whole input as a single NIL8 value.

    Returns:
      The value produced by self._ParseNil8().

    Raises:
      error.Decode, including when tokens remain after the value.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    # Anything left over after one complete value is an error
    raise self._ParseError('Unexpected trailing input')
1196
1197
class J8LinesParser(_Parser):
    """Split a string containing newlines into J8 Lines.

    The format is specified with a grammar, to preserve location info and to
    reduce allocations.  (But note that unquoted_line is more like a LOOP
    than it is grammatical.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until the end token, but REMOVE trailing
        # WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on one line are disallowed, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not decoded strings, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty
        cells?  Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Set up the parser over the string s.

        The second argument (True) is passed straight to _Parser.__init__;
        its meaning is defined there.
        """
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug aid: log the label s with the current token id and span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _AtLineEnd(self):
        # type: () -> bool
        """Is the current token the 'end' rule (J8_Newline or Eol_Tok)?"""
        return self.tok_id in (Id.J8_Newline, Id.Eol_Tok)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, appending its value to 'out' unless it's empty.

        Raises:
          The error built by self._ParseError(), when a quoted string is
          followed by anything other than optional whitespace and a line end.
        """
        # Optional leading whitespace, shared by all three line forms
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        if self._AtLineEnd():
            # empty_line: nothing is appended
            self._NextForLines()

        elif self.tok_id == Id.J8_String:
            # j8_line: the decoded string is the line's value
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if not self._AtLineEnd():
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past newline

        elif self.tok_id == Id.Lit_Chars:
            # unquoted_line, e.g.  ' unquoted "" text on line '
            # Every token up to the line end belongs to the value.
            line_start = self.start_pos

            # Remember the token just before the current one, so trailing
            # whitespace can be stripped once the line end is reached.
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            while not self._AtLineEnd():
                last_id = self.tok_id
                last_start = self.start_pos
                self._NextForLines()

            # By default the value runs up to the start of the end token ...
            line_end = self.start_pos
            if last_id == Id.WS_Space:
                # ... but a final whitespace token is cut off
                line_end = last_start

            out.append(self.s[line_start:line_end])

            self._NextForLines()  # past newline

        else:
            raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse every line of the input and return the collected values.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        result = []  # type: List[str]
        while True:
            if self.tok_id == Id.Eol_Tok:
                break
            self._ParseLine(result)

        # The loop above only exits at Eol_Tok, so this is a defensive check
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1317
1318
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split s into J8 Lines; used by @(echo split command sub).

    This is a thin wrapper that runs a J8LinesParser over s.

    Raises:
      error.Decode

    There are 3 kinds of error:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1333
1334
1335	# vim: sw=4