data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1326 lines, 686 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- Unify with ASDL pretty printing - NIL8
8	- {} [] are identical
9	- () is for statically typed ASDL data
10	(command.Simple blame_tok:(...) words:[ ])
11	although we are also using [] for typed ASDL arrays, not just JSON
12	- object IDs
13	- @ x123 can create an ID
14	- ! x123 can reference an ID
15	- <> can be for non-J8 data types? For the = operator
16	- 'hi \(name)' interpolation is useful for code
17
18	- Common between JSON8 and NIL8 - for writing by hand
19	- comments - # line or // line (JSON5 uses // line, following JS)
20	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
21	- commas
22	- JSON8 could have trailing commas rule
23	- NIL8 at least has no commas for [1 2 "hi"]
24	"""
25
26	import math
27
28	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
29	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
30	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
31
32	from core import error
33	from data_lang import pyj8
34	# dependency issue: consts.py pulls in frontend/option_def.py
35	from frontend import consts
36	from frontend import match
37	from mycpp import mops
38	from mycpp import mylib
39	from mycpp.mylib import tagswitch, iteritems, NewDict, log
40
41	import fastfunc
42
43	_ = log
44
45	from typing import cast, Dict, List, Tuple, Optional
46
47
48	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name is whatever value_str() yields for the value's tag with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
54
55
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the object 'val'.

        This Python-only definition uses id(), i.e. the object's address,
        which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
66
67
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Null, Bool, Int, Float and Str yield -1: they are immutable values that
    are copied and compared by value.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
89
90
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' when ValueId() is -1.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
99
100
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a Unicode code point as a string of UTF-8 bytes.

    Code points up to 0x7F produce one byte, up to 0x7FF two bytes, up to
    0xFFFF three bytes, and everything else four bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced low-order first: each one carries the
    # next 6 bits of the code point, tagged with the 0b10 prefix.
    remaining = code
    encoded = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        encoded.append(0x80 | (remaining & 0x3F))
        remaining >>= 6

    # The lead byte holds the length marker plus the leftover high bits.
    lead = (0x1E << (6 - num_cont_bytes)) | (remaining &
                                             (0x3F >> num_cont_bytes))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
134
135
# Option bit flags for the printing functions below; combine with bitwise OR.
SHOW_CYCLES = 1 << 1  # show a cycle as [...] or {...} or (...) instead of raising
SHOW_NON_DATA = 1 << 2  # non-data objects are shown as <TypeName> instead of raising
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: infinities and NaN are written as null

# Hack until we fully translate: pyj8 has its own LOSSY_JSON, and the option
# word is passed through to pyj8.WriteString(), so the two values must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
143
144
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the option flags defined in this module
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
153
154
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print with no option flags set.  For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
162
163
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print with the JSON options.  For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
172
173
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write 'val' on a single line to 'f', followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    f.write(line_buf.getvalue())
    f.write('\n')
185
186
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line printed form of 'val' as a string.

        Unused.  This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        repr_options = SHOW_CYCLES | SHOW_NON_DATA

        out = mylib.BufWriter()
        _Print(val, out, -1, options=repr_options)
        return out.getvalue()
198
199
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write string 's' to 'buf', encoded on one line.  For pp proc, etc.

    When unquoted_ok is set and fastfunc.CanOmitQuotes(s) agrees, the string
    is written verbatim.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
209
210
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' encoded on one line with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
221
222
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' encoded on one line with the LOSSY_JSON option.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
232
233
class InstancePrinter(object):
    """Write a value tree to a BufWriter as J8/JSON text.

    'indent' is the number of spaces per nesting level; the special value -1
    suppresses all newlines, indentation and the space after ':'.
    'options' is a bitwise OR of the option flags defined in this module.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is vm.HeapValueId(val); the entry is True while that
        # List/Dict/Obj is being printed, and reset to False afterward.
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested one deeper than 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for a closing bracket at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...]."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        first = True
        for item in val.items:
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Write key-value pairs between the delimiters 'left' and 'right'."""
        self.buf.write(left)
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()

        first = True
        for key, item in iteritems(d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as a {} mapping."""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Write an Obj as a () mapping, then ' --> ' and its prototype chain."""
        self._PrintMapping(val.d, '(', ')', level)

        proto = val.prototype
        if proto:
            self.buf.write(' --> ')
            self._PrintObj(proto, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the {"type": ..., "data": wrapper used for bash-style values.

        'type_str' is written verbatim, so the caller supplies the quotes and
        the trailing comma.
        """
        out = self.buf

        out.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        out.write('"type":')
        self._MaybeSpace()
        out.write(type_str)  # "BashArray", or "BashAssoc",
        self._MaybeNewline()

        self._ItemIndent(level)
        out.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Write a SparseArray as a wrapped mapping of index string -> string."""
        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for big_index, s in iteritems(val.d):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(big_index), self.options,
                                 self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a wrapped mapping of index string -> string.

        Entries that are None are skipped.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            need_comma = False
            for index, s in enumerate(val.strs):
                if s is None:
                    continue

                if need_comma:
                    self.buf.write(',')
                    self._MaybeNewline()
                need_comma = True

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as a wrapped mapping of string -> string."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for key, s in iteritems(val.d):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(key, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintFloat(self, fl):
        # type: (float) -> None
        """Write a float; infinities and NaN depend on INF_NAN_ARE_NULL."""
        if math.isinf(fl):
            if self.options & INF_NAN_ARE_NULL:
                s = 'null'  # negative infinity is null too
            else:
                s = 'INFINITY'
                if fl < 0:
                    s = '-' + s
        elif math.isnan(fl):
            if self.options & INF_NAN_ARE_NULL:
                # JavaScript JSON lib behavior: Inf and NaN are null
                # Python has a bug in the encoder by default, and then
                # allow_nan=False raises an error
                s = 'null'
            else:
                s = 'NAN'
        else:
            # TODO: can we avoid intermediate allocation?
            # self.buf.WriteFloat(val.f)
            s = str(fl)

        self.buf.write(s)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write 'val' to self.buf, recursing into containers.

        Raises error.Encode on an object cycle when SHOW_CYCLES is off, and
        on an Obj or other unhandled value type when SHOW_NON_DATA is off.
        """
        # indent == -1 is the special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                self._PrintFloat(val.f)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')
                    return

                self.visiting[heap_id] = True
                self._PrintList(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                    self.buf.write('{...}')
                    return

                self.visiting[heap_id] = True
                self._PrintDict(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))

                    self.buf.write('(...)')
                    return

                self.visiting[heap_id] = True
                self._PrintObj(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))

                # Similar to = operator, ui.DebugPrint()
                # TODO: that prints value.Range in a special way
                ysh_type = ValType(val)
                # Don't show ID in 'pp test_'
                #id_str = ValueIdString(val)
                self.buf.write('<%s>' % ysh_type)
615
616
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except that each token comes with an
    optional decoded string: it is set for string tokens and None otherwise.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # used in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""
        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Pure JSON: reject the syntax that only J8 has
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        left_id is the opening quote token, and str_pos is the position just
        after it.  Decoded parts accumulate in self.decoded until the closing
        quote is seen.
        """
        # "" and j"" bodies use the JSON string lexer; the others use the J8
        # string lexer.
        is_json_str = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if is_json_str:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: hand back everything decoded so far
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                esc_ch = self.s[str_pos + 1]
                part = consts.LookupCharC(esc_ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two \uXXXX escapes in a row: 4 hex digits each
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
811
812
class _Parser(object):
    """Token-stream state shared by the parsers in this module.

    It holds the current token (tok_id, start_pos, end_pos, decoded) and the
    methods that advance it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        # This isn't the start of a J8_Bool token, it's the END of the token before it
        while True:
            self.start_pos = self.end_pos
            tok_id, end_pos, decoded = self.lexer.Next()
            self.tok_id = tok_id
            self.end_pos = end_pos
            self.decoded = decoded

            if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
                break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in error messages.  The position of the last newline and a
            # token can be used to calculate a column number.

            #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Require the current token to be tok_id, then advance past it.

        Raises error.Decode when the current token is something else.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        tok_id, end_pos, decoded = self.lexer.NextForLines()
        self.tok_id = tok_id
        self.end_pos = end_pos
        self.decoded = decoded

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error spanning the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
863
864
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens from _Parser.  ParseValue()
    is the entry point; every error is reported as error.Decode.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log(' [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v  # a repeated key overwrites the earlier value
            #log(' [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token.

        On success the current token is the one after the value.  Raises
        error.Decode on EOF, on an invalid token, or on an integer that
        mops.FromStr2() can't represent.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # 'true' vs. 'false' is decided by the first char of the token
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() reads the current
            # token's span and the lexer's line number, so raising after
            # _Next() would blame the token that follows the integer.
            ok, big = mops.FromStr2(part)
            if not ok:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # self.decoded is overwritten by _Next(), so capture it first
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token should start at the end of the
        # input; anything else is trailing input.
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1003
1004
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t trees.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # For '()' the loop body never runs, and _Eat() consumes the ')'
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            item = self._ParseNil8()
            items.append(item)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # For '[]' the loop body never runs, and _Eat() consumes the ']'
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            item = self._ParseNil8()
            items.append(item)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParsePrimary(self):
        # type: () -> nvalue_t
        """Parse one record, list, primitive or symbol at the current token.

        Raises error.Decode on EOF or on an invalid token.
        """
        tok_id = self.tok_id

        if tok_id == Id.J8_LParen:
            return self._ParseRecord()

        if tok_id == Id.J8_LBracket:
            return self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        if tok_id == Id.J8_Null:
            self._Next()
            return nvalue.Null

        if tok_id == Id.J8_Bool:
            # 'true' vs. 'false' is decided by the first char of the token
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return nvalue.Int(int(part))

        if tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return nvalue.Float(float(part))

        if tok_id == Id.J8_String:
            # self.decoded is overwritten by _Next(), so capture it first
            str_val = nvalue.Str(self.decoded)
            self._Next()
            return str_val

        # <- etc.
        if tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                      Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return nvalue.Symbol(part)

        if tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse a value, optionally followed by an infix operator and operand.

        When the token after the first value is an operator, colon or comma,
        the result is the list (op first_value second_value), where the
        second operand is parsed recursively.
        """
        obj = self._ParsePrimary()

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #next_id = self._LookAhead()
            #print('NEXT %s' % Id_str(next_id))

            #raise AssertionError()
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        part = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(part)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as one NIL8 value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1187
1188
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log the current token, labeled with 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes one empty line, quoted line or unquoted line, including its
        end token.  Raises error.Decode when text follows a quoted string.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        tok_id = self.tok_id

        # Empty line - return without doing anything
        if tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        if tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # Unquoted line
        # ' unquoted "" text on line '  # read every token until end
        string_start = self.start_pos
        while True:
            # Remember the token before the end, for stripping whitespace
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        string_end = self.start_pos
        if last_id == Id.WS_Space:
            string_end = last_start  # remove trailing whitespace

        out.append(self.s[string_start:string_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Decode every line of the input and return them as a list.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1308
1309
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into decoded lines with a J8LinesParser.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1324
1325
1326	# vim: sw=4