data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1386 lines, 697 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
# Duplicated from ui.ValType() so that this module doesn't depend on ui.
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI."""
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies a value object.

        This Python version uses id(), which returns the address and can be up
        to 64 bits wide.

        In C++ the GC ID can be used instead, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for them.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    ident = ValueId(val)  # -1 means the value has no identity
    if ident != -1:
        return ' 0x%s' % mylib.hex_lower(ident)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F become a single byte.  Larger ones become a lead
    byte followed by 1, 2 or 3 continuation bytes.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced last-to-first: each one carries the low
    # 6 bits of what remains of the code point.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker bits, plus the leftover high bits of the code
    # point.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = []  # type: List[str]
    for b in bytes_:
        chars.append(chr(b & 0xFF))
    return ''.join(chars)
140
141
# Bit flags for the 'options' argument of _Print() and InstancePrinter.
SHOW_CYCLES = 1 << 1  # print a cycle as [...] or {...} or (...), rather than raising error.Encode
SHOW_NON_DATA = 1 << 2  # print Obj and other non-data values (as <TypeName>), rather than raising error.Encode
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf, -inf and NaN are printed as null

# Hack until we fully translate: the options are passed through to
# pyj8.WriteString(), so pyj8's copy of this flag must have the same value.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print a value to buf with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the flags SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
               and INF_NAN_ARE_NULL
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val, 0)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as J8, with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
170	def PrintJsonMessage(val, buf, indent):
171	# type: (value_t, mylib.BufWriter, int) -> None
172	""" For json write (x) and toJson()
173
174	Caller must handle error.Encode()
175	Doesn't decay to b'' strings - will use Unicode replacement char.
176	"""
177	_Print(val, buf, indent, options=LOSSY_JSON \| INF_NAN_ARE_NULL)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to f on a single line, followed by a newline.

    For pp line (x)
    """
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    # error.Encode should be impossible - we show cycles and non-data
    buf = mylib.BufWriter()
    _Print(val, buf, -1, options=line_options)

    text = buf.getvalue()
    f.write(text)
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line form of a value, like Python's repr.

        Unused, and disabled by the 'if 0' above.
        """
        repr_options = SHOW_CYCLES | SHOW_NON_DATA

        # error.Encode should be impossible - we show cycles and non-data
        buf = mylib.BufWriter()
        _Print(val, buf, -1, options=repr_options)
        return buf.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to buf, quoted unless the quotes can be left out.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output is written
      unquoted_ok: if True, s is written as-is whenever
                   fastfunc.CanOmitQuotes() allows it
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return a string encoded as J8, on one line.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    # With the default unquoted_ok=False, the string is always quoted
    EncodeString(s, out)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return a string encoded as JSON (LOSSY_JSON), on one line.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
class InstancePrinter(object):
    """Writes a value tree to a buffer as J8 or JSON text.

    The layout is controlled by 'indent': the number of spaces per nesting
    level, or -1 to put everything on one line without optional whitespace.
    'options' is a bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON and
    INF_NAN_ARE_NULL.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Containers that are currently being printed, for cycle detection.
        # Key is HeapValueId(val)
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent an item that lives inside a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent the closing bracket of a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...], one item per line when indenting."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        for index, item in enumerate(val.items):
            if index != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key-value pairs between the delimiters 'left' and 'right'."""
        self.buf.write(left)

        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()

        is_first = True
        for k, v in iteritems(d):
            if not is_first:
                self.buf.write(',')
                self._MaybeNewline()
            is_first = False

            self._ItemIndent(level)
            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {"key": value, ...}"""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as (...), then each prototype in its chain after ' --> '"""
        obj = val
        while True:
            self._PrintMapping(obj.d, '(', ')', level)
            if not obj.prototype:
                break

            self.buf.write(' --> ')
            obj = obj.prototype

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object for a Bash container, up to the "data" key.

        Args:
          type_str: the already quoted type name, including the trailing
                    comma, like '"BashArray",'
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _WriteStrEntry(self, is_first, key, s, level):
        # type: (bool, str, str, int) -> None
        """Write one "key": "string" entry of a Bash container.

        Args:
          is_first: False if a comma must separate this entry from the last one
          level: nesting level of the container that holds the entry
        """
        if not is_first:
            self.buf.write(',')
            self._MaybeNewline()

        self._ItemIndent(level)
        pyj8.WriteString(key, self.options, self.buf)

        self.buf.write(':')
        self._MaybeSpace()

        pyj8.WriteString(s, self.options, self.buf)

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray; its integer keys are written as strings."""
        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for k, v in iteritems(val.d):
                self._WriteStrEntry(is_first, mops.ToStr(k), v, level + 1)
                is_first = False

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as an object keyed by index; None entries are skipped."""
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for index, s in enumerate(val.strs):
                if s is None:
                    continue

                self._WriteStrEntry(is_first, str(index), s, level + 1)
                is_first = False

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc, which maps strings to strings."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for k2, v2 in iteritems(val.d):
                self._WriteStrEntry(is_first, k2, v2, level + 1)
                is_first = False

            self._MaybeNewline()
            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value of any type at the given nesting level.

        Raises:
          error.Encode: for a cycle without SHOW_CYCLES, or for an Obj or
                        other non-data value without SHOW_NON_DATA
        """
        # An indent of -1 is the special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    elif fl < 0:
                        s = '-INFINITY'
                    else:
                        s = 'INFINITY'
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')
                    return

                self.visiting[heap_id] = True
                self._PrintList(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                    self.buf.write('{...}')
                    return

                self.visiting[heap_id] = True
                self._PrintDict(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))

                    self.buf.write('(...)')
                    return

                self.visiting[heap_id] = True
                self._PrintObj(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))

                # Similar to = operator, ui.DebugPrint()
                # TODO: that prints value.Range in a special way
                # Don't show ID in 'pp test_'
                #id_str = ValueIdString(val)
                self.buf.write('<%s>' % ValType(val))
621
622
class PrettyPrinter(object):
    """Placeholder for a pretty printer; the methods don't print anything yet.

    Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For a first pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput for stdout.

        The 'buf' argument isn't used.
        """
        # Or print to stderr?
        stdout_ = mylib.Stdout()
        f = fmt.DetectConsoleOutput(stdout_)
        self.PrettyTree(val, f)

        # Then print those with ASDL
675
676
class LexerDecoder(object):
    """Lexer for J8 and JSON that also decodes string literals.

    Similar interface as SimpleLexer, except we return an optional decoded
    string: each token is a tuple (token ID, end position, decoded string),
    where the last part is None unless the token is a J8_String.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # language name, used in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create an error that spans from the current position to end_pos."""
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when lexing JSON, or for an invalid
                        string literal
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Reject the tokens that only exist in J8
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or ASCII control chars in an
                        unquoted line, or for an invalid string literal
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Returns a string token and updates self.pos

        Args:
          left_id: the token for the opening quote, which selects the string
                   lexer and decides whether \\yff escapes are allowed
          str_pos: position just after the opening quote

        Raises:
          error.Decode: for an unterminated string, a bad escape, an
                        unescaped control char, or invalid UTF-8
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # The closing quote ends the string
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            #
            # Tokens that are always errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # Now handle each kind of token, which decodes to 'part'
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                escaped = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # The hex digits inside \u{...}
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # The hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes of the form \uXXXX, back to back
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # The hex digits after \u
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
871
872
class _Parser(object):
    """Token handling that's shared by the parsers in this file.

    It keeps track of the current token: its ID, its span in the input, and
    the decoded value if it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"  # for error messages

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        # start_pos is the END of the token that was lexed before this one,
        # which is where this token begins.
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

        while self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

        # TODO: The position of the last newline and a token can be used to
        # calculate a column number, to show in error messages.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token is of another kind
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create an error at the current token, on the lexer's current line."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
923
924
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that turns a whole message into a value_t.
    Each _Parse* method starts at the first token of its construct, and
    leaves the parser on the token after it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Save the potential string value, because _Eat() advances to the
        # next token and overwrites self.decoded
        k = self.decoded
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse a value of any type, based on the current token.

        Raises:
          error.Decode: for EOF, for a token that can't start a value, or for
                        an integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells 'true' apart from 'false'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            ok, big = mops.FromStr2(part)
            if not ok:
                # Check this BEFORE advancing, so that the error location is
                # the integer itself, and not the token after it
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token should start at the end of the
        # input.  If it starts earlier, there's trailing input.
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1063
1064
1065	class Nil8Parser(_Parser):
1066	"""
1067	Tokens not in JSON8:
1068	LParen RParen Symbol
1069
1070	Tokens not in JSON, but in JSON8 and NIL8:
1071	Identifier (unquoted keys)
1072	Ignored_Comment
1073	"""
1074
1075	def __init__(self, s, is_j8):
1076	# type: (str, bool) -> None
1077	_Parser.__init__(self, s, is_j8)
1078
1079	if 0:
1080
1081	def _LookAhead(self):
1082	# type: () -> Id_t
1083	"""
1084	Don't need this right now
1085	"""
1086	end_pos = self.end_pos # look ahead from last token
1087	while True:
1088	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1089	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1090	Id.Ignored_Comment):
1091	break
1092	return tok_id
1093
1094	def _ParseRecord(self):
1095	# type: () -> nvalue_t
1096	"""
1097	Yaks
1098	(self->Next) => (-> self Next)
1099	(self->Next obj.field) => ((-> self Next) (. obj field))
1100
1101	Similar to
1102	((identity identity) 42) => 42 in Clojure
1103
1104	ASDL
1105	(Node left:(. x4beef2))
1106	(Node left !x4beef2)
1107
1108	# Ambiguous because value can be identifier.
1109	# We have to look ahead to and see if there's a colon :
1110	field =
1111	Identifier ':' value
1112	\| value
1113
1114	record = '(' head field* ')'
1115
1116	- Identifier \| Symbol are treated the same, it's a side effect of
1117	the lexing style
1118	- do positional args come before named args
1119	- () is invalid? Use [] for empty list
1120	"""
1121	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1122
1123	items = [] # type: List[nvalue_t]
1124
1125	self._Next()
1126	if self.tok_id == Id.J8_RParen:
1127	self._Next()
1128	return nvalue.List(items)
1129
1130	#log('TOK %s', Id_str(self.tok_id))
1131	while self.tok_id != Id.J8_RParen:
1132	items.append(self._ParseNil8())
1133	#log('TOK 2 %s', Id_str(self.tok_id))
1134
1135	self._Eat(Id.J8_RParen)
1136
1137	return nvalue.List(items)
1138
1139	def _ParseList8(self):
1140	# type: () -> nvalue_t
1141	"""
1142	List8 = '[' value* ']'
1143
1144	No commas, not even optional ones for now.
1145	"""
1146	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1147
1148	items = [] # type: List[nvalue_t]
1149
1150	self._Next()
1151	if self.tok_id == Id.J8_RBracket:
1152	self._Next()
1153	return nvalue.List(items)
1154
1155	#log('TOK %s', Id_str(self.tok_id))
1156	while self.tok_id != Id.J8_RBracket:
1157	items.append(self._ParseNil8())
1158	#log('TOK 2 %s', Id_str(self.tok_id))
1159
1160	self._Eat(Id.J8_RBracket)
1161
1162	return nvalue.List(items)
1163
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, starting at the current token.

    The value is one of:

    - a record, delegated to _ParseRecord()
    - a list, delegated to _ParseList8()
    - a primitive: null, bool, int, float or string
    - an unquoted word (identifier, operator, colon or comma token), which
      becomes an nvalue.Symbol

    If the token that FOLLOWS the value is an operator, colon or comma, the
    expression is treated as infix and rewritten in prefix form:

        key: "value"  ->  (: key "value")

    The right operand is parsed by a recursive call, so it may itself be an
    infix expression.

    Raises:
      The error created by self._ParseError(), on EOF or an invalid token.
    """
    tok_id = self.tok_id

    if tok_id == Id.J8_LParen:
        obj = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        obj = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        self._Next()
        obj = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # Only the first character is inspected: 't' means true.  It must
        # be read before _Next() moves start_pos.
        obj = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok_id == Id.J8_Int:
        int_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Int(int(int_text))

    elif tok_id == Id.J8_Float:
        float_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Float(float(float_text))

    elif tok_id == Id.J8_String:
        # self.decoded must be captured before _Next() is called
        obj = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif (tok_id == Id.J8_Identifier or tok_id == Id.J8_Operator or
          tok_id == Id.J8_Colon or tok_id == Id.J8_Comma):
        # unquoted "word" treated like a string
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Symbol(word)

    elif tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # Look at the token AFTER the value we just parsed.  Anything other than
    # an operator-like token means this is a plain value.
    next_id = self.tok_id
    if (next_id != Id.J8_Operator and next_id != Id.J8_Colon and
            next_id != Id.J8_Comma):
        return obj

    # Infix form.  key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

    self._Next()
    right = self._ParseNil8()
    infix = nvalue.List([op, obj, right])  # type: nvalue_t
    return infix
1236
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the input as a single NIL8 value.

    Anything left over after that value, other than end of input, is an
    error.

    Raises error.Decode.
    """
    self._Next()  # read the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1247
1248
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Create a parser over the string 's'."""
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and span, labeled 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line of input.  May append a line to 'out'.

        Empty lines append nothing.  A quoted line appends its decoded
        string, and an unquoted line appends its text without trailing
        whitespace.
        """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        if self.tok_id == Id.J8_Newline or self.tok_id == Id.Eol_Tok:
            pass  # Empty line - nothing to append

        elif self.tok_id == Id.J8_String:
            # Quoted string on line
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if (self.tok_id != Id.J8_Newline and
                    self.tok_id != Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

        elif self.tok_id == Id.Lit_Chars:
            # Unquoted line, e.g.
            #   ' unquoted "" text on line '
            # Read every token until the end of the line.
            line_start = self.start_pos

            reached_end = False
            while not reached_end:
                # Remember the previous token, for stripping whitespace
                last_id = self.tok_id
                last_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                reached_end = (self.tok_id == Id.J8_Newline or
                               self.tok_id == Id.Eol_Tok)

            if last_id != Id.WS_Space:
                line_end = self.start_pos
            else:
                line_end = last_start  # remove trailing whitespace

            out.append(self.s[line_start:line_end])

        else:
            raise AssertionError(Id_str(self.tok_id))

        # Every kind of line finishes by moving past its end token
        self._NextForLines()

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole string and return the list of decoded lines.

        Raises error.Decode.
        """
        result = []  # type: List[str]

        self._NextForLines()  # read the first token
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive check: the loop above only exits on Id.Eol_Tok.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1368
1369
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.  Used by @(echo split command sub)

    Args:
      s: the text to split

    Returns:
      The list of lines produced by J8LinesParser.Parse()

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    return lines_parser.Parse()
1384
1385
1386	# vim: sw=4