data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1388 lines, 699 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies this particular value object.

        This Python definition uses id(), which returns the address and can
        be up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1  # the answer for primitives
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            result = HeapValueId(val)
    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' when it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # -1 means the value is a primitive without an ID
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Code points up to 0x7F become one byte, up to 0x7FF two bytes, up to
    0xFFFF three bytes, and everything else four bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    # How many 10xxxxxx continuation bytes follow the lead byte
    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_cont = 3

    # Built from the LAST byte to the first, then reversed
    out = []  # type: List[int]
    remaining = code
    for _ in xrange(n_cont):
        out.append(0x80 | (remaining & 0x3F))
        remaining >>= 6

    # Lead byte: marker bits (110, 1110 or 11110 after masking to 8 bits)
    # followed by whatever payload bits are left.
    lead_marker = 0x1E << (6 - n_cont)
    payload_mask = 0x3F >> n_cont
    out.append(lead_marker | (remaining & payload_mask))
    out.reverse()

    # mod 256 because Python ints don't wrap around!
    return ''.join([chr(byte & 0xFF) for byte in out])
140
141
# Bit flags that are OR'ed together into the `options` argument of _Print()
# and InstancePrinter.
SHOW_CYCLES = 1 << 1  # show as [...] or {...} or (...) instead of raising
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # for JSON

# Hack until we fully translate: the options are passed through to
# pyj8.WriteString(), so pyj8's flag must have the same value as ours.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: flags such as SHOW_CYCLES, OR'ed together
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf with the LOSSY_JSON and INF_NAN_ARE_NULL flags.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=debug_options)

    encoded = line_buf.getvalue()
    f.write(encoded)
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line encoding of val as a string.

        Unused (the definition is disabled by 'if 0').  This is like
        Python's repr.
        """
        # error.Encode should be impossible - we show cycles and non-data
        out = mylib.BufWriter()
        debug_options = SHOW_CYCLES | SHOW_NON_DATA
        _Print(val, out, -1, options=debug_options)
        return out.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf, quoted unless the caller allows a bare string.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output is written
      unquoted_ok: if True and fastfunc.CanOmitQuotes(s) holds, s is written
                   as-is with no quoting
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        # The general case: encode it as a string value on one line
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a one-line string value, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    str_val = value.Str(s)
    out = mylib.BufWriter()
    _Print(str_val, out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a one-line string value, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    str_val = value.Str(s)
    out = mylib.BufWriter()
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to self.buf.  self.indent is the number of spaces per nesting
    level, or -1 to put everything on one line.  self.options is a bitwise OR
    of the SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON, INF_NAN_ARE_NULL flags.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Containers currently being printed, for cycle detection.
        # Key is HeapValueId(val)
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at the given level."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at the given level."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless we're printing on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless we're printing on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List between [ and ], separated by commas."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key:value pairs between the given left and right brackets.

        Shared by Dict, which uses { }, and Obj, which uses ( ).
        """
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(left)
            self.buf.write(right)
        else:
            self.buf.write(left)
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as { ... }"""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as ( ... ), then ' --> ' and its prototype, if any."""

        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Print the start of the wrapper { "type": ..., "data":

        Args:
          type_str: written verbatim after "type": so the caller passes it
                    with quotes and the trailing comma, e.g. '"BashArray",'
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Print the } that closes what _PrintBashPrefix() opened."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray wrapper; the data maps index to string.

        The index keys are converted with mops.ToStr() and written as
        strings.
        """

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray wrapper; the data maps index to string.

        Entries that are None are skipped, so the comma logic tracks the
        first PRINTED entry rather than index 0.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc wrapper; the data maps string to string."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write the encoding of val to self.buf.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List/Dict/Obj that's part of a cycle, unless
            SHOW_CYCLES is set; and for an Obj or any other non-data value,
            unless SHOW_NON_DATA is set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing, but
                        # the problem is we'd have to show it TWICE to make it
                        # meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                else:
                    # Mark the container only while its children are printed,
                    # so that seeing it again at the same time means a cycle.
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('(...)')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
622
623
class PrettyPrinter(object):
    """Placeholder for a pretty printer; PrettyTree() is not implemented yet.

    Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing reads it yet
        """
        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput detected for stdout.

        Note that buf isn't used.
        """

        # Or print to stderr?
        stdout_ = mylib.Stdout()
        color_out = fmt.DetectConsoleOutput(stdout_)
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
676
677
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except that each token comes with an
    optional decoded string.  It's None for everything but string tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the text to lex
          is_j8: if False, Next() rejects syntax that isn't part of JSON
          lang_str: language name used in error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # All the syntax that pure JSON rejects
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any opening quote that got this far starts a string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token for the opening quote, which determines the
                   string lexer that's used
          str_pos: position right after the opening quote
        """
        # "" and j"" use the JSON string lexer; the rest use the J8 one
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Tokens that can't appear in a valid string
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            elif tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            elif tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # The closing quote ends the loop
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance
                self.pos = str_end

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # the char after the backslash
                part = consts.LookupCharC(self.s[str_pos + 1])

            elif tok_id == Id.Char_UBraced:  # J8 only
                # the hex digits between \u{ and }
                hex_digits = self.s[str_pos + 3:str_end - 1]
                code_point = int(hex_digits, 16)

                # Same checks in osh/word_compile.py
                if code_point > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if code_point >= 0xD800 and code_point < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_digits, str_end)

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_YHex:  # J8 only
                # the hex digits after \y
                hex_digits = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" %
                        hex_digits, str_end)

                part = chr(int(hex_digits, 16))

            elif tok_id == Id.Char_SurrogatePair:
                # \uXXXX\uYYYY - the 4 hex digits of each half
                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                hi = int(self.s[str_pos + 2:str_pos + 6],
                         16) - 0xD800  # high surrogate
                lo = int(self.s[str_pos + 8:str_pos + 12],
                         16) - 0xDC00  # low surrogate

                part = Utf8Encode(0x10000 + (hi << 10) + lo)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # the hex digits after \u
                part = Utf8Encode(int(self.s[str_pos + 2:str_end], 16))

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
872
873
class _Parser(object):
    """Token state and helpers shared by the parsers in this file."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the text to parse
          is_j8: selects the "J8" or "JSON" name for error messages, and is
                 passed on to the lexer
        """
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID, its span in self.s, and its value
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline, or comment."""

        while True:
            # The new token starts where the last one ended.  Since this is
            # updated for ignored tokens too, start_pos ends up at the
            # beginning of the token we stop on.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            is_ignored = self.tok_id in (Id.Ignored_Space,
                                         Id.Ignored_Newline,
                                         Id.Ignored_Comment)
            if not is_ignored:
                break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, this reads exactly one token; nothing is skipped.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
924
925
class Parser(_Parser):
    """JSON and JSON8 Parser.

    This is a recursive descent parser.  Each _Parse* method expects
    self.tok_id to be the first token of the construct, and leaves it at the
    first token after the construct.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse a key-value pair:  string ':' value

        Raises:
          error.Decode: if the key isn't a string, or the colon is missing
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse any value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, or on
            an integer that mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Look at the first char of the token
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the span of the
            # current token, so the error should be created while that's
            # still the integer, not the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on any lexing or parsing error, or if the value is
            followed by more input
        """
        self._Next()
        obj = self._ParseValue()

        # The current token must start at the very end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1065
1066
1067	class Nil8Parser(_Parser):
1068	"""
1069	Tokens not in JSON8:
1070	LParen RParen Symbol
1071
1072	Tokens not in JSON, but in JSON8 and NIL8:
1073	Identifier (unquoted keys)
1074	Ignored_Comment
1075	"""
1076
1077	def __init__(self, s, is_j8):
1078	# type: (str, bool) -> None
1079	_Parser.__init__(self, s, is_j8)
1080
1081	if 0:
1082
1083	def _LookAhead(self):
1084	# type: () -> Id_t
1085	"""
1086	Don't need this right now
1087	"""
1088	end_pos = self.end_pos # look ahead from last token
1089	while True:
1090	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1091	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1092	Id.Ignored_Comment):
1093	break
1094	return tok_id
1095
1096	def _ParseRecord(self):
1097	# type: () -> nvalue_t
1098	"""
1099	Yaks
1100	(self->Next) => (-> self Next)
1101	(self->Next obj.field) => ((-> self Next) (. obj field))
1102
1103	Similar to
1104	((identity identity) 42) => 42 in Clojure
1105
1106	ASDL
1107	(Node left:(. x4beef2))
1108	(Node left !x4beef2)
1109
1110	# Ambiguous because value can be identifier.
1111	# We have to look ahead to and see if there's a colon :
1112	field =
1113	Identifier ':' value
1114	\| value
1115
1116	record = '(' head field* ')'
1117
1118	- Identifier \| Symbol are treated the same, it's a side effect of
1119	the lexing style
1120	- do positional args come before named args
1121	- () is invalid? Use [] for empty list
1122	"""
1123	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1124
1125	items = [] # type: List[nvalue_t]
1126
1127	self._Next()
1128	if self.tok_id == Id.J8_RParen:
1129	self._Next()
1130	return nvalue.List(items)
1131
1132	#log('TOK %s', Id_str(self.tok_id))
1133	while self.tok_id != Id.J8_RParen:
1134	items.append(self._ParseNil8())
1135	#log('TOK 2 %s', Id_str(self.tok_id))
1136
1137	self._Eat(Id.J8_RParen)
1138
1139	return nvalue.List(items)
1140
1141	def _ParseList8(self):
1142	# type: () -> nvalue_t
1143	"""
1144	List8 = '[' value* ']'
1145
1146	No commas, not even optional ones for now.
1147	"""
1148	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1149
1150	items = [] # type: List[nvalue_t]
1151
1152	self._Next()
1153	if self.tok_id == Id.J8_RBracket:
1154	self._Next()
1155	return nvalue.List(items)
1156
1157	#log('TOK %s', Id_str(self.tok_id))
1158	while self.tok_id != Id.J8_RBracket:
1159	items.append(self._ParseNil8())
1160	#log('TOK 2 %s', Id_str(self.tok_id))
1161
1162	self._Eat(Id.J8_RBracket)
1163
1164	return nvalue.List(items)
1165
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value beginning at the current token.

    The value is a record '(' ... ')', a list '[' ... ']', a primitive
    (null, bool, int, float, string), or an unquoted word, which becomes an
    nvalue.Symbol.

    If the token right after that value is an operator, colon or comma, the
    whole thing is an infix expression and is returned in prefix form:

        key: "value"  ->  (: key "value")

    The right operand is parsed by a recursive call, so a chain of infix
    operators nests to the right.

    Raises:
      The error built by self._ParseError, on end of input or on a token
      that can't start a value.
    """
    # Token that starts this value.  self.tok_id changes as we advance, so
    # dispatch on a copy.
    tok = self.tok_id

    if tok == Id.J8_LParen:
        node = self._ParseRecord()  # type: nvalue_t

    elif tok == Id.J8_LBracket:
        node = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok == Id.J8_Null:
        self._Next()
        node = nvalue.Null

    elif tok == Id.J8_Bool:
        # 'true' and 'false' differ in their first character
        is_true = (self.s[self.start_pos] == 't')
        self._Next()
        node = nvalue.Bool(is_true)

    elif tok == Id.J8_Int:
        int_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Int(int(int_text))

    elif tok == Id.J8_Float:
        float_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Float(float(float_text))

    elif tok == Id.J8_String:
        # Grab the decoded payload before advancing to the next token
        decoded = self.decoded
        self._Next()
        node = nvalue.Str(decoded)

    # <- etc.
    elif tok in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                 Id.J8_Comma):
        # unquoted "word" treated like a string
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Symbol(word)

    elif tok == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # Id.Unknown_Tok, Id.J8_RParen, etc.
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(tok)))

    # Not followed by an infix operator: the value stands alone.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return node

    # key: "value" -> (: key "value")
    op_sym = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()
    right = self._ParseNil8()

    expr = nvalue.List([op_sym, node, right])  # type: nvalue_t
    return expr
1238
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the input as exactly one NIL8 value.

    Raises error.Decode, including when anything other than end of input
    follows the value.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1249
1250
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """
        Args:
          s: the whole input, possibly containing many lines
        """
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and its span.

        Args:
          s: label that prefixes the log message
        """
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line, starting at the current token.

        May append a line to 'out'.  Empty lines append nothing.

        Args:
          out: list that the decoded or unquoted line is appended to

        Raises:
          The error built by self._ParseError, when a J8 string is followed
          by something other than optional whitespace and the line end.
        """
        # Leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past newline
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the line end, for stripping
                # trailing whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return its lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]

        # This loop has no 'break', so it only finishes once the current token
        # is Id.Eol_Tok.  A "trailing input" check after it could never fire.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1370
1371
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into lines with J8LinesParser.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    There are 3 kinds of error:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    return lines_parser.Parse()
1386
1387
1388	# vim: sw=4