osh/word_compile.py

OILS / osh / word_compile.py View on Github | oilshell.org

309 lines, 165 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	"""osh/word_compile.py.
4
5	These functions are called after parsing, but don't depend on any runtime
6	values.
7	"""
8
9	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
10	from _devbuild.gen.syntax_asdl import (
11	Token,
12	CharCode,
13	word_part_e,
14	word_part_t,
15	)
16	from core.error import p_die
17	from data_lang import j8
18	from frontend import consts
19	from frontend import lexer
20	from mycpp import mylib
21	from mycpp.mylib import log, switch
22
23	from typing import List, Optional, cast
24
25
def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """Convert a single token from a regex char class into a CharCode.

    The token ID selects how the integer code is computed: a braced unicode
    escape, a one-char backslash escape, a hex escape, or a literal token
    that is exactly one character long.

    Args:
      tok: the token to evaluate

    Returns:
      A CharCode built from the token, its integer code, and a flag that is
      True only for the braced unicode form.

    Raises:
      AssertionError: if the token ID is not one of the handled kinds, or a
        literal token is not exactly one character long.
    """
    kind = tok.id
    text = lexer.TokenVal(tok)

    if kind == Id.Char_UBraced:  # \u{123}
        # Slice off the 3 leading characters and the closing brace, leaving
        # only the hex digits.
        code = int(lexer.TokenSlice(tok, 3, -1), 16)
        return CharCode(tok, code, True)  # True means u_braced

    if kind == Id.Char_OneChar:  # \'
        # text[1] -> mylib.ByteAt()
        unescaped = consts.LookupCharC(text[1])
        return CharCode(tok, ord(unescaped), False)

    if kind == Id.Char_Hex:
        code = int(lexer.TokenSliceLeft(tok, 2), 16)
        return CharCode(tok, code, False)

    if kind in (Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
        # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
        # Id.Expr_Name: [a-z] is ['a'-'Z'], and [a z] is ['a' 'Z']
        # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']
        assert len(text) == 1, tok
        # text[0] -> mylib.ByteAt()
        return CharCode(tok, ord(text[0]), False)

    raise AssertionError(tok)
62
63
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """Evaluate one token of a C-style backslash-escaped string.

    Every flavor of such strings goes through this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time

    Args:
      id_: the token ID, which selects the decoding rule
      value: the source text of the token

    Returns:
      The string the token evaluates to, or None for Id.Char_Stop, which
      serves as a special sentinel.

    Raises:
      AssertionError: if id_ is not a token kind handled here.
    """
    with switch(id_) as case:
        if case(Id.Lit_Chars, Id.Lit_CharsWithoutPrefix,
                Id.Unknown_Backslash):
            # shopt -u parse_backslash detects Unknown_Backslash at PARSE
            # time in YSH.
            return value

        elif case(Id.Right_SingleQuote):
            # single quotes in the middle of a triple quoted string
            return value

        elif case(Id.Char_OneChar):
            # The character after the backslash picks the result
            return consts.LookupCharC(value[1])

        elif case(Id.Char_Stop):  # \c returns a special sentinel
            return None

        elif case(Id.Char_Octal3, Id.Char_Octal4):
            # $'\377' has a 1-char prefix; echo -e '\0377' has a 2-char one
            prefix_len = 1
            if id_ != Id.Char_Octal3:
                prefix_len = 2

            byte = int(value[prefix_len:], 8)
            if byte >= 256:
                # Wrap around instead of failing.
                # NOTE: This is for strict mode
                #raise AssertionError('Out of range')
                byte = byte % 256
            return chr(byte)

        elif case(Id.Char_Hex, Id.Char_YHex):
            return chr(int(value[2:], 16))

        # Note: we're not doing the surrogate range and max code point checks
        # for echo -e and printf:
        #
        # 1. It's not compatible with bash
        # 2. We don't have good error locations anyway

        elif case(Id.Char_Unicode4, Id.Char_Unicode8):
            return j8.Utf8Encode(int(value[2:], 16))

        elif case(Id.Char_UBraced):
            # \u{123} -- keep only what is between the braces
            return j8.Utf8Encode(int(value[3:-1], 16))

        else:
            raise AssertionError(Id_str(id_))
124
125
def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Compute the string value of a single-quoted literal at parse time.

    Args:
      id_: ID of the LEFT quote token, which selects raw vs. C-style handling
      tokens: the tokens between the quotes

    Returns:
      The concatenation of the evaluated tokens.

    Raises:
      AssertionError: if id_ is not one of the left quote IDs handled here.
      Also calls p_die() when a braced unicode escape is above U+10ffff or
      falls in the surrogate range.
    """
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
               Id.Left_TSingleQuote, Id.Left_RTSingleQuote):
        # Raw flavors: each token stands for its own source text
        raw_strs = [lexer.TokenVal(tok) for tok in tokens]
        return ''.join(raw_strs)

    if id_ not in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                   Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                   Id.Left_BTSingleQuote):
        raise AssertionError(id_)

    if 0:
        for tok in tokens:
            print('T %s' % tok)

    decoded = []  # type: List[str]
    for tok in tokens:
        # More parse time validation for code points.
        # EvalCStringToken() redoes some of this work, but right now it's
        # shared with dynamic echo -e / printf, which don't have tokens.

        # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
        # which are in OSH
        if tok.id == Id.Char_UBraced:
            code_point = int(lexer.TokenSlice(tok, 3, -1), 16)
            if code_point > 0x10ffff:
                p_die("Code point can't be greater than U+10ffff", tok)
            if 0xD800 <= code_point and code_point < 0xE000:
                p_die(
                    r"%s escape is illegal because it's in the surrogate range"
                    % lexer.TokenVal(tok), tok)

        decoded.append(EvalCStringToken(tok.id, lexer.TokenVal(tok)))

    return ''.join(decoded)
163
164
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Check whether every byte spanned by a token belongs to byte_set.

    The bytes are read from the token's line, from tok.col up to but not
    including tok.col + tok.length.

    Args:
      tok: the token to scan
      byte_set: string holding the allowed bytes

    Returns:
      False as soon as a byte outside byte_set is found, otherwise True.  A
      token with no bytes yields True.
    """
    pos = tok.col
    stop = pos + tok.length
    while pos < stop:
        if not mylib.ByteInSet(mylib.ByteAt(tok.line.content, pos), byte_set):
            return False
        pos += 1
    return True
174
175
def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Tell whether the token before a closing ''' etc. is space to trim.

    True when the token is made up only of spaces and tabs.
    """
    indent_bytes = ' \t'
    return _TokenConsistsOf(tok, indent_bytes)
180
181
def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """Tell whether the space/newlines after an opening ''' should be trimmed.

    True when the token is made up only of spaces, newlines, carriage returns
    and tabs.  This is like s.isspace(), minus the legacy form feed and
    vertical tab characters, and minus Unicode whitespace.
    """
    blank_bytes = ' \n\r\t'
    return _TokenConsistsOf(tok, blank_bytes)
189
190
191	# Whitespace trimming algorithms:
192	#
193	# 1. Trim what's after opening ''' or """, if it's whitespace
194	# 2. Determine what's before closing ''' or """ -- this is what you strip
195	# 3. Strip each line by mutating the token
196	# - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to maintain
197	# the lossless invariant
198
199
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    """Strip leading whitespace from the parts of a multi-line string.

    The list and the literal tokens in it are mutated in place:

    1. A first literal part that is only whitespace is removed.
    2. A last literal part that is only spaces/tabs is removed, and its text
       becomes the prefix to strip.
    3. Each remaining literal token that starts at column 0 with that prefix
       has its start column advanced past it, and its ID changed from
       Id.Lit_Chars to Id.Lit_CharsWithoutPrefix.

    Nothing is stripped when there are fewer than 2 parts, or when the last
    part doesn't qualify in step 2.
    """
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_head = parts[0]
    if (UP_head.tag() == word_part_e.Literal and
            _IsTrailingSpace(cast(Token, UP_head))):
        # Remove the first part.  TODO: This could be expensive if there are
        # many lines.
        parts.pop(0)

    # The last part determines the prefix to strip, if any
    UP_tail = parts[-1]
    if UP_tail.tag() != word_part_e.Literal:
        return
    tail = cast(Token, UP_tail)
    if not _IsLeadingSpace(tail):
        return

    to_strip = lexer.TokenVal(tail)
    parts.pop()  # Remove the last part

    prefix_len = len(to_strip)
    for part in parts:
        if part.tag() == word_part_e.Literal:
            lit_tok = cast(Token, part)

            begins_line = lit_tok.col == 0
            if begins_line and lexer.TokenStartsWith(lit_tok, to_strip):
                # TODO: Lexer should not populate this!
                assert lit_tok.tval is None, lit_tok.tval

                # Skip the prefix by bumping the start column
                lit_tok.col = prefix_len
                lit_tok.length -= prefix_len

                assert lit_tok.id == Id.Lit_Chars, lit_tok
                # --tool lossless-cat has a special case for this
                lit_tok.id = Id.Lit_CharsWithoutPrefix
244
245
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from the tokens of a multi-line string.

    The list and its tokens are mutated IN PLACE; nothing is returned.

    1. If the first token is Id.Lit_Chars and consists only of whitespace
       (what follows the opening quotes), it is removed.
    2. If the last token is Id.Lit_Chars and consists only of spaces/tabs
       (what precedes the closing quotes), it is removed, and its text
       becomes the prefix to strip.
    3. Each remaining token that starts at column 0 with that prefix has its
       start column advanced past it, and its ID changed from Id.Lit_Chars to
       Id.Lit_CharsWithoutPrefix.

    Nothing is stripped when there are fewer than 2 tokens, or when the last
    token doesn't qualify in step 2.

    Must respect lossless invariant - see test/lossless/multiline-str.sh.
    No new tokens are created; existing tokens are retagged instead.
    """
    if 0:
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    # var x = '''    # strip initial newline/whitespace
    #   x
    #   '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            # Check the invariant BEFORE touching the token, so a failed
            # assertion never leaves it half-modified.
            assert tok.id == Id.Lit_Chars, tok

            tok.col = n
            tok.length -= n

            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
307
308
309	# vim: sw=4