#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic as below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_YHex, Id.Char_Hex):  # \yff, \xff is legacy
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


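# An illustrative sketch, not part of Oils: _DemoUBracedDecode is a
# hypothetical helper showing the arithmetic of the Id.Char_UBraced arm
# above, with a plain string standing in for lexer.TokenSlice(tok, 3, -1).
def _DemoUBracedDecode():
    # type: () -> None
    s = r'\u{2620}'  # token text of a braced escape
    code_point = int(s[3:-1], 16)  # drop the leading \u{ and trailing }
    assert code_point == 0x2620
    # EvalCharLiteralForRegex() wraps this in CharCode(tok, 0x2620, True),
    # where True means u_braced.

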
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """All types of C-style backslash-escaped strings use this function:

    - echo -e and printf at runtime
    - $'' and b'' u'' at parse time
    """
    code_point = -1

    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt --unset parse_backslash detects Unknown_Backslash at PARSE
        # time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377'
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    # Note: we're not doing the surrogate range and max code point checks for
    # echo -e and printf:
    #
    # 1. It's not compatible with bash
    # 2. We don't have good error locations anyway

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        code_point = int(s, 16)
        return j8.Utf8Encode(code_point)

    else:
        raise AssertionError(Id_str(id_))


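# An illustrative sketch, not part of Oils: exercises a few arms of
# EvalCStringToken() with hand-written (id, value) pairs.  The first assert
# assumes consts.LookupCharC('n') maps to a newline, and the \u{48} case
# assumes j8.Utf8Encode() returns the UTF-8 encoding of the code point.
def _DemoEvalCStringToken():
    # type: () -> None
    assert EvalCStringToken(Id.Char_OneChar, r'\n') == '\n'
    assert EvalCStringToken(Id.Char_Hex, r'\xff') == '\xff'
    assert EvalCStringToken(Id.Char_Octal3, r'\377') == '\xff'  # 255 in octal
    assert EvalCStringToken(Id.Char_UBraced, r'\u{48}') == 'H'
    assert EvalCStringToken(Id.Char_Stop, r'\c') is None  # the stop sentinel

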
def EvalSingleQuoted(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """ Done at parse time """
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = []
        for t in tokens:
            # More parse time validation for code points.
            # EvalCStringToken() redoes some of this work, but right now it's
            # shared with dynamic echo -e / printf, which don't have tokens.

            # Only check J8 style strings, not Char_Unicode4 and Char_Unicode8,
            # which are in OSH
            if t.id == Id.Char_UBraced:
                s = lexer.TokenSlice(t, 3, -1)
                code_point = int(s, 16)
                if code_point > 0x10ffff:
                    p_die("Code point can't be greater than U+10ffff", t)
                if 0xD800 <= code_point and code_point < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(t), t)

            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

    else:
        raise AssertionError(id_)
    return ''.join(strs)


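# An illustrative sketch, not part of Oils: _DemoCheckCodePoint is a
# hypothetical helper restating the parse-time validation that
# EvalSingleQuoted() applies to \u{...} escapes, without needing a Token.
# It returns an error message, where the real code calls p_die() with the
# offending token.
def _DemoCheckCodePoint(code_point):
    # type: (int) -> Optional[str]
    if code_point > 0x10ffff:
        return "Code point can't be greater than U+10ffff"
    if 0xD800 <= code_point and code_point < 0xE000:
        return "escape is illegal because it's in the surrogate range"
    return None  # valid - EvalCStringToken() will UTF-8 encode it

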
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the token before ''' etc. is space to trim """
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """ Determine if the space/newlines after ''' should be trimmed

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


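# An illustrative sketch, not part of Oils: a plain-Python equivalent of
# _TokenConsistsOf() over an ordinary string, without Token or the mycpp
# byte APIs.  _IsLeadingSpace() corresponds to byte_set = ' \t', and
# _IsTrailingSpace() to byte_set = ' \n\r\t'.
def _DemoConsistsOf(s, byte_set):
    # type: (str, str) -> bool
    for ch in s:
        if ch not in byte_set:
            return False
    return True

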
# Whitespace trimming algorithms:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant


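# Worked example (illustration): in
#
#     var x = '''
#       hello
#       '''
#
# the token after the opening ''' is just '\n' (trimmed by step 1), and the
# token before the closing ''' is the 6 spaces of indentation (step 2).
# Step 3 bumps col by 6 and shrinks length by 6 on each line that starts
# with those 6 spaces, so x == "hello\n".

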
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens, mutating the list in place.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh.
    No new tokens are created; we bump tok.col / tok.length and change
    Id.Lit_Chars into Id.Lit_CharsWithoutPrefix, so every token still points
    into the original line.
    """
    if 0:  # disabled debug dump of the token list
        import sys
        from asdl import format as fmt

        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = ''' # strip initial newline/whitespace
    # x
    # '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first token

    # Figure out what to strip, based on the last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last token

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping the
        # start column
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)


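# An illustrative sketch, not part of Oils: a hypothetical string-level
# version of the stripping loops in RemoveLeadingSpaceDQ() and
# RemoveLeadingSpaceSQ().  The real code never copies text - it bumps
# tok.col / tok.length so tokens still point into the original source line
# (the lossless invariant).
def _DemoStripLines(lines, to_strip):
    # type: (List[str], str) -> List[str]
    out = []  # type: List[str]
    for line in lines:
        if line.startswith(to_strip):
            out.append(line[len(to_strip):])  # like tok.col = n; tok.length -= n
        else:
            out.append(line)
    return out

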
# vim: sw=4