OILS / osh / braces.py View on Github | oils.pub

541 lines, 327 significant
1#!/usr/bin/env python2
2"""
3braces.py - Implementation of {andy,bob}@example.com
4
5NOTE: bash implements brace expansion in the braces.c file (835 lines). It
6uses goto!
7
8Possible optimization flags for Compound:
9- has Lit_LBrace, LitRBrace -- set during word_parse phase
  - if it has both, then do BraceDetect
11- has BracedTuple -- set during BraceDetect
12 - if it does, then do the expansion
13- has Lit_Star, ?, [ ] -- globbing?
14 - but after expansion do you still have those flags?
15"""
16from __future__ import print_function
17
18from _devbuild.gen.id_kind_asdl import Id, Id_t
19from _devbuild.gen.syntax_asdl import (
20 Token,
21 CompoundWord,
22 word,
23 word_e,
24 word_t,
25 word_part,
26 word_part_e,
27 word_part_t,
28)
29from core.error import p_die
30from frontend import lexer
31from frontend import match
32from mycpp import mylib
33from mycpp.mylib import log, tagswitch
34from osh import word_
35
36from typing import List, Optional, cast, TYPE_CHECKING
37if TYPE_CHECKING:
38 from frontend.match import SimpleLexer
39
40_ = log
41
42# Step has to be strictly positive or negative, so we can use 0 for 'not
43# specified'.
44NO_STEP = 0
45
46
# The brace language has no syntax errors! But we still need to abort the
# parse. TODO: Should we expose a strict version later?
class _NotARange(Exception):
    """Internal signal: the text inside {} doesn't parse as a range.

    Callers (see _RangePartDetect) catch it and fall back to treating the
    text literally, so the message is never shown to a user.
    """

    def __init__(self, s):
        # type: (str) -> None
        # The message is intentionally discarded; the exception only aborts
        # the range parse.
        pass
54
55
class _RangeParser(object):
    """Grammar for ranges:

    step = Dots Int
    int_range = Int Dots Int step?
    char_range = Char Dots Char step?
    range = (int_range | char_range) Eof  # ensure no extra tokens!
    """

    def __init__(self, lexer, blame_tok):
        # type: (SimpleLexer, Token) -> None
        self.lexer = lexer
        self.blame_tok = blame_tok  # location used for p_die() errors

        # One token of lookahead; advanced by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''

    def _Next(self):
        # type: () -> None
        """Move to the next token."""
        self.token_type, self.token_val = self.lexer.Next()

    def _Eat(self, token_type):
        # type: (Id_t) -> str
        """Consume the current token if it matches, else abort the parse.

        Raises _NotARange on mismatch -- not a user-visible error, it just
        means "this wasn't a range after all".
        """
        if self.token_type != token_type:
            raise _NotARange('Expected %d, got %d' %
                             (token_type, self.token_type))
        val = self.token_val
        self._Next()
        return val

    def _ParseStep(self):
        # type: () -> int
        """Parse the '..N' step suffix; returns a nonzero integer."""
        self._Next()  # past Dots
        step = int(self._Eat(Id.Range_Int))
        if step == 0:
            p_die("Step can't be 0", self.blame_tok)
        return step

    def _ParseRange(self, range_kind):
        # type: (Id_t) -> word_part.BracedRange
        """Parse 'start..end' with optional '..step', for int or char kinds."""
        start = self.token_val
        self._Next()  # past Char

        self._Eat(Id.Range_Dots)
        end = self._Eat(range_kind)

        if self.token_type == Id.Range_Dots:
            step = self._ParseStep()
        else:
            step = NO_STEP  # Parse() fills in a direction-appropriate default

        part = word_part.BracedRange(self.blame_tok, range_kind, start, end,
                                     step)
        return part

    def Parse(self):
        # type: () -> word_part.BracedRange
        """Parse a whole range, validate the step, and fill in defaults.

        Raises _NotARange when the input doesn't look like a range at all;
        calls p_die() when it does, but the explicit step has the wrong sign.
        """
        self._Next()
        if self.token_type == Id.Range_Int:
            part = self._ParseRange(self.token_type)

            # Check step validity and fill in a default
            start = int(part.start)
            end = int(part.end)
            if start < end:
                if part.step == NO_STEP:
                    part.step = 1
                if part.step <= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for ascending integer range' %
                        part.step, self.blame_tok)
            elif start > end:
                if part.step == NO_STEP:
                    part.step = -1
                if part.step >= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for descending integer range' %
                        part.step, self.blame_tok)
            else:
                # {1..1} singleton range is dumb but I suppose consistent
                part.step = 1

        elif self.token_type == Id.Range_Char:
            part = self._ParseRange(self.token_type)

            # Compare integers because mycpp doesn't support < on strings!
            start_num = ord(part.start[0])
            end_num = ord(part.end[0])

            # Check step validity and fill in a default
            if start_num < end_num:
                if part.step == NO_STEP:
                    part.step = 1
                if part.step <= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for ascending character range' %
                        part.step, self.blame_tok)
            elif start_num > end_num:
                if part.step == NO_STEP:
                    part.step = -1
                if part.step >= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for descending character range' %
                        part.step, self.blame_tok)
            else:
                # {a..a} singleton range is dumb but I suppose consistent
                part.step = 1

            # Check matching cases
            upper1 = part.start.isupper()
            upper2 = part.end.isupper()
            if upper1 != upper2:
                p_die('Mismatched cases in character range', self.blame_tok)

        else:
            raise _NotARange('')

        # prevent unexpected trailing tokens
        self._Eat(Id.Eol_Tok)
        return part
177
178
def _RangePartDetect(tok):
    # type: (Token) -> Optional[word_part.BracedRange]
    """Return a BracedRange part if the token's text parses as a range.

    Returns None when the text isn't a range, so the caller can treat it
    literally.
    """
    range_lexer = match.BraceRangeLexer(lexer.TokenVal(tok))
    parser = _RangeParser(range_lexer, tok)
    try:
        return parser.Parse()
    except _NotARange as e:
        return None
190
191
class _StackFrame(object):
    """State saved for each unmatched Lit_LBrace while scanning a word."""

    def __init__(self, cur_parts):
        # type: (List[word_part_t]) -> None
        self.cur_parts = cur_parts  # parts accumulated before the '{' (prefix)
        self.alt_part = word_part.BracedTuple([])  # collects the alternatives
        self.saw_comma = False  # {foo} without a comma isn't an alternation
199
200
def BraceDetect(w):
    # type: (CompoundWord) -> Optional[word.BracedTree]
    """Return a new word if the input word looks like a brace expansion.

    e.g. {a,b} or {1..10..2} (TODO)
    Do we want to accept {01..02} ?  zsh does make some attempt to do this too.

    NOTE: This is an iterative algorithm that uses a stack.  The grammar-based
    approach didn't seem natural.

    It's not LL(1) because of 'part*'.  And not LL(k) even?  Maybe it be
    handled with an LR parser?  In any case the imperative algorithm with
    'early return' for a couple cases is fairly simple.

    Grammar:
      # an alternative is a literal, possibly empty, or another brace_expr

      part = <any part except Literal>
      alt = part* | brace_expr

      # a brace_expr is group of at least 2 braced and comma-separated
      # alternatives, with optional prefix and suffix.
      brace_expr = part* '{' alt ',' alt (',' alt)* '}' part*
    """
    # Errors:
    # }a{ - stack depth dips below 0
    # {a,b}{ - Stack depth doesn't end at 0
    # {a} - no comma, and also not an numeric range

    # The shortest possible brace expansion is {,}.  This heuristic prevents a
    # lot of garbage from being created, since otherwise nearly every word
    # would be checked.  We could be even more precise but this is cheap.
    if len(w.parts) < 3:
        return None

    cur_parts = []  # type: List[word_part_t]
    stack = []  # type: List[_StackFrame]

    found = False  # did we see at least one complete {...} group?

    for i, part in enumerate(w.parts):
        # append=True means: copy this part through to cur_parts unchanged
        append = True
        UP_part = part
        if part.tag() == word_part_e.Literal:
            part = cast(Token, UP_part)
            id_ = part.id
            if id_ == Id.Lit_LBrace:
                # Save prefix parts.  Start new parts list.
                new_frame = _StackFrame(cur_parts)
                stack.append(new_frame)
                cur_parts = []  # clear
                append = False
                found = True  # assume found, but can early exit with None later

            elif id_ == Id.Lit_Comma:  # Append a new alternative.
                # NOTE: Should we allow this:
                # ,{a,b}
                # or force this:
                # \,{a,b}
                # ?  We're forcing braces right now but not commas.
                if len(stack):
                    stack[-1].saw_comma = True
                    stack[-1].alt_part.words.append(CompoundWord(cur_parts))
                    cur_parts = []  # clear
                    append = False

            elif id_ == Id.Lit_RBrace:
                if len(stack) == 0:  # e.g. echo {a,b}{ -- unbalanced {
                    return None  # do not expand ANYTHING because of invalid syntax

                # Detect {1..10} and {1..10..2}

                #log('stack[-1]: %s', stack[-1])
                #log('cur_parts: %s', cur_parts)

                range_part = None  # type: Optional[word_part_t]
                # only allow {1..3}, not {a,1..3}
                if not stack[-1].saw_comma and len(cur_parts) == 1:
                    # It must be ONE part.  For example, -1..-100..-2 is
                    # initially lexed as a single Lit_Chars token.
                    part2 = cur_parts[0]
                    if part2.tag() == word_part_e.Literal:
                        tok = cast(Token, part2)
                        if tok.id == Id.Lit_Chars:
                            range_part = _RangePartDetect(tok)
                            if range_part:
                                # Pop the frame: prefix parts + the range
                                frame = stack.pop()
                                cur_parts = frame.cur_parts
                                cur_parts.append(range_part)
                                append = False

                # It doesn't look like a range -- process it as the last
                # element in {a,b,c}

                if not range_part:
                    if not stack[
                            -1].saw_comma:  # {foo} is not a real alternative
                        return None  # early return

                    stack[-1].alt_part.words.append(CompoundWord(cur_parts))

                    # Pop the frame: prefix parts + the completed BracedTuple
                    frame = stack.pop()
                    cur_parts = frame.cur_parts
                    cur_parts.append(frame.alt_part)
                    append = False

        if append:
            cur_parts.append(part)

    # Unbalanced: some '{' was never closed, e.g. {a,b}{
    if len(stack) != 0:
        return None

    if found:
        return word.BracedTree(cur_parts)
    else:
        return None
317
318
def BraceDetectAll(words):
    # type: (List[CompoundWord]) -> List[word_t]
    """Return a new list of words, possibly with BracedTree instances."""
    result = []  # type: List[word_t]
    for w in words:
        tree = BraceDetect(w)
        if tree is not None:
            result.append(tree)  # word is a brace expansion
        else:
            result.append(w)  # pass through unchanged
    return result
330
331
332def _LeadingZeros(s):
333 # type: (str) -> int
334 n = 0
335 for c in s:
336 if c == '0':
337 n += 1
338 else:
339 break
340 return n
341
342
343def _IntToString(i, width):
344 # type: (int, int) -> str
345 s = str(i)
346 n = len(s)
347 if n < width: # width might be 0
348 # pad with zeros
349 pad = '0' * (width - n)
350 return pad + s
351 else:
352 return s
353
354
def _RangeStrings(part):
    # type: (word_part.BracedRange) -> List[str]
    """Evaluate a BracedRange into the list of strings it denotes.

    NOTE(review): termination of the do-while loops below relies on
    _RangeParser having set a nonzero step with the right sign for the
    direction of the range.
    """

    if part.kind == Id.Range_Int:
        nums = []  # type: List[str]

        # {01..10} style: if either endpoint has leading zeros, pad every
        # number to the width of the endpoint with more leading zeros.
        z1 = _LeadingZeros(part.start)
        z2 = _LeadingZeros(part.end)

        if z1 == 0 and z2 == 0:
            width = 0  # no padding
        else:
            if z1 < z2:
                width = len(part.end)
            else:
                width = len(part.start)

        n = int(part.start)
        end = int(part.end)
        step = part.step
        if step > 0:
            # Ascending: append first, then stop once we step past 'end'
            while True:
                nums.append(_IntToString(n, width))
                n += step
                if n > end:
                    break
        else:
            # Descending (step < 0)
            while True:
                nums.append(_IntToString(n, width))
                n += step
                if n < end:
                    break

        return nums

    else:  # Id.Range_Char
        chars = []  # type: List[str]

        # Iterate over code points; ord() requires start/end to be single
        # characters (Range_Char tokens -- TODO confirm against the lexer)
        n = ord(part.start)
        ord_end = ord(part.end)
        step = part.step
        if step > 0:
            while True:
                chars.append(chr(n))
                n += step
                if n > ord_end:
                    break
        else:
            while True:
                chars.append(chr(n))
                n += step
                if n < ord_end:
                    break

        return chars
410
411
def _ExpandPart(
        parts,  # type: List[word_part_t]
        first_alt_index,  # type: int
        suffixes,  # type: List[List[word_part_t]]
):
    # type: (...) -> List[List[word_part_t]]
    """Mutually recursive with _BraceExpand.

    Expands the single BracedTuple or BracedRange at first_alt_index into the
    cross product of (prefix parts) x (each alternative) x (each suffix).

    Args:
      parts: input parts
      first_alt_index: index of the first BracedTuple
      suffixes: List of suffixes to append.
    """
    out = []  # type: List[List[word_part_t]]

    prefix = parts[:first_alt_index]
    expand_part = parts[first_alt_index]

    UP_part = expand_part
    with tagswitch(expand_part) as case:
        if case(word_part_e.BracedTuple):
            expand_part = cast(word_part.BracedTuple, UP_part)
            # Call _BraceExpand on each of the inner words too!
            expanded_alts = []  # type: List[List[word_part_t]]
            for w in expand_part.words:
                expanded_alts.extend(_BraceExpand(w.parts))

            for alt_parts in expanded_alts:
                for suffix in suffixes:
                    # prefix + alternative + suffix
                    out_parts = []  # type: List[word_part_t]
                    out_parts.extend(prefix)
                    out_parts.extend(alt_parts)
                    out_parts.extend(suffix)
                    out.append(out_parts)

        elif case(word_part_e.BracedRange):
            expand_part = cast(word_part.BracedRange, UP_part)
            # Not mutually recursive with _BraceExpand
            strs = _RangeStrings(expand_part)

            # Often prefix and suffixes are empty, but there's not that much to
            # optimize
            # log('prefix %s, suffixes %s, strs %s', prefix, suffixes, strs)

            for s in strs:
                for suffix in suffixes:
                    out_parts_ = []  # type: List[word_part_t]
                    out_parts_.extend(prefix)

                    # TODO: Does it help to preserve location info?
                    # t = Token(Id.Lit_Chars, expand_part.locs[0], s)
                    t = lexer.DummyToken(Id.Lit_Chars, s)

                    out_parts_.append(t)
                    out_parts_.extend(suffix)
                    out.append(out_parts_)

        else:
            # BraceDetect only inserts these two part types
            raise AssertionError()

    return out
473
474
def _BraceExpand(parts):
    # type: (List[word_part_t]) -> List[List[word_part_t]]
    """Mutually recursive with _ExpandPart.

    Returns the list of part lists this word's parts expand to.  A part list
    with no BracedTuple/BracedRange expands to just itself.
    """

    # manual GC point, because brace expansion is a separate stage that does a
    # bunch of computation outside the interpreter
    mylib.MaybeCollect()

    # Find the first alternation; we only need to distinguish 0, 1, and
    # "2 or more" of them.
    num_alts = 0
    first_alt_index = -1
    for i, part in enumerate(parts):
        tag = part.tag()
        if tag in (word_part_e.BracedTuple, word_part_e.BracedRange):
            num_alts += 1
            if num_alts == 1:
                first_alt_index = i
            elif num_alts == 2:
                break  # don't need to count anymore

    # NOTE: There are TWO recursive calls here, not just one -- one for
    # nested {}, and one for adjacent {}.  This is hard to do iteratively.
    if num_alts == 0:
        return [parts]

    elif num_alts == 1:
        # Single alternation: everything after it is the one literal suffix
        suffix = parts[first_alt_index + 1:]
        return _ExpandPart(parts, first_alt_index, [suffix])

    else:
        # Now call it on the tail
        tail_parts = parts[first_alt_index + 1:]
        suffixes = _BraceExpand(tail_parts)  # recursive call
        return _ExpandPart(parts, first_alt_index, suffixes)
508
509
def BraceExpandWords(words):
    # type: (List[word_t]) -> List[CompoundWord]
    """Expand each BracedTree in words into one or more CompoundWords.

    Plain Compound words pass through unchanged.  Tilde detection is run on
    every newly expanded word here, since it couldn't be done earlier.
    """
    out = []  # type: List[CompoundWord]
    for w in words:
        UP_w = w
        with tagswitch(w) as case:
            if case(word_e.BracedTree):
                w = cast(word.BracedTree, UP_w)
                # Note: for the case of {1..100000}, this is a flat list of
                # Token.  Would be nice to optimize, but we don't really know
                # the structure ahead of time
                parts_list = _BraceExpand(w.parts)
                for parts in parts_list:
                    expanded = CompoundWord(parts)

                    # Now do tilde detection on brace-expanded word
                    ti = word_.TildeDetect2(expanded)
                    if ti:
                        out.append(ti)
                    else:
                        out.append(expanded)

            elif case(word_e.Compound):
                w = cast(CompoundWord, UP_w)

                # Already did tilde detection before expansion
                out.append(w)

            else:
                # Only BracedTree and Compound are expected at this stage
                raise AssertionError(w.tag())

    return out