OILS / osh / braces.py View on Github | oils.pub

541 lines, 327 significant
1#!/usr/bin/env python2
2"""
3braces.py - Implementation of {andy,bob}@example.com
4
5NOTE: bash implements brace expansion in the braces.c file (835 lines). It
6uses goto!
7
8Possible optimization flags for Compound:
9- has Lit_LBrace, LitRBrace -- set during word_parse phase
  - if it has both, then do BraceDetect
11- has BracedTuple -- set during BraceDetect
12 - if it does, then do the expansion
13- has Lit_Star, ?, [ ] -- globbing?
14 - but after expansion do you still have those flags?
15"""
16from __future__ import print_function
17
18from _devbuild.gen.id_kind_asdl import Id, Id_t
19from _devbuild.gen.syntax_asdl import (
20 Token,
21 CompoundWord,
22 word,
23 word_e,
24 word_t,
25 word_part,
26 word_part_e,
27 word_part_t,
28)
29from core.error import p_die
30from frontend import lexer
31from frontend import match
32from mycpp import mylib
33from mycpp.mylib import log, tagswitch
34from osh import word_
35
36from typing import List, Optional, cast, TYPE_CHECKING
37if TYPE_CHECKING:
38 from frontend.match import SimpleLexer
39
40_ = log
41
42# Step has to be strictly positive or negative, so we can use 0 for 'not
43# specified'.
44NO_STEP = 0
45
46
# The brace language has no syntax errors! But we still need to abort the
# parse. TODO: Should we expose a strict version later?
class _NotARange(Exception):
    """Internal signal: the text inside {} doesn't parse as a range.

    Callers (see _RangePartDetect) catch it and fall back to treating the
    text literally, so the message is never shown to a user.
    """

    def __init__(self, s):
        # type: (str) -> None
        # The message is intentionally discarded; the exception only aborts
        # the range parse.
        pass
54
55
class _RangeParser(object):
    """Grammar for ranges:

    step = Dots Int
    int_range = Int Dots Int step?
    char_range = Char Dots Char step?
    range = (int_range | char_range) Eof  # ensure no extra tokens!
    """

    def __init__(self, lexer, blame_tok):
        # type: (SimpleLexer, Token) -> None
        self.lexer = lexer
        self.blame_tok = blame_tok  # location used for p_die() errors

        # One token of lookahead; advanced by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''

    def _Next(self):
        # type: () -> None
        """Move to the next token."""
        self.token_type, self.token_val = self.lexer.Next()

    def _Eat(self, token_type):
        # type: (Id_t) -> str
        """Consume the current token if it matches, else abort the parse.

        Raises _NotARange on mismatch -- not a user-visible error, it just
        means "this wasn't a range after all".
        """
        if self.token_type != token_type:
            raise _NotARange('Expected %d, got %d' %
                             (token_type, self.token_type))
        val = self.token_val
        self._Next()
        return val

    def _ParseStep(self):
        # type: () -> int
        """Parse the '..N' step suffix; returns a nonzero integer."""
        self._Next()  # past Dots
        step = int(self._Eat(Id.Range_Int))
        if step == 0:
            p_die("Step can't be 0", self.blame_tok)
        return step

    def _ParseRange(self, range_kind):
        # type: (Id_t) -> word_part.BracedRange
        """Parse 'start..end' with optional '..step', for int or char kinds."""
        start = self.token_val
        self._Next()  # past Char

        self._Eat(Id.Range_Dots)
        end = self._Eat(range_kind)

        if self.token_type == Id.Range_Dots:
            step = self._ParseStep()
        else:
            step = NO_STEP  # Parse() fills in a direction-appropriate default

        part = word_part.BracedRange(self.blame_tok, range_kind, start, end,
                                     step)
        return part

    def Parse(self):
        # type: () -> word_part.BracedRange
        """Parse a whole range, validate the step, and fill in defaults.

        Raises _NotARange when the input doesn't look like a range at all;
        calls p_die() when it does, but the explicit step has the wrong sign.
        """
        self._Next()
        if self.token_type == Id.Range_Int:
            part = self._ParseRange(self.token_type)

            # Check step validity and fill in a default
            start = int(part.start)
            end = int(part.end)
            if start < end:
                if part.step == NO_STEP:
                    part.step = 1
                if part.step <= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for ascending integer range' %
                        part.step, self.blame_tok)
            elif start > end:
                if part.step == NO_STEP:
                    part.step = -1
                if part.step >= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for descending integer range' %
                        part.step, self.blame_tok)
            else:
                # {1..1} singleton range is dumb but I suppose consistent
                part.step = 1

        elif self.token_type == Id.Range_Char:
            part = self._ParseRange(self.token_type)

            # Compare integers because mycpp doesn't support < on strings!
            start_num = ord(part.start[0])
            end_num = ord(part.end[0])

            # Check step validity and fill in a default
            if start_num < end_num:
                if part.step == NO_STEP:
                    part.step = 1
                if part.step <= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for ascending character range' %
                        part.step, self.blame_tok)
            elif start_num > end_num:
                if part.step == NO_STEP:
                    part.step = -1
                if part.step >= 0:  # 0 step is not allowed
                    p_die(
                        'Invalid step %d for descending character range' %
                        part.step, self.blame_tok)
            else:
                # {a..a} singleton range is dumb but I suppose consistent
                part.step = 1

            # Check matching cases
            upper1 = part.start.isupper()
            upper2 = part.end.isupper()
            if upper1 != upper2:
                p_die('Mismatched cases in character range', self.blame_tok)

        else:
            raise _NotARange('')

        # prevent unexpected trailing tokens
        self._Eat(Id.Eol_Tok)
        return part
177
178
def _RangePartDetect(tok):
    # type: (Token) -> Optional[word_part.BracedRange]
    """Return a BracedRange part if the token's text parses as a range.

    Returns None when the text isn't a range, so the caller can treat it
    literally.
    """
    range_lexer = match.BraceRangeLexer(lexer.TokenVal(tok))
    parser = _RangeParser(range_lexer, tok)
    try:
        return parser.Parse()
    except _NotARange as e:
        return None
190
191
class _StackFrame(object):
    """State saved for each unmatched Lit_LBrace while scanning a word."""

    def __init__(self, cur_parts):
        # type: (List[word_part_t]) -> None
        self.cur_parts = cur_parts  # parts accumulated before the '{' (prefix)
        self.alt_part = word_part.BracedTuple([])  # collects the alternatives
        self.saw_comma = False  # {foo} without a comma isn't an alternation
199
200
def BraceDetect(w):
    # type: (CompoundWord) -> Optional[word.BracedTree]
    """Return a new word if the input word looks like a brace expansion.

    e.g. {a,b} or {1..10..2} (TODO)
    Do we want to accept {01..02} ?  zsh does make some attempt to do this too.

    NOTE: This is an iterative algorithm that uses a stack.  The grammar-based
    approach didn't seem natural.

    It's not LL(1) because of 'part*'.  And not LL(k) even?  Maybe it be
    handled with an LR parser?  In any case the imperative algorithm with
    'early return' for a couple cases is fairly simple.

    Grammar:
      # an alternative is a literal, possibly empty, or another brace_expr

      part = <any part except Literal>
      alt = part* | brace_expr

      # a brace_expr is group of at least 2 braced and comma-separated
      # alternatives, with optional prefix and suffix.
      brace_expr = part* '{' alt ',' alt (',' alt)* '}' part*
    """
    # Errors:
    # }a{ - stack depth dips below 0
    # {a,b}{ - Stack depth doesn't end at 0
    # {a} - no comma, and also not an numeric range

    # The shortest possible brace expansion is {,}.  This heuristic prevents a
    # lot of garbage from being created, since otherwise nearly every word
    # would be checked.  We could be even more precise but this is cheap.
    if len(w.parts) < 3:
        return None

    cur_parts = []  # type: List[word_part_t]
    stack = []  # type: List[_StackFrame]

    found = False  # did we see at least one complete {...} group?

    for i, part in enumerate(w.parts):
        # append=True means: copy this part through to cur_parts unchanged
        append = True
        UP_part = part
        if part.tag() == word_part_e.Literal:
            part = cast(Token, UP_part)
            id_ = part.id
            if id_ == Id.Lit_LBrace:
                # Save prefix parts.  Start new parts list.
                new_frame = _StackFrame(cur_parts)
                stack.append(new_frame)
                cur_parts = []  # clear
                append = False
                found = True  # assume found, but can early exit with None later

            elif id_ == Id.Lit_Comma:  # Append a new alternative.
                # NOTE: Should we allow this:
                # ,{a,b}
                # or force this:
                # \,{a,b}
                # ?  We're forcing braces right now but not commas.
                if len(stack):
                    stack[-1].saw_comma = True
                    stack[-1].alt_part.words.append(CompoundWord(cur_parts))
                    cur_parts = []  # clear
                    append = False

            elif id_ == Id.Lit_RBrace:
                if len(stack) == 0:  # e.g. echo {a,b}{ -- unbalanced {
                    return None  # do not expand ANYTHING because of invalid syntax

                # Detect {1..10} and {1..10..2}

                #log('stack[-1]: %s', stack[-1])
                #log('cur_parts: %s', cur_parts)

                range_part = None  # type: Optional[word_part_t]
                # only allow {1..3}, not {a,1..3}
                if not stack[-1].saw_comma and len(cur_parts) == 1:
                    # It must be ONE part.  For example, -1..-100..-2 is
                    # initially lexed as a single Lit_Chars token.
                    part2 = cur_parts[0]
                    if part2.tag() == word_part_e.Literal:
                        tok = cast(Token, part2)
                        if tok.id == Id.Lit_Chars:
                            range_part = _RangePartDetect(tok)
                            if range_part:
                                # Pop the frame: prefix parts + the range
                                frame = stack.pop()
                                cur_parts = frame.cur_parts
                                cur_parts.append(range_part)
                                append = False

                # It doesn't look like a range -- process it as the last
                # element in {a,b,c}

                if not range_part:
                    if not stack[
                            -1].saw_comma:  # {foo} is not a real alternative
                        return None  # early return

                    stack[-1].alt_part.words.append(CompoundWord(cur_parts))

                    # Pop the frame: prefix parts + the completed BracedTuple
                    frame = stack.pop()
                    cur_parts = frame.cur_parts
                    cur_parts.append(frame.alt_part)
                    append = False

        if append:
            cur_parts.append(part)

    # Unbalanced: some '{' was never closed, e.g. {a,b}{
    if len(stack) != 0:
        return None

    if found:
        return word.BracedTree(cur_parts)
    else:
        return None
317
318
def BraceDetectAll(words):
    # type: (List[CompoundWord]) -> List[word_t]
    """Return a new list of words, possibly with BracedTree instances."""
    result = []  # type: List[word_t]
    for w in words:
        tree = BraceDetect(w)
        if tree is not None:
            result.append(tree)  # word is a brace expansion
        else:
            result.append(w)  # pass through unchanged
    return result
330
331
332def _LeadingZeros(s):
333 # type: (str) -> int
334 n = 0
335 for c in s:
336 if c == '0':
337 n += 1
338 else:
339 break
340 return n
341
342
343def _IntToString(i, width):
344 # type: (int, int) -> str
345 s = str(i)
346 n = len(s)
347 if n < width: # width might be 0
348 # pad with zeros
349 pad = '0' * (width - n)
350 return pad + s
351 else:
352 return s
353
354
def _RangeStrings(part):
    # type: (word_part.BracedRange) -> List[str]
    """Evaluate a BracedRange into the list of strings it denotes.

    NOTE(review): termination of the do-while loops below relies on
    _RangeParser having set a nonzero step with the right sign for the
    direction of the range.
    """

    if part.kind == Id.Range_Int:
        nums = []  # type: List[str]

        # {01..10} style: if either endpoint has leading zeros, pad every
        # number to the width of the endpoint with more leading zeros.
        z1 = _LeadingZeros(part.start)
        z2 = _LeadingZeros(part.end)

        if z1 == 0 and z2 == 0:
            width = 0  # no padding
        else:
            if z1 < z2:
                width = len(part.end)
            else:
                width = len(part.start)

        n = int(part.start)
        end = int(part.end)
        step = part.step
        if step > 0:
            # Ascending: append first, then stop once we step past 'end'
            while True:
                nums.append(_IntToString(n, width))
                n += step
                if n > end:
                    break
        else:
            # Descending (step < 0)
            while True:
                nums.append(_IntToString(n, width))
                n += step
                if n < end:
                    break

        return nums

    else:  # Id.Range_Char
        chars = []  # type: List[str]

        # Iterate over code points; ord() requires start/end to be single
        # characters (Range_Char tokens -- TODO confirm against the lexer)
        n = ord(part.start)
        ord_end = ord(part.end)
        step = part.step
        if step > 0:
            while True:
                chars.append(chr(n))
                n += step
                if n > ord_end:
                    break
        else:
            while True:
                chars.append(chr(n))
                n += step
                if n < ord_end:
                    break

        return chars
410
411
def _ExpandPart(
        parts,  # type: List[word_part_t]
        first_alt_index,  # type: int
        suffixes,  # type: List[List[word_part_t]]
):
    # type: (...) -> List[List[word_part_t]]
    """Mutually recursive with _BraceExpand.

    Expands the single BracedTuple or BracedRange at first_alt_index into the
    cross product of (prefix parts) x (each alternative) x (each suffix).

    Args:
      parts: input parts
      first_alt_index: index of the first BracedTuple
      suffixes: List of suffixes to append.
    """
    out = []  # type: List[List[word_part_t]]

    prefix = parts[:first_alt_index]
    expand_part = parts[first_alt_index]

    UP_part = expand_part
    with tagswitch(expand_part) as case:
        if case(word_part_e.BracedTuple):
            expand_part = cast(word_part.BracedTuple, UP_part)
            # Call _BraceExpand on each of the inner words too!
            expanded_alts = []  # type: List[List[word_part_t]]
            for w in expand_part.words:
                expanded_alts.extend(_BraceExpand(w.parts))

            for alt_parts in expanded_alts:
                for suffix in suffixes:
                    # prefix + alternative + suffix
                    out_parts = []  # type: List[word_part_t]
                    out_parts.extend(prefix)
                    out_parts.extend(alt_parts)
                    out_parts.extend(suffix)
                    out.append(out_parts)

        elif case(word_part_e.BracedRange):
            expand_part = cast(word_part.BracedRange, UP_part)
            # Not mutually recursive with _BraceExpand
            strs = _RangeStrings(expand_part)

            # Often prefix and suffixes are empty, but there's not that much to
            # optimize
            # log('prefix %s, suffixes %s, strs %s', prefix, suffixes, strs)

            for s in strs:
                for suffix in suffixes:
                    out_parts_ = []  # type: List[word_part_t]
                    out_parts_.extend(prefix)

                    # TODO: Does it help to preserve location info?
                    # t = Token(Id.Lit_Chars, expand_part.locs[0], s)
                    t = lexer.DummyToken(Id.Lit_Chars, s)

                    out_parts_.append(t)
                    out_parts_.extend(suffix)
                    out.append(out_parts_)

        else:
            # BraceDetect only inserts these two part types
            raise AssertionError()

    return out
473
474
def _BraceExpand(parts):
    # type: (List[word_part_t]) -> List[List[word_part_t]]
    """Mutually recursive with _ExpandPart.

    Returns the list of part lists this word's parts expand to.  A part list
    with no BracedTuple/BracedRange expands to just itself.
    """

    # manual GC point, because brace expansion is a separate stage that does a
    # bunch of computation outside the interpreter
    mylib.MaybeCollect()

    # Find the first alternation; we only need to distinguish 0, 1, and
    # "2 or more" of them.
    num_alts = 0
    first_alt_index = -1
    for i, part in enumerate(parts):
        tag = part.tag()
        if tag in (word_part_e.BracedTuple, word_part_e.BracedRange):
            num_alts += 1
            if num_alts == 1:
                first_alt_index = i
            elif num_alts == 2:
                break  # don't need to count anymore

    # NOTE: There are TWO recursive calls here, not just one -- one for
    # nested {}, and one for adjacent {}.  This is hard to do iteratively.
    if num_alts == 0:
        return [parts]

    elif num_alts == 1:
        # Single alternation: everything after it is the one literal suffix
        suffix = parts[first_alt_index + 1:]
        return _ExpandPart(parts, first_alt_index, [suffix])

    else:
        # Now call it on the tail
        tail_parts = parts[first_alt_index + 1:]
        suffixes = _BraceExpand(tail_parts)  # recursive call
        return _ExpandPart(parts, first_alt_index, suffixes)
508
509
def BraceExpandWords(words):
    # type: (List[word_t]) -> List[CompoundWord]
    """Expand each BracedTree in words into one or more CompoundWords.

    Plain Compound words pass through unchanged.  Tilde detection is run on
    every newly expanded word here, since it couldn't be done earlier.
    """
    out = []  # type: List[CompoundWord]
    for w in words:
        UP_w = w
        with tagswitch(w) as case:
            if case(word_e.BracedTree):
                w = cast(word.BracedTree, UP_w)
                # Note: for the case of {1..100000}, this is a flat list of
                # Token.  Would be nice to optimize, but we don't really know
                # the structure ahead of time
                parts_list = _BraceExpand(w.parts)
                for parts in parts_list:
                    expanded = CompoundWord(parts)

                    # Now do tilde detection on brace-expanded word
                    ti = word_.TildeDetect2(expanded)
                    if ti:
                        out.append(ti)
                    else:
                        out.append(expanded)

            elif case(word_e.Compound):
                w = cast(CompoundWord, UP_w)

                # Already did tilde detection before expansion
                out.append(w)

            else:
                # Only BracedTree and Compound are expected at this stage
                raise AssertionError(w.tag())

    return out