OILS / osh / glob_.py View on Github | oils.pub

642 lines, 337 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7 glob_part, glob_part_e, glob_part_t,
8 loc_t)
9from _devbuild.gen.value_asdl import value
10from core import pyos, pyutil, error
11from frontend import match
12from mycpp import mylib
13from mycpp.mylib import log, print_stderr
14from pylib import os_path
15
16from libc import GLOB_PERIOD
17from _devbuild.gen.value_asdl import value_e
18from _devbuild.gen.runtime_asdl import scope_e
19
20from typing import Dict, List, Tuple, cast, Optional, TYPE_CHECKING
21if TYPE_CHECKING:
22 from core import optview
23 from core import state
24 from frontend.match import SimpleLexer
25
26_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether the string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    A string "looks like a glob" when it has an unescaped * or ?, or an
    unescaped [ that is followed somewhere by an unescaped ].  A backslash
    hides the byte that follows it.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open = False
    pos = 0
    length = len(s)
    while pos < length:
        ch = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(ch, '\\'):
            # The escaped byte can't be a metacharacter, so step over it.
            pos += 1
            continue

        if mylib.ByteEquals(ch, '*'):
            return True
        if mylib.ByteEquals(ch, '?'):
            return True

        if mylib.ByteEquals(ch, '['):
            seen_open = True
        elif seen_open and mylib.ByteEquals(ch, ']'):
            # It has at least one pair of balanced [].  Not bothering to
            # check stray [ or ].
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but inspects the literal tokens of a static word.

    Only word_part_e.Literal parts are considered; every other kind of part
    is skipped.
    """
    seen_open = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_open = True
        elif seen_open and tok_id == Id.Lit_RBracket:
            # A [ earlier in the word, and now a ]
            return True

    return False
82
83
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
GLOB_META_CHARS = r'\*?[]-:!()|'

# Check invariant needed to escape literal \ as \@
# The message is a raw string because \@ is not a valid escape sequence in a
# regular string literal.
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
94
95
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the characters of s that are in GLOB_META_CHARS.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
100
101
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Every backslash in s becomes the two bytes \@.  That encoding is
    unambiguous because @ is not in GLOB_META_CHARS.

    Used to evaluate something like *$v with v='a\*b.txt'

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally
    """
    encoded = s.replace('\\', '\\@')
    return encoded
119
120
# Characters that get a backslash in ExtendedRegexEscape().
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'


def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the characters of s that are in ERE_META_CHARS.

    Quoted parts need to be regex-escaped, e.g. [[ $a =~ "{" ]].  I don't
    think libc has a function to do this.  Escape these characters:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
133
134
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Raises:
      AssertionError: if a backslash is followed by a byte that is neither in
        GLOB_META_CHARS nor '@'
    """
    out = []  # type: List[int]
    last = len(s) - 1
    pos = 0
    while pos <= last:
        ch = mylib.ByteAt(s, pos)
        pos += 1

        # Ordinary bytes are copied through.  So is a backslash that is the
        # very last byte (pos > last), since nothing follows it.
        # TODO: GlobEscape() turns \ into \\, so a string should never end
        # with a single backslash.  A stricter check was relaxed to fix bug
        # #698, #628 is still there.  Check them again.
        if pos > last or not mylib.ByteEquals(ch, '\\'):
            out.append(ch)
            continue

        escaped = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteInSet(escaped, GLOB_META_CHARS):
            out.append(escaped)
        elif mylib.ByteEquals(escaped, '@'):
            # \@ is the encoding of a literal backslash
            out.append(pyos.BACKSLASH_CH)
        else:
            raise AssertionError("Unexpected escaped character %r" % escaped)

    return mylib.JoinBytes(out)
172
173
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash - turns \@ back into a backslash.

    A backslash followed by any other byte is copied through unchanged,
    together with that byte.
    """
    out = []  # type: List[int]
    last = len(s) - 1
    pos = 0
    while pos <= last:
        ch = mylib.ByteAt(s, pos)
        pos += 1

        # Note: GlobEscapeBackslash() doesn't turn \ into \\, so a string
        # could end with a single backslash?  Such a trailing backslash
        # (pos > last) is copied as is.
        if pos > last or not mylib.ByteEquals(ch, '\\'):
            out.append(ch)
            continue

        following = mylib.ByteAt(s, pos)
        pos += 1

        # Either way a backslash is emitted; the @ marker itself is dropped.
        out.append(pyos.BACKSLASH_CH)
        if not mylib.ByteEquals(following, '@'):
            out.append(following)

    return mylib.JoinBytes(out)
200
201
# For ${x//foo*/y}, we need to match glob patterns, but fnmatch doesn't give
# you the positions of matches.  So we convert globs to regexps.
204
205# Problems:
206# - What about unicode? Do we have to set any global variables? We want it to
207# always use utf-8?
208
209
class _GlobParser(object):
    """Turns the tokens of a glob lexer into a list of glob_part_t.

    Questionable syntax is never fatal: a message is added to self.warnings,
    and the tokens involved are kept as literals.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # The current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Move to the next token."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse the rest of a character class; the current token is its [.

        Returns:
          a CharClass if the parse succeeds, or a list of Literals if it
          fails.  In the latter case, we also append a warning.
        """
        opener = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while True:
            self._Next()
            tok_type = self.token_type

            if tok_type == Id.Eol_Tok:
                # The class was never closed, so give back everything we
                # consumed as literals.
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opener]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_type == Id.Glob_LBracket:
                depth += 1
            elif tok_type == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # Don't append the last ]

            inner.append((tok_type, self.token_val))

        negated = False
        if len(inner):
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = [s for _, s in inner]
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Consume all tokens from the lexer.

        Returns:
          The list of parsed glob parts
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        self._Next()
        while self.token_type != Id.Eol_Tok:
            id_ = self.token_type
            s = self.token_val
            #log('%s %r', self.token_type, self.token_val)

            if id_ == Id.Glob_Star or id_ == Id.Glob_QMark:
                parts.append(glob_part.Operator(id_))

            elif id_ == Id.Glob_LBracket:
                # Could return Literals or a CharClass
                parts.extend(self._ParseCharClass())

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                # EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(id_, s))

                # Also check for warnings.  TODO: location info.
                if id_ == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                elif id_ == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

            self._Next()

        return parts, self.warnings
311
312
313_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
314
315
316def _GenerateERE(parts):
317 # type: (List[glob_part_t]) -> str
318 out = [] # type: List[str]
319
320 for part in parts:
321 tag = part.tag()
322 UP_part = part
323
324 if tag == glob_part_e.Literal:
325 part = cast(glob_part.Literal, UP_part)
326 if part.id == Id.Glob_EscapedChar:
327 assert len(part.s) == 2, part.s
328 # The user could have escaped a char that doesn't need regex escaping,
329 # like \b or something.
330 c = part.s[1]
331 if c in _REGEX_CHARS_TO_ESCAPE:
332 out.append('\\')
333 out.append(c)
334
335 # ! is only for char class
336 elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
337 out.append(part.s) # e.g. 'py' doesn't need to be escaped
338
339 # ^ is only for char class
340 elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
341 assert len(part.s) == 1, part.s
342 c = part.s
343 if c in _REGEX_CHARS_TO_ESCAPE:
344 out.append('\\')
345 out.append(c)
346
347 # These are UNMATCHED ones not parsed in a glob class
348 elif part.id == Id.Glob_LBracket:
349 out.append('\\[')
350
351 elif part.id == Id.Glob_RBracket:
352 out.append('\\]')
353
354 elif part.id == Id.Glob_BadBackslash:
355 out.append('\\\\')
356
357 elif part.id == Id.Glob_Caret:
358 out.append('^')
359
360 else:
361 raise AssertionError(part.id)
362
363 elif tag == glob_part_e.Operator:
364 part = cast(glob_part.Operator, UP_part)
365 if part.op_id == Id.Glob_QMark:
366 out.append('.')
367 elif part.op_id == Id.Glob_Star:
368 out.append('.*')
369 else:
370 raise AssertionError()
371
372 elif tag == glob_part_e.CharClass:
373 part = cast(glob_part.CharClass, UP_part)
374 out.append('[')
375 if part.negated:
376 out.append('^')
377
378 # Important: the character class is LITERALLY preserved, because we
379 # assume glob char classes are EXACTLY the same as regex char classes,
380 # including the escaping rules.
381 #
382 # TWO WEIRD EXCEPTIONS:
383 # \- is moved to the end as '-'.
384 # In GNU libc, [0\-9] ODDLY has a range starting with \ ! But we
385 # want a literal, and the POSIX way to do that is to put it at the end.
386 # \] is moved to the FRONT as ]
387
388 good = [] # type: List[str]
389
390 literal_hyphen = False
391 literal_rbracket = False
392
393 for s in part.strs:
394 if s == '\-':
395 literal_hyphen = True
396 continue
397 if s == '\]':
398 literal_rbracket = True
399 continue
400 good.append(s)
401
402 if literal_rbracket:
403 out.append(']')
404
405 out.extend(good)
406
407 if literal_hyphen:
408 out.append('-')
409
410 out.append(']')
411
412 return ''.join(out)
413
414
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Translate a glob pattern into an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      The regex string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: if there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would
    # have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
437
438
439# Notes for implementing extglob
440# - libc glob() doesn't have any extension!
441# - Nix stdenv uses !(foo) and @(foo|bar)
442# - can we special case these for now?
443# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
444# result!
445# - Actually I guess we can do that for all of them. That seems fine.
446# - But we have to get the statically parsed arg in here?
447# - or do dynamic parsing
448# - LooksLikeGlob() would have to respect extglob! ugh!
449# - See 2 calls in osh/word_eval.py
450
451
class Globber(object):
    """Expands glob patterns with libc.glob(), honoring shell options.

    The options dotglob, no_dash_glob, noglob, failglob and nullglob are read
    from exec_opts, and the GLOBIGNORE variable is read from mem.
    """

    def __init__(self, exec_opts, mem):
        # type: (optview.Exec, state.Mem) -> None
        self.exec_opts = exec_opts
        self.mem = mem
        # Maps a GLOBIGNORE value to its list of patterns, to avoid
        # re-parsing it
        self._globignore_cache = {}  # type: Dict[str, List[str]]

        # Other unimplemented bash options:
        #
        # globstar            ** for directories
        # globasciiranges     ascii or unicode char classes (unicode by
        #                     default)
        # nocaseglob
        # extglob             the @() !() syntax -- libc helps us with
        #                     fnmatch(), but not glob().

    def _GetGlobIgnorePatterns(self):
        # type: () -> Optional[List[str]]
        """Get GLOBIGNORE patterns as a list, or None if not set.

        None is also returned when GLOBIGNORE isn't a string, or is empty.
        """
        val = self.mem.GetValue('GLOBIGNORE', scope_e.GlobalOnly)
        if val.tag() != value_e.Str:
            return None

        globignore = cast(value.Str, val).s  # type: str
        if len(globignore) == 0:
            return None

        cached = self._globignore_cache.get(globignore)
        if cached is not None:
            return cached

        # Split by colon to get individual patterns, but don't split colons
        # inside bracket expressions like [[:alnum:]]
        patterns = []  # type: List[str]
        buf = []  # type: List[str]
        in_bracket = False

        for ch in globignore:
            if ch == ':' and not in_bracket:
                # Separator: flush the pending pattern.  Empty ones are
                # dropped.
                if len(buf):
                    patterns.append(''.join(buf))
                    del buf[:]
                continue

            if ch == '[':
                in_bracket = True
            elif ch == ']':
                in_bracket = False
            buf.append(ch)

        if len(buf):
            patterns.append(''.join(buf))

        self._globignore_cache[globignore] = patterns
        return patterns

    def _MatchesGlobIgnore(self, filename, patterns):
        # type: (str, List[str]) -> bool
        """Check if filename matches any GLOBIGNORE pattern.

        Filenames . and .. are always ignored when GLOBIGNORE is set.
        """
        basename = os_path.basename(filename)
        if basename == '.' or basename == '..':
            return True

        flags = 0
        for pattern in patterns:
            # Patterns are matched against the whole name, not the basename
            if libc.fnmatch(pattern, filename, flags):
                return True

        return False

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg, filter the results, and append them to out.

        Returns:
          The number of items appended to out, which may be 0.
        Raises:
          RuntimeError: re-raised from libc.glob(), after printing a message
            to stderr
        """
        globignore_patterns = self._GetGlobIgnorePatterns()

        flags = 0
        # GLOBIGNORE enables dotglob when set to a non-null value
        if self.exec_opts.dotglob() or globignore_patterns is not None:
            # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
            # GLOB_PERIOD as 0, a no-op
            flags |= GLOB_PERIOD

        try:
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, g)

        if len(results) == 0:
            return 0  # Nothing matched

        # Omit files starting with -
        # no_dash_glob is part of shopt --set ysh:upgrade
        if self.exec_opts.no_dash_glob():
            no_dash = []  # type: List[str]
            for s in results:
                if not s.startswith('-'):
                    no_dash.append(s)
            results = no_dash

        kept = []  # type: List[str]
        if globignore_patterns is not None:
            for s in results:
                if not self._MatchesGlobIgnore(s, globignore_patterns):
                    kept.append(s)
        else:
            # XXX: libc's glob function can return '.' and '..', which are
            # typically not of interest.  Filtering in this manner is
            # similar (but not identical) to the default bash setting of
            # 'shopt -s globskipdots'.  Supporting that option fully would
            # require more than simply wrapping this in an if statement.
            for s in results:
                if s not in ('.', '..'):
                    kept.append(s)

        out.extend(kept)
        return len(kept)

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended, or -1 when glob expansion did not happen.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        n = self._Glob(arg, out)
        if n != 0:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0
        else:
            # The caller should use the original string
            return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand glob_pat, keeping only the results that match fnmatch_pat.

        Returns:
          Number of items appended to out, or -1 when nothing matched and
          shopt -s failglob.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self._Glob(glob_pat, candidates)
        filtered = [s for s in candidates if libc.fnmatch(fnmatch_pat, s)]

        n = len(filtered)
        if n != 0:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # Neither option is set, so the word evaluates to the pattern itself,
        # without the glob escaping.
        out.append(GlobUnescape(fnmatch_pat))
        return 1