OILS / osh / glob_.py View on Github | oils.pub

677 lines, 335 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7 glob_part, glob_part_e, glob_part_t,
8 loc, loc_t)
9from _devbuild.gen.value_asdl import value
10from core import pyos, pyutil, error
11from frontend import match
12from mycpp import mylib
13from mycpp.mylib import log
14from pylib import os_path
15
16from libc import GLOB_PERIOD
17from _devbuild.gen.value_asdl import value_e
18from _devbuild.gen.runtime_asdl import scope_e
19
20from typing import Dict, List, Tuple, cast, Optional, TYPE_CHECKING
21if TYPE_CHECKING:
22 from core import optview
23 from core import state
24 from frontend.match import SimpleLexer
25
26_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    A string "looks like a glob" when it has an unescaped * or ?, or an
    unescaped ] that comes after an unescaped [.  A backslash hides the byte
    that follows it from this check.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open_bracket = False
    length = len(s)
    pos = 0
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1  # 'pos' now points at the byte AFTER the one being examined

        if mylib.ByteEquals(byte, '\\'):
            # Escaped byte: step over it so it is never inspected.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open_bracket = True
            continue

        if seen_open_bracket and mylib.ByteEquals(byte, ']'):
            # It has at least one pair of balanced [].  Not bothering to
            # check stray [ or ].
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for static words.

    Only Literal word parts are inspected: a Lit_Star or Lit_QMark token, or a
    Lit_RBracket that follows a Lit_LBracket, makes the word look like a glob.
    """
    seen_open_bracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue  # non-literal parts never contribute glob operators

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_open_bracket = True
        elif tok_id == Id.Lit_RBracket and seen_open_bracket:
            return True

    return False
82
83
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
GLOB_META_CHARS = r'\*?[]-:!()|'

# Check invariant needed to escape literal \ as \@
# The message is a raw string because \@ is not a valid escape sequence in a
# regular string literal.
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
94
95
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape every glob metacharacter (GLOB_META_CHARS) in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
100
101
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Used to evaluate something like *$v with v='a\*b.txt'

    We escape \ as \@, which is OK because @ is not in GLOB_META_CHARS.

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally

    Args:
      s: the string to escape

    Returns:
      s with every backslash replaced by the two characters \@
    """
    # NOTE: the docstring is raw because \* and \@ are not valid escape
    # sequences in a regular string literal.
    return s.replace('\\', r'\@')
119
120
# Characters that ExtendedRegexEscape() backslash-escapes.
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
123
124
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters (ERE_META_CHARS) in s.

    Quoted parts need to be regex-escaped when quoted, e.g. [[ $a =~ "{" ]].
    I don't think libc has a function to do this.  Escape these characters:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
133
134
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Rules applied byte by byte:
    - backslash + a byte in GLOB_META_CHARS  ->  that byte
    - backslash + '@'                        ->  a backslash
    - a backslash that is the LAST byte      ->  kept literally
    - any other byte                         ->  kept literally

    Raises:
      AssertionError: if a backslash is followed by any other byte.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # TODO: GlobEscape() turns \ into \\, so a string should never end
        # with a single backslash.  The 'i != n - 1' guard tolerates it anyway
        # (it was added to fix bug #698; #628 is still there -- check them
        # again).  A trailing backslash therefore falls through to the
        # literal branch below instead of failing.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            elif mylib.ByteEquals(c2, '@'):
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
172
173
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash: turns \@ back into a single backslash.

    Rules applied byte by byte:
    - backslash + '@'                    ->  a backslash
    - backslash + any other byte         ->  both bytes kept
    - a backslash that is the LAST byte  ->  kept literally
    - any other byte                     ->  kept literally
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # Note: GlobEscapeBackslash() doesn't turn \ into \\, so a string
        # could end with a single backslash?  The 'i != n - 1' guard sends
        # that case to the literal branch below.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            # The backslash is emitted either way; '@' is the only byte that
            # gets swallowed.
            unescaped.append(pyos.BACKSLASH_CH)
            if not mylib.ByteEquals(c2, '@'):
                unescaped.append(c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
200
201
202# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
203# positions of matches. So we convert globs to regexps.
204
205# Problems:
206# - What about unicode? Do we have to set any global variables? We want it to
207# always use utf-8?
208
209
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob_part_t.

    Syntax problems are not fatal; they are collected in self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_id, tok_val = self.lexer.Next()
        self.token_type = tok_id
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a bracket expression; the current token is the opening [.

        Returns:
          A one-element list holding a CharClass if the parse succeeds.  If
          the input ends before the brackets balance, the [ and everything
          after it come back as Literal parts, and a warning is recorded.
        """
        open_bracket = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [open_bracket]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # Don't append the last ]

            inner.append((tok_id, self.token_val))

        negated = False
        if len(inner):
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id in (Id.Glob_Bang, Id.Glob_Caret):
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for _, s in inner:
            strs.append(s)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Consume every token up to Eol_Tok.

        Returns:
          The list of parsed glob parts
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            id_ = self.token_type
            s = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if id_ == Id.Eol_Tok:
                break

            if id_ in (Id.Glob_Star, Id.Glob_QMark):
                parts.append(glob_part.Operator(id_))
                continue

            if id_ == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())
                continue

            # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,EscapedChar,
            # BadBackslash}
            parts.append(glob_part.Literal(id_, s))

            # Also check for warnings.  TODO: location info.
            if id_ == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif id_ == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
311
312
# Literal glob characters that _GenerateERE() prefixes with a backslash when
# emitting them into the regex.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
314
315
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Translate parsed glob parts into an extended regular expression.

    Args:
      parts: output of _GlobParser.Parse()

    Returns:
      The ERE source string.  Literals are regex-escaped where needed, ? and *
      become . and .*, and character classes are copied through (see below).

    Raises:
      AssertionError: on a Literal id or Operator id this function does not
        know about.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it is a literal, and
            # since ^ is in _REGEX_CHARS_TO_ESCAPE it is emitted as \^ here.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
413
414
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      The ERE source string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # Possible optimization, not implemented: if there is nothing like * ? or
    # [abc], then the whole string is a literal, and we could use a more
    # efficient mechanism.  But we would have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
437
438
439# Notes for implementing extglob
440# - libc glob() doesn't have any extension!
441# - Nix stdenv uses !(foo) and @(foo|bar)
442# - can we special case these for now?
443# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
444# result!
445# - Actually I guess we can do that for all of them. That seems fine.
446# - But we have to get the statically parsed arg in here?
447# - or do dynamic parsing
448# - LooksLikeGlob() would have to respect extglob! ugh!
449# - See 2 calls in osh/word_eval.py
450
451
def _StringMatchesAnyPattern(s, patterns):
    # type: (str, List[str]) -> bool
    """Check if string matches any pattern in the list.

    Each pattern is tried in order with libc.fnmatch() and flags 0.

    Returns:
      True as soon as one pattern matches s; False if none does, or if the
      list is empty.
    """
    for pat in patterns:
        matched = libc.fnmatch(pat, s, 0)
        if matched:
            return True

    return False
465
466
class Globber(object):
    """Performs glob expansion, honoring shell options and GLOBIGNORE."""

    def __init__(self, exec_opts, mem):
        # type: (optview.Exec, state.Mem) -> None
        self.exec_opts = exec_opts
        self.mem = mem
        # Cache for parsed GLOBIGNORE patterns to avoid re-parsing.
        # Keyed by the raw GLOBIGNORE string; entries are never evicted.
        self._globignore_cache = {}  # type: Dict[str, List[str]]

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with fnmatch(),
        #                    but not glob().

    def _GetGlobIgnorePatterns(self):
        # type: () -> Optional[List[str]]
        """Get GLOBIGNORE patterns as a list, or None if not set.

        Returns:
          None when the global GLOBIGNORE is not a string or is empty.
          Otherwise its colon-separated patterns; empty segments are dropped.
        """
        val = self.mem.GetValue('GLOBIGNORE', scope_e.GlobalOnly)
        if val.tag() != value_e.Str:
            return None

        globignore = cast(value.Str, val).s  # type: str
        if len(globignore) == 0:
            return None

        if globignore in self._globignore_cache:
            return self._globignore_cache[globignore]

        # Split by colon to get individual patterns, but don't split colons
        # inside bracket expressions like [[:alnum:]]
        patterns = []  # type: List[str]
        current = []  # type: List[str]
        in_bracket = False

        for c in globignore:
            if c == '[':
                in_bracket = True
                current.append(c)
            elif c == ']':
                in_bracket = False
                current.append(c)
            elif c == ':' and not in_bracket:
                if len(current):
                    patterns.append(''.join(current))
                    del current[:]
            else:
                current.append(c)

        if len(current):
            patterns.append(''.join(current))

        self._globignore_cache[globignore] = patterns

        return patterns

    def DoLibcGlob(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> None
        """For the io.libcGlob() API

        Calls libc.glob() with no flags and no filtering, and appends the
        results to 'out'.

        Raises:
          error.Structured: when libc.glob() raises RuntimeError
        """
        try:
            results = libc.glob(arg, 0)
        except RuntimeError as e:
            # Rare glob errors, like GLOB_NOSPACE
            # Note: dash has a fatal sh_error() on GLOB_NOSPACE

            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            raise error.Structured(error.CODEC_STATUS, msg, blame_loc)

        out.extend(results)

    def DoShellGlob(self, arg, out, blame_loc=loc.Missing):
        # type: (str, List[str], loc_t) -> int
        """For word evaluation and the io.glob() API

        Respects these filters:
        - GLOBIGNORE
        - dotglob turns into C GLOB_PERIOD
        - no_dash_glob
        - globskipdots

        But NOT these; they are done at a higher level
        - noglob
        - failglob
        - nullglob - ditto

        TODO:
        - ysh globbing should not respect globals like GLOBIGNORE?
          - only no_dash_glob by default?
        - split into two functions:
          - compatible io.glob()
          - controlled io.libcGlob()

        Returns:
          The number of items appended to 'out' (0 when nothing matched, or
          when everything was filtered out).

        Raises:
          error.Structured: when libc.glob() raises RuntimeError
        """
        globignore_patterns = self._GetGlobIgnorePatterns()

        flags = 0
        # shopt -u dotglob (default): echo * does not return say .gitignore
        # If GLOBIGNORE is set, then dotglob is NOT respected: dotfiles are
        # always requested from libc, and filtered below.
        if self.exec_opts.dotglob() or globignore_patterns is not None:
            # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
            # GLOB_PERIOD as 0, a no-op
            flags |= GLOB_PERIOD

        try:
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # Rare glob errors, like GLOB_NOSPACE
            # Note: dash has a fatal sh_error() on GLOB_NOSPACE

            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            raise error.Structured(error.CODEC_STATUS, msg, blame_loc)
        #log('glob %r -> %r', arg, g)

        if len(results) == 0:
            return 0  # nothing matched

        # Something matched

        if globignore_patterns is not None:  # Handle GLOBIGNORE
            # When GLOBIGNORE is set, bash doesn't respect shopt -u
            # globskipdots!  The entries . and .. are skipped, even if they
            # do NOT match GLOBIGNORE
            tmp = [
                s for s in results
                if not _StringMatchesAnyPattern(s, globignore_patterns) and
                os_path.basename(s) not in ('.', '..')
            ]
            results = tmp  # idiom to work around mycpp limitation

        else:  # Do filtering that's NOT GLOBIGNORE
            # no_dash_glob: Omit files starting with -
            # (part of shopt --set ysh:upgrade)
            if self.exec_opts.no_dash_glob():
                tmp = [s for s in results if not s.startswith('-')]
                results = tmp

            # globskipdots: Remove . and .. entries returned by libc.
            if self.exec_opts.globskipdots():
                tmp = [s for s in results if s not in ('.', '..')]
                results = tmp

        out.extend(results)
        return len(results)

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended, or -1 when glob expansion did not happen.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
          error.Structured when libc.glob() itself fails
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        # Pass blame_loc through so that glob() failures are reported at the
        # same location as failglob errors.
        n = self.DoShellGlob(arg, out, blame_loc)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, filter by fnmatch_pat.

        Returns:
          The number of items appended to 'out', or -1 when nothing matched
          and shopt -s failglob.

          With noglob, or when nothing matched and neither failglob nor
          nullglob is set, the pattern itself is appended and 1 is returned.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self.DoShellGlob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # Expand to fnmatch_pat, as above
        out.append(GlobUnescape(fnmatch_pat))
        return 1