OILS / osh / glob_.py View on Github | oils.pub

652 lines, 330 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7 glob_part, glob_part_e, glob_part_t,
8 loc_t)
9from _devbuild.gen.value_asdl import value
10from core import pyos, pyutil, error
11from frontend import match
12from mycpp import mylib
13from mycpp.mylib import log, print_stderr
14from pylib import os_path
15
16from libc import GLOB_PERIOD
17from _devbuild.gen.value_asdl import value_e
18from _devbuild.gen.runtime_asdl import scope_e
19
20from typing import Dict, List, Tuple, cast, Optional, TYPE_CHECKING
21if TYPE_CHECKING:
22 from core import optview
23 from core import state
24 from frontend.match import SimpleLexer
25
26_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether s contains unescaped glob metacharacters.

    Like other shells, OSH only calls glob() when the string could actually
    be a pattern.  A string counts as a glob if it has an unescaped * or ?,
    or an unescaped ] that comes after an unescaped [.

    TODO: Reference lib/glob / glob_pattern functions in bash
      $ grep glob_pattern lib/glob/*

    Callers:
      1. Globber below
      2. the slow path / fast path choice of prefix/suffix/patsub ops.
    """
    seen_open = False  # have we passed an unescaped '[' yet?
    pos = 0
    end = len(s)
    while pos < end:
        b = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(b, '\\'):
            # The byte after a backslash is escaped; step over it unexamined.
            pos += 1
            continue

        if mylib.ByteEquals(b, '*'):
            return True
        if mylib.ByteEquals(b, '?'):
            return True

        if mylib.ByteEquals(b, '['):
            seen_open = True
        elif seen_open and mylib.ByteEquals(b, ']'):
            # At least one [ ... ] pair.  Stray [ or ] are not checked.
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Static-word counterpart of LooksLikeGlob.

    Only Literal parts of the word are inspected.  Returns True when one of
    them is a * or ? token, or a ] token that follows a [ token.
    """
    saw_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue  # non-literal parts never count as glob operators

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            saw_lbracket = True
        elif saw_lbracket and tok_id == Id.Lit_RBracket:
            return True

    return False
82
83
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# We don't need to escape the @ in @(cc), because escaping ( is enough.
GLOB_META_CHARS = r'\*?[]-:!()|'

# Invariant: a literal backslash is escaped as \@ (see GlobEscapeBackslash),
# which is only unambiguous while @ itself is not a glob metacharacter.
# The message is a raw string so that \@ is not an invalid escape sequence.
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
94
95
def GlobEscape(s):
    # type: (str) -> str
    """Escape s for use inside a glob pattern.

    Used for SingleQuoted, DoubleQuoted, and EscapedLiteral parts.  Delegates
    to pyutil.BackslashEscape with GLOB_META_CHARS as the set of characters
    to escape.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
100
101
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Rewrite every backslash in s as \@, for an unquoted var sub.

    Needed to evaluate something like *$v with v='a\*b.txt'.

    \@ is a safe spelling for a literal backslash because @ is not in
    GLOB_META_CHARS.

    Expected results (test cases are in spec/glob.test.sh):

    - If globbing is performed, \* evaluates to a literal '*'
      - that is, \ acts as an escape for the *
    - If globbing is NOT performed (set -o noglob, or no matching files),
      \* evaluates to '\*'
      - that is, the \ is preserved literally
    """
    escaped = s.replace('\\', r'\@')
    return escaped
119
120
# Characters that get a backslash in front of them by ExtendedRegexEscape.
# Bug fix: [ and ] are included so that [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'


def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Escape s so it can be embedded literally in an extended regex.

    Quoted word parts must be regex-escaped, e.g. [[ $a =~ "{" ]].  libc does
    not appear to offer a function for this, so the characters listed in
    ERE_META_CHARS are handed to pyutil.BackslashEscape.  Reference:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
133
134
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: a glob-escaped string

    Returns:
      s with each backslash escape replaced by the byte it stands for:
      a backslash followed by a byte from GLOB_META_CHARS yields that byte,
      and \@ yields a backslash.

    Raises:
      AssertionError: if a backslash is followed by any other byte.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A backslash in the LAST position has nothing to escape, so it falls
        # through to the else branch and is copied verbatim.  An assert used
        # to reject that case; it was suppressed by the 'i != n - 1' test to
        # fix bug #698 (#628 was noted as still open), which made the assert
        # unreachable, so it has been removed.
        # TODO: GlobEscape() turns \ into \\, so a string should never end
        # with a single backslash.  Check those bugs again.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            elif mylib.ByteEquals(c2, '@'):
                # \@ is how a literal backslash is spelled
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
172
173
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash: turn each \@ back into a backslash.

    Args:
      s: a string in which literal backslashes were written as \@

    Returns:
      s with every \@ pair collapsed to a single backslash.  A backslash
      followed by any other byte is kept together with that byte, and a
      backslash in the last position is kept as is.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # Note: GlobEscapeBackslash() doesn't turn \ into \\, so a string
        # could end with a single backslash.  The 'i != n - 1' test sends that
        # case to the else branch; the assert that used to follow could never
        # fire and has been removed.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            # The backslash is emitted either way; only the @ marker is
            # dropped.
            unescaped.append(pyos.BACKSLASH_CH)
            if not mylib.ByteEquals(c2, '@'):
                unescaped.append(c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
200
201
202# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
203# positions of matches. So we convert globs to regexps.
204
205# Problems:
206# - What about unicode? Do we have to set any global variables? We want it to
207# always use utf-8?
208
209
class _GlobParser(object):
    """Turns the token stream of a glob pattern into a list of glob_part_t.

    Syntax problems never abort the parse; they are recorded as strings in
    self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token, refreshed by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse what follows a [ token.

        Returns:
          A one-element list holding a CharClass when a balancing ] is found.
          Otherwise a list of Literal parts (the [ and everything after it),
          and a warning is appended.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # the [ that got us here
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] have to be taken into account too.
        # This is punted on for now because the rule isn't clear and
        # consistent between shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # Ran out of input before the class was closed.
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # the closing ] itself is not recorded

            inner.append((tok_id, self.token_val))

        negated = False
        if len(inner):
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id in (Id.Glob_Bang, Id.Glob_Caret):
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for _, text in inner:
            strs.append(text)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole pattern.

        Returns:
          A tuple of
          - the list of parsed glob parts
          - the list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            id_ = self.token_type

            #log('%s %r', self.token_type, self.token_val)
            if id_ == Id.Eol_Tok:
                break

            if id_ == Id.Glob_LBracket:
                # Could yield a CharClass or a run of Literals
                parts.extend(self._ParseCharClass())

            elif id_ in (Id.Glob_Star, Id.Glob_QMark):
                parts.append(glob_part.Operator(id_))

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                #       EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(id_, self.token_val))

            # Some literals also deserve a warning.  TODO: location info.
            if id_ == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif id_ == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
311
312
# Characters that must be backslash-escaped when they appear as literals in
# the generated regex.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Render parsed glob parts as an extended regular expression string.

    Args:
      parts: glob parts, as produced by _GlobParser.Parse()

    Returns:
      The regex text: literals are escaped where needed, ? becomes '.',
      * becomes '.*', and character classes are copied through with two
      adjustments described below.

    Raises:
      AssertionError: on a literal or operator id that is not handled here.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only special inside a char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only special inside a char class.  Out here it is a
            # literal, and it gets escaped because it is in
            # _REGEX_CHARS_TO_ESCAPE.  (A later, separate Glob_Caret branch
            # that emitted a bare '^' could never be reached and was removed.)
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
413
414
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      A tuple of
      - the regex string generated from the parsed pattern
      - the list of warnings collected while parsing
    """
    lexer = match.GlobLexer(pat)
    parser = _GlobParser(lexer)
    parts, warnings = parser.Parse()

    # Possible optimization, not implemented: if there is nothing like * ? or
    # [abc], then the whole string is a literal, and we could use a more
    # efficient mechanism.  But we would have to DEQUOTE before doing that.
    # (The disabled 'if 0:' sketches of this were removed: they rebound the
    # parser variable and compared the bound method part.tag instead of
    # calling it.)

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
437
438
439# Notes for implementing extglob
440# - libc glob() doesn't have any extension!
441# - Nix stdenv uses !(foo) and @(foo|bar)
442# - can we special case these for now?
443# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
444# result!
445# - Actually I guess we can do that for all of them. That seems fine.
446# - But we have to get the statically parsed arg in here?
447# - or do dynamic parsing
448# - LooksLikeGlob() would have to respect extglob! ugh!
449# - See 2 calls in osh/word_eval.py
450
451
def _StringMatchesAnyPattern(s, patterns):
    # type: (str, List[str]) -> bool
    """Decide whether s should be dropped under the given ignore patterns.

    Returns True when the basename of s is . or .. (those are always
    filtered when GLOBIGNORE is set), or when libc.fnmatch() reports that the
    full string s matches at least one of the patterns.
    """
    name = os_path.basename(s)
    if name == '.' or name == '..':
        return True

    for pat in patterns:
        if libc.fnmatch(pat, s, 0):  # 0: no fnmatch flags
            return True

    return False
469
470
class Globber(object):
    """Expands glob patterns against the file system.

    Shell options are read from exec_opts, and GLOBIGNORE is read from mem.
    """

    def __init__(self, exec_opts, mem):
        # type: (optview.Exec, state.Mem) -> None
        self.exec_opts = exec_opts
        self.mem = mem
        # Raw GLOBIGNORE value -> its split pattern list, so the same value
        # is not split again on every glob.
        self._globignore_cache = {}  # type: Dict[str, List[str]]

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with
        #                    fnmatch(), but not glob().

    def _GetGlobIgnorePatterns(self):
        # type: () -> Optional[List[str]]
        """Return the GLOBIGNORE patterns as a list.

        Returns None when GLOBIGNORE is not a string or is empty.
        """
        val = self.mem.GetValue('GLOBIGNORE', scope_e.GlobalOnly)
        if val.tag() != value_e.Str:
            return None

        raw = cast(value.Str, val).s  # type: str
        if len(raw) == 0:
            return None

        cached = self._globignore_cache.get(raw)
        if cached is not None:
            return cached

        # Patterns are separated by colons, except that a colon inside a
        # bracket expression like [[:alnum:]] belongs to the pattern.
        patterns = []  # type: List[str]
        buf = []  # type: List[str]
        inside = False  # between a [ and the next ]

        for ch in raw:
            if ch == ':' and not inside:
                # Separator: finish the pending pattern, skipping empty ones
                if len(buf):
                    patterns.append(''.join(buf))
                    del buf[:]
                continue

            if ch == '[':
                inside = True
            elif ch == ']':
                inside = False
            buf.append(ch)

        if len(buf):
            patterns.append(''.join(buf))

        self._globignore_cache[raw] = patterns
        return patterns

    def DoGlob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg and append the surviving matches to out.

        Respects:
        - GLOBIGNORE
        - dotglob
        - no_dash_glob

        But NOT
        - noglob - done at the word level
        - nullglob - ditto

        Returns:
          The number of strings appended to out.

        TODO:
        - ysh globbing should not respect globals like GLOBIGNORE?
          - only no_dash_glob by default?
        - split into pure io.glob() and legacyGlob() function?
          - this respects GLOBIGNORE
        """
        ignore_pats = self._GetGlobIgnorePatterns()

        flags = 0
        # shopt -u dotglob (default): echo * does not return say .gitignore
        # If GLOBIGNORE is set, then dotglob is NOT respected - we return ..
        if self.exec_opts.dotglob() or ignore_pats is not None:
            # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
            # GLOB_PERIOD as 0, a no-op
            flags |= GLOB_PERIOD

        try:
            matches = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, g)

        if len(matches) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # no_dash_glob is part of shopt --set ysh:upgrade
        if self.exec_opts.no_dash_glob():
            kept = [s for s in matches if not s.startswith('-')]
            matches = kept  # idiom to work around mycpp limitation

        if ignore_pats is None:
            # Remove . and .. entries returned by libc.
            # This is 'shopt -s globskipdots'.  TODO: support it fully?
            kept = [s for s in matches if s not in ('.', '..')]
            matches = kept  # idiom to work around mycpp limitation
        else:
            # Handle GLOBIGNORE
            kept = [
                s for s in matches
                if not _StringMatchesAnyPattern(s, ignore_pats)
            ]
            matches = kept  # idiom to work around mycpp limitation

        out.extend(matches)
        return len(matches)

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Perform glob expansion on a string that MAY be a glob.

        Matching file names are appended to the list 'out'.

        Returns:
          Number of items appended, or -1 when glob expansion did not happen
          (the caller should then use the original string).
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        num_matched = self.DoGlob(arg, out)
        if num_matched != 0:
            return num_matched

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob.

        Candidates come from DoGlob(glob_pat) and are then filtered with
        libc.fnmatch(fnmatch_pat, ...).

        Returns:
          Number of items appended to out, or -1 when nothing matched and
          failglob is set.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self.DoGlob(glob_pat, candidates)
        matched = [s for s in candidates if libc.fnmatch(fnmatch_pat, s)]
        num_matched = len(matched)

        if num_matched != 0:
            out.extend(matched)
            return num_matched

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # No match and no special option: as in the noglob case, the pattern
        # itself is the result, here with its glob escaping removed.
        out.append(GlobUnescape(fnmatch_pat))
        return 1