OILS / osh / glob_.py View on Github | oils.pub

565 lines, 281 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7 glob_part, glob_part_e, glob_part_t,
8 loc_t)
9from core import pyos, pyutil, error
10from frontend import match
11from mycpp import mylib
12from mycpp.mylib import log, print_stderr
13
14from libc import GLOB_PERIOD
15
16from typing import List, Tuple, cast, TYPE_CHECKING
17if TYPE_CHECKING:
18 from core import optview
19 from frontend.match import SimpleLexer
20
21_ = log
22
23
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains unescaped glob metacharacters.

    Returns True for an unescaped * or ?, or for a ] that appears after an
    unescaped [.  A byte preceded by a backslash is skipped and never counts.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Per the original notes, this is used for glob expansion and to choose
    between the slow path and fast path of prefix/suffix/patsub ops.
    """
    seen_lbracket = False
    length = len(s)
    pos = 0
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # Skip over the escaped byte as well.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_lbracket = True
            continue

        if seen_lbracket and mylib.ByteEquals(byte, ']'):
            # At least one [ was followed by a ].  Stray [ or ] are not
            # checked any further.
            return True

    return False
60
61
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for static words.

    Only Literal word parts are inspected.  Returns True for a Lit_Star or
    Lit_QMark token, or for a Lit_RBracket seen after a Lit_LBracket.
    """
    seen_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_lbracket = True
        elif tok_id == Id.Lit_RBracket and seen_lbracket:
            return True

    return False
77
78
# Glob helpers for word parts.
#
# GLOB_META_CHARS is the set of bytes that GlobEscape() backslash-escapes.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# - ! : - are metachars within character classes
# - ( ) | are extended glob characters, and it's OK to add extra \ when the
#   underlying library doesn't support extended globs
# - we don't need to escape the @ in @(cc), because escaping ( is enough
GLOB_META_CHARS = r'\*?[]-:!()|'

# Invariant needed by GlobEscapeBackslash(), which encodes a literal \ as \@:
# @ must never be a glob metacharacter.  The message is a raw string so that
# \@ is not parsed as an (invalid) escape sequence.
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
89
90
def GlobEscape(s):
    # type: (str) -> str
    """Glob-escape a string that must match literally.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.  Delegates to
    pyutil.BackslashEscape() with GLOB_META_CHARS as the set to escape.
    """
    meta_chars = GLOB_META_CHARS
    return pyutil.BackslashEscape(s, meta_chars)
95
96
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Used to evaluate something like *$v with v='a\*b.txt'

    Every \ is replaced by \@, which is OK because @ is not in
    GLOB_META_CHARS.  No other byte is changed.

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally

    Returns:
      A new string with each backslash encoded as \@.
    """
    return s.replace('\\', r'\@')
114
115
# Bytes that ExtendedRegexEscape() backslash-escapes.
# Bug fix: [ and ] are included so that [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
118
119
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Regex-escape a quoted word part, e.g. the "{" in [[ $a =~ "{" ]].

    I don't think libc has a function to do this.  Delegates to
    pyutil.BackslashEscape() with ERE_META_CHARS as the set to escape:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    meta_chars = ERE_META_CHARS
    return pyutil.BackslashEscape(s, meta_chars)
128
129
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Decoding rules:
    - \ followed by a byte in GLOB_META_CHARS yields that byte
    - \@ yields a literal backslash
    - a backslash that is the LAST byte of s is copied through unchanged

    Raises:
      AssertionError if a backslash is followed by any other byte.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    last = n - 1
    while i < n:
        c = mylib.ByteAt(s, i)

        if mylib.ByteEquals(c, '\\') and i != last:
            # TODO: GlobEscape() turns \ into \\, so a string should never
            # end with a single backslash.  The 'Trailing backslash' assert
            # that used to be here was suppressed to fix bug #698 (#628 is
            # still there) by adding the 'i != last' guard above, which made
            # the assert unreachable.  Check those bugs again.
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            elif mylib.ByteEquals(c2, '@'):
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            # Ordinary byte, or a trailing backslash
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
167
168
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash: turn each \@ back into a backslash.

    A backslash followed by any byte other than @ is copied through together
    with that byte.  A backslash that is the last byte of s is copied through
    unchanged.  (GlobEscapeBackslash() doesn't turn \ into \\, so it is not
    obvious that a string can't end with a single backslash.)
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    last = n - 1
    while i < n:
        c = mylib.ByteAt(s, i)

        if mylib.ByteEquals(c, '\\') and i != last:
            i += 1
            c2 = mylib.ByteAt(s, i)

            # Both \@ and \<other> produce a backslash; only the latter keeps
            # the byte that follows it.
            unescaped.append(pyos.BACKSLASH_CH)
            if not mylib.ByteEquals(c2, '@'):
                unescaped.append(c2)
        else:
            # Ordinary byte, or a trailing backslash
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
195
196
197# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
198# positions of matches. So we convert globs to regexps.
199
200# Problems:
201# - What about unicode? Do we have to set any global variables? We want it to
202# always use utf-8?
203
204
class _GlobParser(object):
    """Turns the token stream of a glob pattern into a list of glob_part_t.

    Syntax problems never raise; they are recorded as strings in
    self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class; the current token is the opening [.

        Returns:
          A one-element list holding a CharClass if the brackets balance.
          Otherwise a list of Literal parts, one per token consumed
          (starting with the [), and a warning is appended.
        """
        lbracket = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while depth != 0:
            self._Next()
            tok_type = self.token_type

            if tok_type == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [lbracket]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_type == Id.Glob_LBracket:
                depth += 1
            elif tok_type == Id.Glob_RBracket:
                depth -= 1

            if depth != 0:  # Don't record the closing ]
                inner.append((tok_type, self.token_val))

        negated = False
        if len(inner) != 0:
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for (_, s) in inner:
            strs.append(s)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole pattern.

        Returns:
          The list of parsed glob parts
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            id_ = self.token_type
            #log('%s %r', self.token_type, self.token_val)

            if id_ == Id.Eol_Tok:
                break

            if id_ == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())

            elif id_ == Id.Glob_Star or id_ == Id.Glob_QMark:
                parts.append(glob_part.Operator(id_))

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                # EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(id_, self.token_val))

                # Also check for warnings.  TODO: location info.
                if id_ == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                elif id_ == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
306
307
308_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
309
310
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Render parsed glob parts as an extended regular expression string.

    - Literal parts are emitted with the characters in
      _REGEX_CHARS_TO_ESCAPE backslash-escaped.
    - The ? operator becomes '.' and the * operator becomes '.*'.
    - Character classes are copied through, except for the two rewrites
      described in the comments below.

    Raises:
      AssertionError for a literal or operator ID that isn't handled.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it's a literal, and
            # it gets escaped here because it's in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
408
409
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Translate a glob pattern into an extended regular expression.

    The pattern is lexed with match.GlobLexer, parsed with _GlobParser, and
    rendered with _GenerateERE.

    Returns:
      The regex string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # Vestigial idea: if there is nothing like * ? or [abc] (no Operator or
    # CharClass part), then the whole string is a literal, and we could use a
    # more efficient mechanism.  But we would have to DEQUOTE before doing
    # that.

    if 0:  # Debug dump of the parsed parts
        log('GlobToERE()')
        for part in parts:
            log(' %s', part)

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
432
433
434# Notes for implementing extglob
435# - libc glob() doesn't have any extension!
436# - Nix stdenv uses !(foo) and @(foo|bar)
437# - can we special case these for now?
438# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
439# result!
440# - Actually I guess we can do that for all of them. That seems fine.
441# - But we have to get the statically parsed arg in here?
442# - or do dynamic parsing
443# - LooksLikeGlob() would have to respect extglob! ugh!
444# - See 2 calls in osh/word_eval.py
445
446
class Globber(object):
    """Performs glob expansion, consulting the shell options in exec_opts.

    Options read here: dotglob, no_dash_glob, noglob, failglob, nullglob.
    """

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with
        #                    fnmatch(), but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other
        # shells do.  Could a default GLOBIGNORE to ignore flags on the file
        # system be part of the security solution?  It doesn't seem totally
        # sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Call libc.glob() on arg and append the filtered matches to out.

        Matches starting with - are dropped when no_dash_glob is on, and the
        entries '.' and '..' are always dropped.

        Returns:
          The number of items appended to out (0 if nothing is left).
        Raises:
          RuntimeError: re-raised from libc.glob() after printing a message
          to stderr.
        """
        try:
            flags = 0
            if self.exec_opts.dotglob():
                # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
                # GLOB_PERIOD as 0, a no-op
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, results)

        if len(results) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # no_dash_glob is part of shopt --set ysh:upgrade
        if self.exec_opts.no_dash_glob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which are
        # typically not of interest.  Filtering in this manner is similar
        # (but not identical) to the default bash setting of
        # 'shopt -s globskipdots'.  Supporting that option fully would
        # require more than simply wrapping this in an if statement.
        n = 0
        for s in results:
            if s not in ('.', '..'):
                out.append(s)
                n += 1
        return n

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended (0 when nothing matched and nullglob is
          on), or -1 when glob expansion did not happen.  On -1 the caller
          should use the original string.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        n = self._Glob(arg, out)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, filter by fnmatch_pat.

        The results of _Glob(glob_pat) are kept only if libc.fnmatch()
        accepts them against fnmatch_pat.

        Returns:
          Number of items appended to out.  When nothing matched:
          -1 if failglob is on (nothing is appended; unlike Expand(), no
          exception is raised here), 0 if nullglob is on, and otherwise 1,
          after appending the unescaped fnmatch_pat.
          With noglob on, fnmatch_pat itself is appended and 1 is returned.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self._Glob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # Same caveat about the pattern's form as in the noglob case
        out.append(GlobUnescape(fnmatch_pat))
        return 1