1 | """Glob_.py."""
|
2 |
|
3 | import libc
|
4 |
|
5 | from _devbuild.gen.id_kind_asdl import Id, Id_t
|
6 | from _devbuild.gen.syntax_asdl import (
|
7 | CompoundWord,
|
8 | Token,
|
9 | word_part_e,
|
10 | glob_part,
|
11 | glob_part_e,
|
12 | glob_part_t,
|
13 | )
|
14 | from core import pyutil
|
15 | from frontend import match
|
16 | from libc import GLOB_PERIOD
|
17 | from mycpp import mylib
|
18 | from mycpp.mylib import log, print_stderr
|
19 |
|
20 | from typing import List, Tuple, cast, TYPE_CHECKING
|
21 | if TYPE_CHECKING:
|
22 | from core import optview
|
23 | from frontend.match import SimpleLexer
|
24 |
|
25 | _ = log
|
26 |
|
27 |
|
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calling glob() unless the string has glob
    metacharacters.  A string "looks like a glob" if it has an unescaped * or
    ?, or an unescaped ] that comes after an unescaped [.  A backslash hides
    the byte that follows it.

    TODO: Reference lib/glob / glob_pattern functions in bash
      $ grep glob_pattern lib/glob/*

    According to the original notes, this is used for glob expansion and for
    the slow path / fast path of prefix/suffix/patsub ops (callers are not
    in this file).
    """
    seen_lbracket = False
    length = len(s)
    pos = 0
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # Skip the escaped byte too, so it can't count as a metachar.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_lbracket = True
        elif seen_lbracket and mylib.ByteEquals(byte, ']'):
            # At least one [ precedes this ].  Stray [ or ] are not checked
            # any further.
            return True

    return False
|
64 |
|
65 |
|
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but inspects the Literal tokens of a static word.

    Returns True if the word has a Lit_Star or Lit_QMark token, or a
    Lit_RBracket token that follows a Lit_LBracket token.  Parts that are not
    Literal are ignored.
    """
    seen_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_lbracket = True
        elif seen_lbracket and tok_id == Id.Lit_RBracket:
            return True

    return False
|
81 |
|
82 |
|
# Glob Helpers for WordParts.
#
# GLOB_META_CHARS is the set of characters that GlobEscape() backslash-escapes,
# and the only characters GlobUnescape() accepts after a backslash.
#
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
#       underlying library doesn't support extended globs
# We don't need to escape the @ in @(cc), because escaping ( is enough.
GLOB_META_CHARS = r'\*?[]-:!()|'
|
90 |
|
91 |
|
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the glob metacharacters in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
|
96 |
|
97 |
|
# Characters that ExtendedRegexEscape() backslash-escapes.
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
|
100 |
|
101 |
|
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters in s.

    Quoted parts of a regex operand need to be escaped so they match
    literally, e.g. [[ $a =~ "{" ]].  I don't think libc has a function to do
    this.  The characters escaped are those in ERE_META_CHARS; see:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
|
110 |
|
111 |
|
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Args:
      s: A glob-escaped string, as produced with GlobEscape().

    Returns:
      The string with each backslash-escaped metacharacter replaced by the
      bare metacharacter.  A trailing backslash is kept as is.

    Raises:
      AssertionError: If a backslash is followed by a byte that is not in
        GLOB_META_CHARS.
    """
    # Complicated example: 'a*b'*.py, which will be escaped to a\*b*.py.  So in
    # word_eval _JoinElideEscape and EvalWordToString you have to build two
    # 'parallel' strings -- one escaped and one not.
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A trailing backslash (i == n - 1) is deliberately NOT treated as an
        # escape: it falls through and is copied literally.  There used to be
        # an assertion on it; it was suppressed to fix bug #698, #628 is still
        # there.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
|
143 |
|
144 |
|
# For ${x//foo*/y}, we need to match glob patterns, but fnmatch doesn't give
# you the positions of matches.  So we convert globs to regexps.
|
147 |
|
148 | # Problems:
|
149 | # - What about unicode? Do we have to set any global variables? We want it to
|
150 | # always use utf-8?
|
151 |
|
152 |
|
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob_part_t.

    Warnings about questionable syntax are accumulated in self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token and store it on self."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class; the current token is the opening [.

        Returns:
          A one-element list holding a CharClass if the brackets balance
          before the end of input.  Otherwise a list of Literal parts for the
          [ and every token after it, and a warning is appended.
        """
        lbracket = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.  Also
        # [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.  I'm
        # punting on this now because the rule isn't clear and consistent
        # between shells.

        while True:
            self._Next()
            tok_type = self.token_type

            if tok_type == Id.Eol_Tok:
                # Ran out of input before the class was closed.
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [lbracket]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_type == Id.Glob_LBracket:
                depth += 1
            elif tok_type == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # Don't record the last ]

            inner.append((tok_type, self.token_val))

        negated = False
        if len(inner):
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id in (Id.Glob_Bang, Id.Glob_Caret):
                negated = True
                inner = inner[1:]

        strs = [s for _, s in inner]
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole token stream.

        Returns:
          A list of glob parts (Operator, CharClass, or Literal)
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            id_ = self.token_type
            s = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if id_ == Id.Eol_Tok:
                break

            if id_ == Id.Glob_Star or id_ == Id.Glob_QMark:
                parts.append(glob_part.Operator(id_))
                continue

            if id_ == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())
                continue

            # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,EscapedChar,
            #       BadBackslash}
            parts.append(glob_part.Literal(id_, s))

            # Also check for warnings.  TODO: location info.
            if id_ == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif id_ == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
|
254 |
|
255 |
|
# Characters that _GenerateERE() prefixes with a backslash when it emits a
# glob literal into the regex.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
|
257 |
|
258 |
|
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Generate an extended regular expression from parsed glob parts.

    Args:
      parts: Literal, Operator, and CharClass parts, as returned by
        _GlobParser.Parse().

    Returns:
      The regex as a string.  ? becomes '.', * becomes '.*', literals are
      backslash-escaped where needed, and character classes are copied with
      two adjustments described below.

    Raises:
      AssertionError: On a Literal or Operator with an unexpected Id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it's a literal, and it
            # gets a backslash because it's in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because we
            # assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But we
            #   want a literal, and the POSIX way to do that is to put it at the
            #   end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':  # backslash followed by hyphen
                    literal_hyphen = True
                    continue
                if s == '\\]':  # backslash followed by right bracket
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
|
356 |
|
357 |
|
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: The glob pattern.

    Returns:
      The regex string generated from the parsed pattern
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # Vestigial: if there is nothing like * ? or [abc], then the whole string is
    # a literal, and we could use a more efficient mechanism.
    # But we would have to DEQUOTE before doing that.

    if 0:
        log('GlobToERE()')
        for part in parts:
            log('  %s', part)

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
|
380 |
|
381 |
|
382 | # Notes for implementing extglob
|
383 | # - libc glob() doesn't have any extension!
|
384 | # - Nix stdenv uses !(foo) and @(foo|bar)
|
385 | # - can we special case these for now?
|
386 | # - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
|
387 | # result!
|
388 | # - Actually I guess we can do that for all of them. That seems fine.
|
389 | # - But we have to get the statically parsed arg in here?
|
390 | # - or do dynamic parsing
|
391 | # - LooksLikeGlob() would have to respect extglob! ugh!
|
392 | # - See 2 calls in osh/word_eval.py
|
393 |
|
394 |
|
class Globber(object):
    """Expands glob patterns with libc.glob(), subject to shell options.

    The options consulted through exec_opts are noglob, dotglob, dashglob,
    failglob, and nullglob.
    """

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with fnmatch(),
        #                    but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other shells
        # do.  Could a default GLOBIGNORE to ignore flags on the file system be
        # part of the security solution?  It doesn't seem totally sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Call libc.glob() on arg and append the filtered matches to 'out'.

        Matches starting with - are dropped when dashglob is off, and the
        entries '.' and '..' are always dropped.

        Returns:
          Number of items appended; 0 if nothing matched or everything was
          filtered out.

        Raises:
          RuntimeError: Re-raised from libc.glob() after printing a message
            to stderr.
        """
        flags = 0
        if self.exec_opts.dotglob():
            flags |= GLOB_PERIOD

        try:
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or unknown
            # There are no syntax errors.  (But see comment about globerr() in
            # native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, results)

        if len(results) == 0:
            return 0

        # Something matched

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which
        # are typically not of interest.  Filtering in this manner
        # is similar (but not identical) to the default bash
        # setting of 'shopt -s globskipdots'.  Supporting that
        # option fully would require more than simply wrapping
        # this in an if statement.
        n = 0
        for s in results:
            if s not in ('.', '..'):
                out.append(s)
                n += 1
        return n

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        With noglob, arg is appended unchanged.  If nothing matches, the
        result depends on failglob (error), nullglob (nothing appended), or
        neither (the unescaped pattern is appended).

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        n = self._Glob(arg, out)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            return -1

        if self.exec_opts.nullglob():
            return 0
        else:
            # Return the original string
            out.append(GlobUnescape(arg))
            return 1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, filter with fnmatch_pat.

        The results of globbing glob_pat are kept only if they match
        fnmatch_pat according to libc.fnmatch().  The noglob, failglob, and
        nullglob options are handled as in Expand(), using fnmatch_pat as the
        string to fall back on.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(), and
            # there is extra \ escaping compared with bash and mksh.  OK for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self._Glob(glob_pat, tmp)  # the count is recomputed after filtering
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0
        else:
            # Nothing matched, so return the original pattern, unescaped
            out.append(GlobUnescape(fnmatch_pat))
            return 1
|