OILS / osh / glob_.py View on Github | oilshell.org

508 lines, 264 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (
7 CompoundWord,
8 Token,
9 word_part_e,
10 glob_part,
11 glob_part_e,
12 glob_part_t,
13)
14from core import pyutil
15from frontend import match
16from libc import GLOB_PERIOD
17from mycpp import mylib
18from mycpp.mylib import log, print_stderr
19
20from typing import List, Tuple, cast, TYPE_CHECKING
21if TYPE_CHECKING:
22 from core import optview
23 from frontend.match import SimpleLexer
24
25_ = log
26
27
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    A string looks like a glob if it has an unescaped * or ?, or an unescaped
    ] that comes after an unescaped [.  A backslash hides the byte after it.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open = False
    pos = 0
    end = len(s)
    while pos < end:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # Step over the escaped byte as well; it can't be a metachar.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open = True
            continue

        if seen_open and mylib.ByteEquals(byte, ']'):
            # There is at least one [ followed by ].  Stray [ or ] are not
            # checked for.
            return True

    return False
64
65
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for static words.

    Only the Literal parts of the word are inspected.  The word looks like a
    glob if it has a Lit_Star or Lit_QMark token, or a Lit_RBracket that comes
    after a Lit_LBracket.
    """
    saw_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            saw_lbracket = True
        elif tok_id == Id.Lit_RBracket and saw_lbracket:
            return True

    return False
81
82
# Glob helpers for WordParts.
#
# GLOB_META_CHARS is the set of characters that get a backslash in front of
# them.  Notes:
# - / is not in the set: escaping it doesn't work, because it's not a filename
#   character.
# - ! : - are metachars within character classes.
# - ( ) | are extended glob characters, and it's OK to add an extra \ when the
#   underlying library doesn't support extended globs.
# - The @ in @(cc) doesn't need to be escaped, because escaping ( is enough.
GLOB_META_CHARS = r'\*?[]-:!()|'


def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the characters of s that are in GLOB_META_CHARS.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
96
97
# Characters that get a backslash in front of them for extended regexes.
# [] is part of the set (a bug fix) so that [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'


def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the characters of s that are in ERE_META_CHARS.

    Quoted parts of a regex need to be escaped, e.g. [[ $a =~ "{" ]].  I don't
    think libc has a function to do this.  The characters are described here:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
110
111
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: a glob-escaped string

    Returns:
      s with the backslash removed from each backslash + metachar pair.  A
      backslash that is the last byte of s is kept as is.

    Raises:
      AssertionError: if a backslash is followed by a byte that is not in
        GLOB_META_CHARS.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A trailing backslash is no longer an error; it falls through to the
        # literal case below.  This fixed bug #698; #628 is still there.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
143
144
# For ${x//foo*/y}, we need to match glob patterns, but fnmatch doesn't give
# you the positions of matches.  So we convert globs to regexps.

# Problems:
# - What about unicode?  Do we have to set any global variables?  Do we want
#   it to always use utf-8?
151
152
class _GlobParser(object):
    """Turns the tokens of a glob lexer into a list of glob parts."""

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # The current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        # Syntax warnings collected during the parse
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class.  The current token is the opening [.

        Returns:
          A list with a single CharClass if the parse succeeds.  If the input
          ends before the brackets are balanced, a list of Literal parts, and
          a warning is appended as well.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # The opening [ was already seen
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!  And
        # [[:space:]] and [[.class.]] have to be taken into account too.  I'm
        # punting on this now because the rule isn't clear and consistent
        # between shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # The closing ] is not recorded

            inner.append((tok_id, self.token_val))

        negated = False
        if len(inner) != 0:
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for (_, s) in inner:
            strs.append(s)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse all the tokens until the end of the input.

        Returns:
          The list of glob parts
          A list of warnings about the syntax
        """
        result = []  # type: List[glob_part_t]

        while True:
            self._Next()
            tok_id = self.token_type
            tok_val = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if tok_id == Id.Eol_Tok:
                break

            if tok_id == Id.Glob_Star or tok_id == Id.Glob_QMark:
                result.append(glob_part.Operator(tok_id))
                continue

            if tok_id == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                result.extend(self._ParseCharClass())
                continue

            # Everything else is a literal: Glob_{Bang,Caret,CleanLiterals,
            # OtherLiteral,RBracket,EscapedChar,BadBackslash}
            result.append(glob_part.Literal(tok_id, tok_val))

            # Also check for warnings.  TODO: location info.
            if tok_id == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif tok_id == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return result, self.warnings
254
255
# Characters that _GenerateERE() puts a backslash in front of when they appear
# as glob literals.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Generate an extended regular expression from a list of glob parts.

    Literals are copied, with a backslash added where the character is in
    _REGEX_CHARS_TO_ESCAPE.  The ? operator becomes '.' and * becomes '.*'.
    Character classes are copied through, except for \\- and \\] (see below).

    Args:
      parts: glob parts, e.g. from _GlobParser.Parse()

    Returns:
      The regex string.

    Raises:
      AssertionError: on a literal or operator with an unexpected id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it's a literal, so it
            # gets a backslash here, since it's in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But we
            #   want a literal, and the POSIX way to do that is to put it at
            #   the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
356
357
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    The pattern is lexed with match.GlobLexer, parsed with _GlobParser, and
    the resulting parts are passed to _GenerateERE.

    Args:
      pat: the glob pattern

    Returns:
      The regex string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: If there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would have
    # to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    return regex, warnings
380
381
382# Notes for implementing extglob
383# - libc glob() doesn't have any extension!
384# - Nix stdenv uses !(foo) and @(foo|bar)
385# - can we special case these for now?
386# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
387# result!
388# - Actually I guess we can do that for all of them. That seems fine.
389# - But we have to get the statically parsed arg in here?
390# - or do dynamic parsing
391# - LooksLikeGlob() would have to respect extglob! ugh!
392# - See 2 calls in osh/word_eval.py
393
394
class Globber(object):
    """Expands glob patterns with libc, according to the shell options."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with
        #                    fnmatch(), but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other
        # shells do.  Could a default GLOBIGNORE to ignore flags on the file
        # system be part of the security solution?  It doesn't seem totally
        # sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Call libc.glob() and append the filtered results to 'out'.

        Returns:
          Number of items appended.

        Raises:
          RuntimeError: re-raised from libc.glob(), after printing a message
            to stderr.
        """
        flags = 0
        try:
            if self.exec_opts.dotglob():
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise

        if len(results) == 0:
            return 0  # Nothing matched

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            kept = [s for s in results if not s.startswith('-')]
            results = kept  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which are
        # typically not of interest.  Filtering in this manner is similar
        # (but not identical) to the default bash setting of 'globskipdots'.
        # Supporting that option fully would require more than simply
        # wrapping this in an if statement.
        num_added = 0
        for name in results:
            if name not in ('.', '..'):
                out.append(name)
                num_added += 1
        return num_added

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        With noglob, the string itself is appended.  When nothing matches,
        failglob and then nullglob decide the result; otherwise the unescaped
        string is appended.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        num_matched = self._Glob(arg, out)
        if num_matched != 0:
            return num_matched

        # Nothing matched
        if self.exec_opts.failglob():
            return -1

        if self.exec_opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(arg))
        return 1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand glob_pat, and keep only the results that match fnmatch_pat.

        The results are appended to 'out'.  The noglob, failglob and nullglob
        options are handled as in Expand(), using fnmatch_pat as the string.

        Returns:
          Number of items appended, or -1 when nothing matched and failglob is
          on.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self._Glob(glob_pat, candidates)
        matched = [s for s in candidates if libc.fnmatch(fnmatch_pat, s)]

        num_matched = len(matched)
        if num_matched != 0:
            out.extend(matched)
            return num_matched

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(fnmatch_pat))
        return 1