OILS / osh / glob_.py View on Github | oils.pub

509 lines, 264 significant
1"""Glob_.py."""
2
3import libc
4
5from _devbuild.gen.id_kind_asdl import Id, Id_t
6from _devbuild.gen.syntax_asdl import (
7 CompoundWord,
8 Token,
9 word_part_e,
10 glob_part,
11 glob_part_e,
12 glob_part_t,
13)
14from core import pyutil
15from frontend import match
16from mycpp import mylib
17from mycpp.mylib import log, print_stderr
18
19from libc import GLOB_PERIOD, HAVE_GLOB_PERIOD
20
21from typing import List, Tuple, cast, TYPE_CHECKING
22if TYPE_CHECKING:
23 from core import optview
24 from frontend.match import SimpleLexer
25
# Presumably keeps the `log` import referenced even when the debug log()
# calls in this module are commented out -- TODO confirm.
_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calling glob() unless the string actually
    has glob syntax in it.

    A string "looks like a glob" when it has an unescaped * or ?, or an
    unescaped [ that is followed later by an unescaped ].  A backslash hides
    the byte that follows it.

    TODO: Reference lib/glob / glob_pattern functions in bash
      $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_left_bracket = False
    length = len(s)
    pos = 0
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # Skip over the escaped byte as well.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_left_bracket = True
            continue

        if seen_left_bracket and mylib.ByteEquals(byte, ']'):
            # It has at least one pair of balanced [].  Stray [ or ] are not
            # checked.
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for a statically parsed word.

    Only Literal parts are inspected: a literal * or ? token, or a literal [
    token followed later by a literal ] token, makes the word a glob.
    """
    seen_left_bracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_left_bracket = True
        elif tok_id == Id.Lit_RBracket and seen_left_bracket:
            return True

    return False
82
83
# Glob helpers for word parts.
#
# GLOB_META_CHARS is the set of characters that GlobEscape() prefixes with a
# backslash, and the only characters GlobUnescape() accepts after a backslash.
#
# NOTE: Escaping / doesn't work, because it's not a filename character.
#   ! : -   are metachars within character classes
#   ( ) |   are extended glob characters, and it's OK to add an extra \ when
#           the underlying library doesn't support extended globs
# We don't need to escape the @ in @(cc), because escaping ( is enough.
GLOB_META_CHARS = r'\*?[]-:!()|'
91
92
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the glob metacharacters in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral parts.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
97
98
# Characters that ExtendedRegexEscape() prefixes with a backslash.
# Bug fix: [] are included so that [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
101
102
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters in s.

    Quoted parts of a regex operand must match literally, e.g.
    [[ $a =~ "{" ]].  I don't think libc has a function to do this.  The
    characters escaped are the ones in ERE_META_CHARS; see:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
111
112
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: a glob-escaped string

    Returns:
      s with each backslash + GLOB_META_CHARS pair replaced by the bare
      character.  A backslash that is the last byte of s is kept as-is.

    Raises:
      AssertionError: if a backslash is followed by a byte that is not in
        GLOB_META_CHARS.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A trailing backslash has nothing to escape, so it is excluded here
        # and copied literally by the else branch.  It used to be rejected
        # with an assertion; that was suppressed to fix bug #698 (#628 is
        # still there).
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
144
145
146# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
147# positions of matches. So we convert globs to regexps.
148
149# Problems:
150# - What about unicode? Do we have to set any global variables? We want it to
151# always use utf-8?
152
153
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob parts."""

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token; overwritten by every _Next() call.
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        # Syntax warnings accumulated while parsing.
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Pull the next (id, value) pair from the lexer."""
        tok_id, tok_val = self.lexer.Next()
        self.token_type = tok_id
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class; the current token is the opening [.

        Returns:
          A one-element list holding a CharClass if the parse succeeds.  If
          the input ends before the brackets balance, a list of Literal parts
          (the [ and every token after it) is returned, and a warning is
          appended.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        seen = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while depth != 0:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (id_, s) in seen:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1

            # The ] that closes the class is not recorded.
            if depth != 0:
                seen.append((tok_id, self.token_val))

        negated = False
        if len(seen) > 0:
            first_id, _ = seen[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                seen = seen[1:]

        strs = [s for _, s in seen]
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole token stream.

        Returns:
          A tuple (parts, warnings):
            parts: the Operator / CharClass / Literal parts, in order
            warnings: messages about questionable syntax
        """
        parts = []  # type: List[glob_part_t]

        self._Next()
        while self.token_type != Id.Eol_Tok:
            id_ = self.token_type
            s = self.token_val
            #log('%s %r', self.token_type, self.token_val)

            if id_ == Id.Glob_LBracket:
                # Could yield Literals or a CharClass
                parts.extend(self._ParseCharClass())

            elif id_ == Id.Glob_Star or id_ == Id.Glob_QMark:
                parts.append(glob_part.Operator(id_))

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                #       EscapedChar,BadBackslash}
                # Two of these deserve a warning.  TODO: location info.
                if id_ == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                elif id_ == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

                parts.append(glob_part.Literal(id_, s))

            self._Next()

        return parts, self.warnings
255
256
# Characters that _GenerateERE() prefixes with a backslash when they appear as
# literal glob text.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
258
259
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Translate parsed glob parts into an extended regular expression.

    Args:
      parts: glob parts, as produced by _GlobParser.Parse()

    Returns:
      The ERE as a string.  Literal text is backslash-escaped where needed,
      ? becomes '.', * becomes '.*', and character classes are emitted as
      bracket expressions.

    Raises:
      AssertionError: on a Literal or Operator with an unexpected Id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only special inside a char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only special inside a glob char class.  Outside of one it
            # is a literal, so it goes through the same escaping as any other
            # single character (^ is in _REGEX_CHARS_TO_ESCAPE).
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
357
358
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      A tuple (regex, warnings):
        regex: the ERE string generated from the parsed pattern
        warnings: syntax warnings collected by the glob parser
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: If there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would have
    # to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
381
382
383# Notes for implementing extglob
384# - libc glob() doesn't have any extension!
385# - Nix stdenv uses !(foo) and @(foo|bar)
386# - can we special case these for now?
387# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
388# result!
389# - Actually I guess we can do that for all of them. That seems fine.
390# - But we have to get the statically parsed arg in here?
391# - or do dynamic parsing
392# - LooksLikeGlob() would have to respect extglob! ugh!
393# - See 2 calls in osh/word_eval.py
394
395
class Globber(object):
    """Expands glob patterns with libc, honoring the shell's glob options."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # dotglob           dotfiles are matched
        # globstar          ** for directories
        # globasciiranges   ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob           the @() !() syntax -- libc helps us with fnmatch(),
        #                   but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other
        # shells do.  Could a default GLOBIGNORE to ignore flags on the file
        # system be part of the security solution?  It doesn't seem totally
        # sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg and append the surviving matches to 'out'.

        Names starting with - are dropped when dashglob is off, and '.' and
        '..' are always dropped.

        Returns:
          Number of items appended to 'out'.
        """
        try:
            flags = 0
            if self.exec_opts.dotglob() and HAVE_GLOB_PERIOD:
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, g)

        if len(results) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which are
        # typically not of interest.  Filtering in this manner is similar
        # (but not identical) to the default bash setting of
        # 'shopt -s globskipdots'.  Supporting that option fully would
        # require more than simply wrapping this in an if statement.
        num_appended = 0
        for path in results:
            if path == '.' or path == '..':
                continue
            out.append(path)
            num_appended += 1
        return num_appended

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        num_matched = self._Glob(arg, out)
        if num_matched != 0:
            return num_matched

        # Nothing matched
        if self.exec_opts.failglob():
            return -1

        if self.exec_opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(arg))
        return 1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, then keep only the
        results that fnmatch_pat accepts.

        Returns:
          Number of items appended to 'out', or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self._Glob(glob_pat, candidates)
        filtered = [s for s in candidates if libc.fnmatch(fnmatch_pat, s)]
        num_matched = len(filtered)

        if num_matched != 0:
            out.extend(filtered)
            return num_matched

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # As in Expand(): fall back to the unescaped pattern itself
        out.append(GlobUnescape(fnmatch_pat))
        return 1