osh/glob

OILS / osh / glob_.py View on Github | oils.pub

565 lines, 281 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7	glob_part, glob_part_e, glob_part_t,
8	loc_t)
9	from core import pyos, pyutil, error
10	from frontend import match
11	from mycpp import mylib
12	from mycpp.mylib import log, print_stderr
13
14	from libc import GLOB_PERIOD
15
16	from typing import List, Tuple, cast, TYPE_CHECKING
17	if TYPE_CHECKING:
18	from core import optview
19	from frontend.match import SimpleLexer
20
21	_ = log
22
23
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    A string "looks like a glob" when it has an unescaped * or ?, or an
    unescaped [ that is later followed by an unescaped ].  A byte preceded by
    a backslash is never considered.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_lbracket = False
    pos = 0
    length = len(s)
    while pos < length:
        ch = mylib.ByteAt(s, pos)

        if mylib.ByteEquals(ch, '\\'):
            # Skip the backslash AND the byte it escapes.
            pos += 2
            continue

        if mylib.ByteEquals(ch, '*') or mylib.ByteEquals(ch, '?'):
            return True

        if mylib.ByteEquals(ch, '['):
            seen_lbracket = True
        elif seen_lbracket and mylib.ByteEquals(ch, ']'):
            # At least one [ ... ] pair.  Stray [ or ] are not checked.
            return True

        pos += 1

    return False
60
61
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for the parts of a statically parsed word.

    Only Literal parts are inspected.  Returns True for a Lit_Star or
    Lit_QMark token, or for a Lit_RBracket that comes after a Lit_LBracket.
    """
    seen_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_lbracket = True
        elif seen_lbracket and tok_id == Id.Lit_RBracket:
            return True

    return False
77
78
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
GLOB_META_CHARS = r'\*?[]-:!()|'

# Check invariant needed to escape literal \ as \@
# (raw string: '\@' is not a valid escape sequence in a normal literal)
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
89
90
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the glob metacharacters in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral parts.  The set of
    characters that get escaped is GLOB_META_CHARS.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
95
96
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Used to evaluate something like $v with v='a\b.txt'

    Every \ is rewritten as \@, which is OK because @ is not in
    GLOB_META_CHARS.

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally
    """
    escaped = s.replace('\\', '\\@')
    return escaped
114
115
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
118
119
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters in s.

    Quoted parts need to be regex-escaped, e.g. [[ $a =~ "{" ]].  I don't
    think libc has a function to do this.  The escaped characters are the
    ones in ERE_META_CHARS:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
128
129
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    A backslash followed by a byte in GLOB_META_CHARS yields that byte, and
    \@ yields a single backslash.  For example a\*b becomes a*b.  A lone
    backslash at the very end of the string is copied through unchanged.

    Raises:
      AssertionError if a backslash is followed by any other byte.
    """
    unescaped = []  # type: List[int]
    pos = 0
    last = len(s) - 1
    while pos <= last:
        ch = mylib.ByteAt(s, pos)

        if pos == last or not mylib.ByteEquals(ch, '\\'):
            # Ordinary byte, or a trailing backslash with nothing to escape.
            # (Bugs #698 and #628 are about strings ending in a backslash.)
            unescaped.append(ch)
            pos += 1
            continue

        # A backslash with at least one byte after it.
        escaped = mylib.ByteAt(s, pos + 1)
        if mylib.ByteInSet(escaped, GLOB_META_CHARS):
            unescaped.append(escaped)
        elif mylib.ByteEquals(escaped, '@'):
            unescaped.append(pyos.BACKSLASH_CH)
        else:
            raise AssertionError("Unexpected escaped character %r" % escaped)
        pos += 2

    return mylib.JoinBytes(unescaped)
167
168
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash - turns \@ into a single backslash.

    A backslash followed by any other byte is copied through together with
    that byte.  A lone backslash at the very end of the string is also copied
    through, since GlobEscapeBackslash() doesn't turn \ into \\.
    """
    unescaped = []  # type: List[int]
    pos = 0
    last = len(s) - 1
    while pos <= last:
        ch = mylib.ByteAt(s, pos)

        if pos < last and mylib.ByteEquals(ch, '\\'):
            following = mylib.ByteAt(s, pos + 1)
            # The backslash itself is always emitted; only the @ marker that
            # follows it is dropped.
            unescaped.append(pyos.BACKSLASH_CH)
            if not mylib.ByteEquals(following, '@'):
                unescaped.append(following)
            pos += 2
        else:
            unescaped.append(ch)
            pos += 1

    return mylib.JoinBytes(unescaped)
195
196
197	# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
198	# positions of matches. So we convert globs to regexps.
199
200	# Problems:
201	# - What about unicode? Do we have to set any global variables? We want it to
202	# always use utf-8?
203
204
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob_part_t."""

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        # Human-readable syntax warnings collected while parsing
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Move to the next token."""
        self.token_type, self.token_val = self.lexer.Next()

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class; the current token is its opening [.

        Returns:
          A one-element list holding a CharClass if the parse succeeds.  If
          the input ends before the brackets balance, a list of Literal parts
          (the opening [ and every token seen after it), and a warning is
          appended.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        seen = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while depth != 0:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (id_, s) in seen:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1

            if depth != 0:  # Don't record the final ]
                seen.append((tok_id, self.token_val))

        negated = False
        if len(seen) != 0:
            first_id, _ = seen[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                seen = seen[1:]

        strs = [s for _, s in seen]
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole token stream.

        Returns:
          The list of glob parts (Operator, CharClass, Literal)
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        self._Next()
        while self.token_type != Id.Eol_Tok:
            id_ = self.token_type
            #log('%s %r', self.token_type, self.token_val)

            if id_ == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())

            elif id_ == Id.Glob_Star or id_ == Id.Glob_QMark:
                parts.append(glob_part.Operator(id_))

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                # EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(id_, self.token_val))

                # Also check for warnings.  TODO: location info.
                if id_ == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                elif id_ == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

            self._Next()

        return parts, self.warnings
306
307
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Render parsed glob parts as an extended regular expression string.

    - Literal parts are copied, with a backslash added in front of any
      character in _REGEX_CHARS_TO_ESCAPE.
    - The ? operator becomes '.' and the * operator becomes '.*'.
    - Character classes are copied almost verbatim (see the comment below).

    Raises:
      AssertionError on a Literal or Operator with an unexpected id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class, so outside of one it's escaped like
            # any other regex metacharacter
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
408
409
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Translate a glob pattern into an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      The regex string produced by _GenerateERE()
      A list of warnings about the glob syntax, from _GlobParser
    """
    lexer = match.GlobLexer(pat)
    parser = _GlobParser(lexer)
    parts, warnings = parser.Parse()

    # NOTE: if there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would have
    # to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
432
433
434	# Notes for implementing extglob
435	# - libc glob() doesn't have any extension!
# - Nix stdenv uses !(foo) and @(foo|bar)
437	# - can we special case these for now?
# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
439	# result!
440	# - Actually I guess we can do that for all of them. That seems fine.
441	# - But we have to get the statically parsed arg in here?
442	# - or do dynamic parsing
443	# - LooksLikeGlob() would have to respect extglob! ugh!
444	# - See 2 calls in osh/word_eval.py
445
446
class Globber(object):
    """Expands glob patterns against the file system, honoring shell options."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with
        #                    fnmatch(), but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other
        # shells do.  Could a default GLOBIGNORE to ignore flags on the file
        # system be part of the security solution?  It doesn't seem totally
        # sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg and append the filtered matches to 'out'.

        Filtering:
        - with no_dash_glob, names starting with - are dropped
        - '.' and '..' are always dropped

        Returns:
          Number of items appended to 'out' (0 when nothing matched, or when
          everything was filtered out).
        Raises:
          RuntimeError, re-raised from libc.glob() after printing a message
          to stderr.
        """
        try:
            flags = 0
            if self.exec_opts.dotglob():
                # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
                # GLOB_PERIOD as 0, a no-op
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, results)

        if len(results) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # no_dash_glob is part of shopt --set ysh:upgrade
        if self.exec_opts.no_dash_glob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which are
        # typically not of interest.  Filtering in this manner is similar
        # (but not identical) to the default bash setting of
        # 'shopt -s globskipdots'.  Supporting that option fully would
        # require more than simply wrapping this in an if statement.
        n = 0
        for s in results:
            if s not in ('.', '..'):
                out.append(s)
                n += 1
        return n

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended, or -1 when glob expansion did not happen
          (noglob is set, or nothing matched and neither failglob nor
          nullglob is set).  With nullglob and no match, returns 0.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        n = self._Glob(arg, out)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, filter with fnmatch_pat.

        Returns:
          Number of items appended to 'out'.  When nothing matched: -1 under
          failglob (nothing appended), 0 under nullglob, otherwise 1 after
          appending the glob-unescaped fnmatch_pat.  Under noglob, fnmatch_pat
          itself is appended and 1 is returned.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self._Glob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # Fall back to the pattern itself, as in the noglob case, but with
        # the glob escaping removed.
        out.append(GlobUnescape(fnmatch_pat))
        return 1