osh/glob

OILS / osh / glob_.py View on Github | oils.pub

509 lines, 264 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (
7	CompoundWord,
8	Token,
9	word_part_e,
10	glob_part,
11	glob_part_e,
12	glob_part_t,
13	)
14	from core import pyutil
15	from frontend import match
16	from mycpp import mylib
17	from mycpp.mylib import log, print_stderr
18
19	from libc import GLOB_PERIOD, HAVE_GLOB_PERIOD
20
21	from typing import List, Tuple, cast, TYPE_CHECKING
22	if TYPE_CHECKING:
23	from core import optview
24	from frontend.match import SimpleLexer
25
26	_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether a string contains unescaped glob metacharacters.

    Like other shells, OSH only calls glob() when the string actually has
    glob metacharacters in it.

    A string looks like a glob if it has an unescaped * or ?, or an unescaped
    [ that is followed later by an unescaped ].  A backslash hides the byte
    right after it.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open = False
    pos = 0
    length = len(s)
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            pos += 1  # the escaped byte can never be a metacharacter
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open = True
            continue

        if mylib.ByteEquals(byte, ']') and seen_open:
            # At least one balanced pair of [].  Stray [ or ] are not checked.
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Token-level counterpart of LooksLikeGlob, for statically parsed words.

    Only Literal parts are inspected: a * or ? token, or a [ token followed
    later by a ] token, makes the word look like a glob.
    """
    seen_open = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_open = True
        elif tok_id == Id.Lit_RBracket and seen_open:
            return True

    return False
82
83
# Glob helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
#   ! : - are metacharacters within character classes.
#   ( ) | are extended glob characters.  Adding an extra \ is OK when the
#   underlying library doesn't support extended globs.
#   The @ in @(cc) doesn't need to be escaped, because escaping ( is enough.
GLOB_META_CHARS = r'\*?[]-:!()\|'


def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the glob metacharacters in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
97
98
# [ and ] are part of the set so that [[:space:]] etc. is not special
# (this was a bug fix).
ERE_META_CHARS = r'\?*+{}^$.()\|[]'


def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters in s.

    Quoted parts of a regex operand must match literally, e.g.
    [[ $a =~ "{" ]].  I don't think libc has a function to do this.  The
    escaped characters are the ERE metacharacters described at:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
111
112
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: a glob-escaped string, e.g. the output of GlobEscape()

    Returns:
      s with each backslash that precedes a byte of GLOB_META_CHARS removed.
      A backslash that is the last byte of s is kept as is.

    Raises:
      AssertionError: if a backslash precedes a byte that is not in
        GLOB_META_CHARS.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A backslash only escapes when another byte follows it; a trailing
        # backslash is copied through literally.  The trailing-backslash
        # assertion that used to live here was suppressed to fix bug #698
        # (#628 is still there), and this condition made it unreachable.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
144
145
146	# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
147	# positions of matches. So we convert globs to regexps.
148
149	# Problems:
150	# - What about unicode? Do we have to set any global variables? We want it to
151	# always use utf-8?
152
153
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob_part_t.

    Syntax problems never raise; they are collected in self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # The current token, refreshed by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_type, tok_val = self.lexer.Next()
        self.token_type = tok_type
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse the rest of a [...] group; the current token is the opening [.

        Returns:
          A single CharClass if the brackets balance before the end of input.
          Otherwise every token seen, including the opening [, comes back as a
          Literal, and a warning is recorded.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # for the [ that got us here
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while depth != 0:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (id_, s) in inner:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1

            if depth != 0:  # Don't append the last ]
                inner.append((tok_id, self.token_val))

        negated = False
        if len(inner) != 0:
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = [s for _, s in inner]
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse every token up to the end of the pattern.

        Returns:
          The list of glob parts (Literal, Operator, CharClass)
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            tok_id = self.token_type
            tok_val = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if tok_id == Id.Eol_Tok:
                break

            if tok_id == Id.Glob_LBracket:
                # Could produce Literals or a CharClass
                parts.extend(self._ParseCharClass())
                continue

            if tok_id == Id.Glob_Star or tok_id == Id.Glob_QMark:
                parts.append(glob_part.Operator(tok_id))
                continue

            # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,EscapedChar,
            # BadBackslash}
            parts.append(glob_part.Literal(tok_id, tok_val))

            # Also check for warnings.  TODO: location info.
            if tok_id == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif tok_id == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
255
256
# Characters that must be preceded by a backslash to be literal in an ERE.
# Only used for single-character membership tests.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Translate parsed glob parts into an extended regular expression.

    Args:
      parts: the glob parts produced by _GlobParser.Parse()

    Returns:
      The ERE as a string.  ? becomes '.', * becomes '.*', literals are
      backslash-escaped where the regex syntax requires it, and character
      classes are copied through with two adjustments (see below).

    Raises:
      AssertionError: on a Literal or Operator with an unexpected id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it is a literal, and
            # it is escaped below because it is in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But we
            #   want a literal, and the POSIX way to do that is to put it at
            #   the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
357
358
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: the glob pattern

    Returns:
      The regex string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: If there is nothing like * ? or [abc] in parts, then the whole
    # string is a literal, and we could use a more efficient mechanism.  But
    # we would have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    return regex, warnings
381
382
# Notes for implementing extglob
# - libc glob() doesn't have any extension!
# - Nix stdenv uses !(foo) and @(foo|bar)
# - can we special case these for now?
# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
#   result!
# - Actually I guess we can do that for all of them. That seems fine.
# - But we have to get the statically parsed arg in here?
# - or do dynamic parsing
# - LooksLikeGlob() would have to respect extglob! ugh!
# - See 2 calls in osh/word_eval.py
394
395
class Globber(object):
    """Expands glob patterns with libc.glob(), honoring shell options."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        self.exec_opts = exec_opts

        # dotglob is handled in _Glob() when libc provides GLOB_PERIOD.
        #
        # Other unimplemented bash options:
        #
        # globstar         ** for directories
        # globasciiranges  ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob          the @() !() syntax -- libc helps us with fnmatch(),
        #                  but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other shells
        # do.  Could a default GLOBIGNORE to ignore flags on the file system be
        # part of the security solution?  It doesn't seem totally sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg and append the filtered matches to out.

        Returns:
          Number of items appended.

        Raises:
          RuntimeError: re-raised from libc.glob() after printing a message
            to stderr.
        """
        try:
            flags = 0
            if self.exec_opts.dotglob() and HAVE_GLOB_PERIOD:
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, g)

        if len(results) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which are
        # typically not of interest.  Filtering in this manner is similar (but
        # not identical) to the default bash setting of
        # 'shopt -s globskipdots'.  Supporting that option fully would require
        # more than simply wrapping this in an if statement.
        num_appended = 0
        for path in results:
            if path in ('.', '..'):
                continue
            out.append(path)
            num_appended += 1
        return num_appended

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        With noglob, arg is appended unchanged.  If nothing matches, the
        result depends on failglob and nullglob; with neither set, the
        unescaped pattern itself is appended.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        num_matches = self._Glob(arg, out)
        if num_matches != 0:
            return num_matches

        # Nothing matched
        if self.exec_opts.failglob():
            return -1

        if self.exec_opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(arg))
        return 1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, filter by fnmatch_pat.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        candidates = []  # type: List[str]
        self._Glob(glob_pat, candidates)
        filtered = [s for s in candidates if libc.fnmatch(fnmatch_pat, s)]
        num_matches = len(filtered)

        if num_matches != 0:
            out.extend(filtered)
            return num_matches

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # As in Expand(): fall back to the unescaped pattern itself
        out.append(GlobUnescape(fnmatch_pat))
        return 1