osh/glob

OILS / osh / glob_.py View on Github | oilshell.org

508 lines, 264 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (
7	CompoundWord,
8	Token,
9	word_part_e,
10	glob_part,
11	glob_part_e,
12	glob_part_t,
13	)
14	from core import pyutil
15	from frontend import match
16	from libc import GLOB_PERIOD
17	from mycpp import mylib
18	from mycpp.mylib import log, print_stderr
19
20	from typing import List, Tuple, cast, TYPE_CHECKING
21	if TYPE_CHECKING:
22	from core import optview
23	from frontend.match import SimpleLexer
24
25	_ = log
26
27
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Cheap scan for glob metacharacters in a string.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    Returns True as soon as an unescaped * or ? is seen, or an unescaped ]
    that comes after an unescaped [.  A backslash hides the byte that follows
    it.  Stray (unbalanced) brackets are not otherwise checked.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Per the original notes, used by the Globber and for the slow path / fast
    path of prefix/suffix/patsub ops (callers are outside this function --
    TODO confirm).
    """
    seen_open = False
    pos = 0
    length = len(s)
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            pos += 1  # the escaped byte can never be a metacharacter
            continue

        if mylib.ByteEquals(byte, '*'):
            return True
        if mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open = True
            continue

        if seen_open and mylib.ByteEquals(byte, ']'):
            # At least one [ ... ] pair exists.
            return True

    return False
64
65
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but inspects the Literal parts of a static word.

    Returns True if any Literal part is a * or ? token, or if a ] token
    follows an earlier [ token.  Parts that are not Literal are ignored.
    """
    saw_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            saw_lbracket = True
        elif tok_id == Id.Lit_RBracket and saw_lbracket:
            return True

    return False
81
82
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
#
# The set of bytes that GlobEscape() backslash-escapes and that GlobUnescape()
# accepts after a backslash.  Each byte appears exactly once.
GLOB_META_CHARS = r'\*?[]-:!()|'
90
91
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape glob metacharacters in s.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.  Delegates to
    pyutil.BackslashEscape() with GLOB_META_CHARS as the set to escape.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
96
97
# Characters that ExtendedRegexEscape() backslash-escapes.
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
100
101
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape ERE metacharacters in s.

    Quoted parts need to be regex-escaped, e.g. [[ $a =~ "{" ]].  I don't
    think libc has a function to do this.  The characters escaped are those
    in ERE_META_CHARS; see:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
110
111
def GlobUnescape(s):
    # type: (str) -> str
    """Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Args:
      s: A string in which glob metacharacters may be preceded by a backslash.

    Returns:
      s with the backslash removed from each backslash + metacharacter pair.
      A backslash that is the very last byte is copied through unchanged.

    Raises:
      AssertionError: if a backslash is followed by a byte that is not in
        GLOB_META_CHARS.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # A trailing backslash has nothing to escape, so it falls through to
        # the literal branch instead of being treated as an error.  (This was
        # relaxed to fix bug #698; #628 is still there.)
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if not mylib.ByteInSet(c2, GLOB_META_CHARS):
                raise AssertionError("Unexpected escaped character %r" % c2)
            unescaped.append(c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
143
144
# For ${x//foo*/y}, we need to match glob patterns, but fnmatch doesn't give
# you the positions of matches.  So we convert globs to regexps.
147
148	# Problems:
149	# - What about unicode? Do we have to set any global variables? We want it to
150	# always use utf-8?
151
152
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob_part_t.

    Syntax problems are not fatal; they are collected in self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # Current token, refreshed by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        self.token_type, self.token_val = self.lexer.Next()

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse what follows a [ token.

        Returns:
          A one-element list holding a CharClass if the brackets balance
          before the end of input.  Otherwise a list of Literal parts (the [
          and every token after it), and a warning is appended.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        seen = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.  Also
        # [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.  I'm
        # punting on this now because the rule isn't clear and consistent between
        # shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (id_, s) in seen:
                    literals.append(glob_part.Literal(id_, s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1
                # Depth can only reach zero right after a decrement.
                if depth == 0:
                    break  # Don't record the last ]

            seen.append((tok_id, self.token_val))

        negated = False
        if len(seen) != 0:
            first_id, _ = seen[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                seen = seen[1:]

        strs = []  # type: List[str]
        for (unused_id, s) in seen:
            strs.append(s)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole token stream.

        Returns:
          The list of parsed glob parts
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            id_ = self.token_type
            s = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if id_ == Id.Eol_Tok:
                break

            if id_ in (Id.Glob_Star, Id.Glob_QMark):
                parts.append(glob_part.Operator(id_))

            elif id_ == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())

            else:  # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                # EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(id_, s))

                # The two tokens that deserve a warning can only show up in
                # this branch.  TODO: location info.
                if id_ == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                elif id_ == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
254
255
# Characters that get a backslash in front of them when a glob literal is
# emitted into an ERE.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'


def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Render parsed glob parts as an extended regular expression string.

    Args:
      parts: Literal, Operator, and CharClass parts, e.g. from
        _GlobParser.Parse().

    Returns:
      The regex text.  ? becomes '.', * becomes '.*', literals are escaped as
      needed, and character classes are copied through with two adjustments
      described in the comments below.

    Raises:
      AssertionError: on a Literal or Operator id that isn't handled here.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside one it is a literal, so it is
            # escaped here like any other regex metacharacter.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because we
            # assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But we
            #   want a literal, and the POSIX way to do that is to put it at the
            #   end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
356
357
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Args:
      pat: The glob pattern text.

    Returns:
      The regex string
      A list of warnings about the glob syntax, as collected by the parser
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: If there is nothing like * ? or [abc] in 'parts', then the whole
    # string is a literal, and we could use a more efficient mechanism.  But we
    # would have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
380
381
382	# Notes for implementing extglob
383	# - libc glob() doesn't have any extension!
384	# - Nix stdenv uses !(foo) and @(foo\|bar)
385	# - can we special case these for now?
386	# - !(foo\|bar) -- change it to *, and then just do fnmatch() to filter the
387	# result!
388	# - Actually I guess we can do that for all of them. That seems fine.
389	# - But we have to get the statically parsed arg in here?
390	# - or do dynamic parsing
391	# - LooksLikeGlob() would have to respect extglob! ugh!
392	# - See 2 calls in osh/word_eval.py
393
394
class Globber(object):
    """Expands glob patterns with libc.glob(), honoring shell options."""

    def __init__(self, exec_opts):
        # type: (optview.Exec) -> None
        # Option view; the methods below query noglob, dotglob, dashglob,
        # failglob and nullglob on it.
        self.exec_opts = exec_opts

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with fnmatch(),
        #                    but not glob().
        #
        # NOTE: Bash also respects the GLOBIGNORE variable, but no other shells
        # do.  Could a default GLOBIGNORE to ignore flags on the file system be
        # part of the security solution?  It doesn't seem totally sound.

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg and append the filtered matches to 'out'.

        Returns:
          Number of items appended.

        Raises:
          RuntimeError: re-raised from libc.glob() after printing a message to
            stderr.
        """
        try:
            flags = 0
            if self.exec_opts.dotglob():
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or unknown
            # There are no syntax errors.  (But see comment about globerr() in
            # native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise

        if len(results) == 0:  # Nothing matched
            return 0

        # Omit files starting with -
        # dashglob turned OFF with shopt -s oil:upgrade.
        if not self.exec_opts.dashglob():
            tmp = [s for s in results if not s.startswith('-')]
            results = tmp  # idiom to work around mycpp limitation

        # XXX: libc's glob function can return '.' and '..', which
        # are typically not of interest. Filtering in this manner
        # is similar (but not identical) to the default bash
        # setting of 'shopt -s globskipdots'. Supporting that
        # option fully would require more than simply wrapping
        # this in an if statement.
        n = 0
        for s in results:
            if s not in ('.', '..'):
                out.append(s)
                n += 1
        return n

    def _NoMatch(self, pat, out):
        # type: (str, List[str]) -> int
        """Shared fallback for a pattern that matched nothing.

        Returns:
          -1 under failglob, 0 under nullglob (nothing appended), otherwise 1
          after appending the glob-unescaped pattern to 'out'.
        """
        if self.exec_opts.failglob():
            return -1

        if self.exec_opts.nullglob():
            return 0

        # Return the original string
        out.append(GlobUnescape(pat))
        return 1

    def Expand(self, arg, out):
        # type: (str, List[str]) -> int
        """Given a string that could be a glob, append a list of strings to
        'out'.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # we didn't glob escape it in osh/word_eval.py
            out.append(arg)
            return 1

        n = self._Glob(arg, out)
        if n:
            return n

        return self._NoMatch(arg, out)

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, then keep only the
        results that libc.fnmatch() accepts for fnmatch_pat.

        Returns:
          Number of items appended, or -1 for fatal failglob error.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(), and
            # there is extra \ escaping compared with bash and mksh.  OK for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self._Glob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        return self._NoMatch(fnmatch_pat, out)