osh/glob

OILS / osh / glob_.py View on Github | oils.pub

642 lines, 337 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7	glob_part, glob_part_e, glob_part_t,
8	loc_t)
9	from _devbuild.gen.value_asdl import value
10	from core import pyos, pyutil, error
11	from frontend import match
12	from mycpp import mylib
13	from mycpp.mylib import log, print_stderr
14	from pylib import os_path
15
16	from libc import GLOB_PERIOD
17	from _devbuild.gen.value_asdl import value_e
18	from _devbuild.gen.runtime_asdl import scope_e
19
20	from typing import Dict, List, Tuple, cast, Optional, TYPE_CHECKING
21	if TYPE_CHECKING:
22	from core import optview
23	from core import state
24	from frontend.match import SimpleLexer
25
26	_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Report whether the string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters.

    A string counts as a glob when it has an unescaped * or ?, or an unescaped
    [ that is later followed by an unescaped ].  A backslash hides the byte
    that follows it.

    TODO: Reference lib/glob / glob_pattern functions in bash
    $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open = False
    pos = 0
    length = len(s)
    while pos < length:
        byte = mylib.ByteAt(s, pos)

        if mylib.ByteEquals(byte, '\\'):
            # Skip the backslash AND the byte it escapes.
            pos += 2
            continue

        if mylib.ByteEquals(byte, '*'):
            return True
        if mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open = True
        elif seen_open and mylib.ByteEquals(byte, ']'):
            # It has at least one pair of balanced [].  Not bothering to check
            # stray [ or ].
            return True

        pos += 1

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Static-word counterpart of LooksLikeGlob.

    Only Literal parts of the word are inspected: a * or ? token makes the
    word a glob, and so does a [ token that is later followed by a ] token.
    """
    seen_open = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_open = True
        elif seen_open and tok_id == Id.Lit_RBracket:
            return True

    return False
82
83
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
#
# The string is used as a SET of bytes, so each metacharacter is listed once.
GLOB_META_CHARS = r'\*?[]-:!()|'

# Check invariant needed to escape literal \ as \@
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
94
95
def GlobEscape(s):
    # type: (str) -> str
    """Backslash-escape the glob metacharacters in a string.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.  The set of bytes that
    get escaped is GLOB_META_CHARS.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
100
101
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Used to evaluate something like $v with v='a\b.txt'

    Every \ becomes \@, which is OK because @ is not in GLOB_META_CHARS.

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally
    """
    backslash = '\\'
    marker = r'\@'
    return s.replace(backslash, marker)
119
120
# Bug fix: add [] so [[:space:]] is not special, etc.
# The string is used as a SET of bytes, so each metacharacter is listed once.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
123
124
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Backslash-escape the ERE metacharacters in a string.

    Quoted parts of a regex must match literally, e.g. [[ $a =~ "{" ]].
    I don't think libc has a function to do this.  The bytes that get escaped
    are those in ERE_META_CHARS:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
133
134
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Complicated example: 'a*b'*.py, which will be escaped to a\*b*.py.  So in
    word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Decoding rules, applied to each backslash that is NOT the last byte:
    - \ followed by a byte in GLOB_META_CHARS yields that byte
    - \@ yields a single backslash
    - \ followed by anything else raises AssertionError

    A backslash that is the very last byte is copied through unchanged.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        if mylib.ByteEquals(c, '\\') and i != n - 1:
            # TODO: GlobEscape() turns \ into \\, so a string should never end
            # with a single backslash.
            # The trailing-backslash assert was suppressed to fix bug #698 (it
            # is now the 'i != n - 1' guard above); #628 is still there.
            # Check them again.
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            elif mylib.ByteEquals(c2, '@'):
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
172
173
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash: turns \@ back into a single backslash.

    Any other backslash pair is copied through unchanged (both bytes), and so
    is a backslash that is the very last byte of the string.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        if mylib.ByteEquals(c, '\\') and i != n - 1:
            # Note: GlobEscapeBackslash() doesn't turn \ into \\, so a string
            # could end with a single backslash?  The 'i != n - 1' guard sends
            # that case to the else branch below.
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteEquals(c2, '@'):
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                unescaped.append(pyos.BACKSLASH_CH)
                unescaped.append(c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
200
201
202	# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
203	# positions of matches. So we convert globs to regexps.
204
205	# Problems:
206	# - What about unicode? Do we have to set any global variables? We want it to
207	# always use utf-8?
208
209
class _GlobParser(object):
    """Parses the tokens of a glob lexer into a list of glob_part nodes.

    Syntax problems never abort the parse; they are recorded as strings in
    self.warnings instead.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token, updating token_type and token_val."""
        tok_id, tok_val = self.lexer.Next()
        self.token_type = tok_id
        self.token_val = tok_val

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a bracket expression; the current token is its opening [.

        Returns:
          A one-element list holding a CharClass if the parse succeeds.  If
          the input ends before the brackets balance, the tokens seen so far
          are returned as Literal parts and a warning is appended.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        inner = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.  Also
        # [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.  I'm
        # punting on this now because the rule isn't clear and consistent between
        # shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (lit_id, lit_s) in inner:
                    literals.append(glob_part.Literal(lit_id, lit_s))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1
                # Depth can only reach zero right after a decrement.
                if depth == 0:
                    break  # Don't append the last ]

            inner.append((tok_id, self.token_val))

        negated = False
        if len(inner) != 0:
            first_id, _ = inner[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                inner = inner[1:]

        strs = []  # type: List[str]
        for (_, tok_s) in inner:
            strs.append(tok_s)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Consume every token from the lexer.

        Returns:
          The list of parsed glob parts
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            tok_id = self.token_type
            tok_s = self.token_val

            #log('%s %r', self.token_type, self.token_val)
            if tok_id == Id.Eol_Tok:
                break

            if tok_id == Id.Glob_Star or tok_id == Id.Glob_QMark:
                parts.append(glob_part.Operator(tok_id))
                continue

            if tok_id == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())
                continue

            # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,EscapedChar,
            #       BadBackslash}
            parts.append(glob_part.Literal(tok_id, tok_s))

            # Also check for warnings.  TODO: location info.
            if tok_id == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif tok_id == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
311
312
# Bytes that must be preceded by a backslash to match literally in an ERE.
# Used as a SET of bytes, so each one is listed once.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
314
315
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    r"""Render parsed glob parts as an extended regular expression string.

    - Literal parts are emitted with regex metacharacters backslash-escaped.
    - The ? operator becomes . and the * operator becomes .*
    - CharClass parts are emitted as bracket expressions, see below.

    Raises:
      AssertionError for a Literal or Operator with an unexpected Id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it is a literal, and
            # it gets escaped here because it is in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because we
            # assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But we
            #   want a literal, and the POSIX way to do that is to put it at
            #   the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
413
414
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Translate a glob pattern into an extended regular expression.

    The pattern is lexed with match.GlobLexer, parsed with _GlobParser, and the
    resulting parts are rendered by _GenerateERE.

    Returns:
      The regex string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # Possible optimization, not implemented: if there is nothing like * ? or
    # [abc], then the whole string is a literal, and we could use a more
    # efficient mechanism.  But we would have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    return regex, warnings
437
438
# Notes for implementing extglob
# - libc glob() doesn't have any extension!
# - Nix stdenv uses !(foo) and @(foo|bar)
# - can we special case these for now?
# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
# result!
# - Actually I guess we can do that for all of them. That seems fine.
# - But we have to get the statically parsed arg in here?
# - or do dynamic parsing
# - LooksLikeGlob() would have to respect extglob! ugh!
# - See 2 calls in osh/word_eval.py
450
451
class Globber(object):
    """Expands glob patterns with libc.glob(), honoring shell options.

    The options consulted are noglob, dotglob, no_dash_glob, failglob and
    nullglob (via exec_opts), plus the GLOBIGNORE variable (via mem).
    """

    def __init__(self, exec_opts, mem):
        # type: (optview.Exec, state.Mem) -> None
        self.exec_opts = exec_opts
        self.mem = mem
        # Cache for parsed GLOBIGNORE patterns to avoid re-parsing.  Keyed by
        # the raw GLOBIGNORE string.
        self._globignore_cache = {}  # type: Dict[str, List[str]]

        # Other unimplemented bash options:
        #
        # globstar            ** for directories
        # globasciiranges     ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob             the @() !() syntax -- libc helps us with fnmatch(),
        #                     but not glob().

    def _GetGlobIgnorePatterns(self):
        # type: () -> Optional[List[str]]
        """Get GLOBIGNORE patterns as a list, or None if not set.

        None is also returned when GLOBIGNORE is not a string or is empty.
        """

        val = self.mem.GetValue('GLOBIGNORE', scope_e.GlobalOnly)
        if val.tag() != value_e.Str:
            return None

        globignore = cast(value.Str, val).s  # type: str
        if len(globignore) == 0:
            return None

        if globignore in self._globignore_cache:
            return self._globignore_cache[globignore]

        # Split by colon to get individual patterns, but don't split colons
        # inside bracket expressions like [[:alnum:]]
        patterns = []  # type: List[str]
        current = []  # type: List[str]
        in_bracket = False

        for c in globignore:
            if c == '[':
                in_bracket = True
                current.append(c)
            elif c == ']':
                in_bracket = False
                current.append(c)
            elif c == ':' and not in_bracket:
                # Empty fields (leading, trailing or doubled colons) are
                # dropped.
                if len(current):
                    patterns.append(''.join(current))
                    del current[:]
            else:
                current.append(c)

        if len(current):
            patterns.append(''.join(current))

        self._globignore_cache[globignore] = patterns

        return patterns

    def _MatchesGlobIgnore(self, filename, patterns):
        # type: (str, List[str]) -> bool
        """Check if filename matches any GLOBIGNORE pattern.

        Filenames . and .. are always ignored when GLOBIGNORE is set.  That
        check looks at the basename; the patterns are matched against the whole
        filename.
        """
        basename = os_path.basename(filename)
        if basename in ('.', '..'):
            return True

        flags = 0

        for pattern in patterns:
            if libc.fnmatch(pattern, filename, flags):
                return True

        return False

    def _Glob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg, filter the results, and append them to out.

        Returns:
          The number of items appended to 'out'; 0 when nothing matched or
          everything was filtered away.
        Raises:
          RuntimeError from libc.glob(), re-raised after printing a message to
          stderr.
        """
        globignore_patterns = self._GetGlobIgnorePatterns()

        try:
            flags = 0
            # GLOBIGNORE enables dotglob when set to a non-null value
            if self.exec_opts.dotglob() or globignore_patterns is not None:
                # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
                # GLOB_PERIOD as 0, a no-op
                flags |= GLOB_PERIOD
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or unknown
            # There are no syntax errors.  (But see comment about globerr() in
            # native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise
        #log('glob %r -> %r', arg, g)

        n = len(results)
        if n:  # Something matched
            # Omit files starting with -
            # no_dash_glob is part of shopt --set ysh:upgrade
            if self.exec_opts.no_dash_glob():
                tmp = [s for s in results if not s.startswith('-')]
                results = tmp  # idiom to work around mycpp limitation
                n = len(results)

            if globignore_patterns is not None:
                filtered = []  # type: List[str]
                for s in results:
                    if not self._MatchesGlobIgnore(s, globignore_patterns):
                        filtered.append(s)
                results = filtered
                n = len(results)
            else:
                # XXX: libc's glob function can return '.' and '..', which
                # are typically not of interest. Filtering in this manner
                # is similar (but not identical) to the default bash
                # setting of 'setopt -s globskipdots'. Supporting that
                # option fully would require more than simply wrapping
                # this in an if statement.
                dotfile_filtered = []  # type: List[str]
                for s in results:
                    if s not in ('.', '..'):
                        dotfile_filtered.append(s)
                results = dotfile_filtered
                n = len(results)

            out.extend(results)
            return n

        return 0

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended, or -1 when glob expansion did not happen.
          0 is returned when nothing matched and shopt -s nullglob.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        n = self._Glob(arg, out)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob() with glob_pat, filter by fnmatch_pat.

        Returns:
          Number of items appended to 'out', or -1 when nothing matched and
          shopt -s failglob.  With noglob, or when nothing matched and neither
          failglob nor nullglob is set, a single pattern string is appended and
          1 is returned.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(), and
            # there is extra \ escaping compared with bash and mksh.  OK for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self._Glob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0
        else:
            # Same caveat as the noglob case: the pattern handed back is the
            # fnmatch form, here with its glob escaping removed.
            out.append(GlobUnescape(fnmatch_pat))
            return 1