osh/glob

OILS / osh / glob_.py View on Github | oils.pub

677 lines, 335 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7	glob_part, glob_part_e, glob_part_t,
8	loc, loc_t)
9	from _devbuild.gen.value_asdl import value
10	from core import pyos, pyutil, error
11	from frontend import match
12	from mycpp import mylib
13	from mycpp.mylib import log
14	from pylib import os_path
15
16	from libc import GLOB_PERIOD
17	from _devbuild.gen.value_asdl import value_e
18	from _devbuild.gen.runtime_asdl import scope_e
19
20	from typing import Dict, List, Tuple, cast, Optional, TYPE_CHECKING
21	if TYPE_CHECKING:
22	from core import optview
23	from core import state
24	from frontend.match import SimpleLexer
25
26	_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Return True if the string contains unescaped glob metacharacters.

    Like other shells, OSH avoids calls to glob() unless there are glob
    metacharacters, so this serves as a cheap pre-check.

    A string "looks like a glob" when it contains an unescaped * or ?, or an
    unescaped [ that is later followed by an unescaped ].  A backslash hides
    the byte immediately after it.

    TODO: Reference lib/glob / glob_pattern functions in bash
      $ grep glob_pattern lib/glob/*

    Per the original notes, this is used for glob expansion and for choosing
    the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_lbracket = False
    pos = 0
    end = len(s)
    while pos < end:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # Skip the escaped byte; it can't act as a metacharacter.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*') or mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_lbracket = True
            continue

        if seen_lbracket and mylib.ByteEquals(byte, ']'):
            # It has at least one pair of balanced [].  Not bothering to check
            # stray [ or ].
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Like LooksLikeGlob, but for static words.

    Only the Literal parts of the word are inspected.  The word looks like a
    glob when one of them is a * or ? token, or when a [ token is followed
    later by a ] token.
    """
    seen_lbracket = False
    for part in w.parts:
        if part.tag() != word_part_e.Literal:
            continue  # non-literal parts never contribute glob operators

        tok_id = cast(Token, part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            seen_lbracket = True
        elif tok_id == Id.Lit_RBracket and seen_lbracket:
            return True

    return False
82
83
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
#       underlying library doesn't support extended globs
#       we don't need to escape the @ in @(cc), because escaping ( is enough
#
# This string is used as a SET of characters, so each one appears once.
GLOB_META_CHARS = r'\*?[]-:!()|'

# Check invariant needed to escape literal \ as \@
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
94
95
def GlobEscape(s):
    # type: (str) -> str
    """Glob-escape a string that must be matched literally.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral.

    Delegates to pyutil.BackslashEscape(), with GLOB_META_CHARS as the set of
    characters to escape.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
100
101
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Used to evaluate something like $v with v='a\b.txt'

    Every \ in s is rewritten as the two bytes \@, which is OK because @ is
    not in GLOB_META_CHARS.

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally
    """
    escaped = s.replace('\\', r'\@')
    return escaped
119
120
# Characters that are backslash-escaped by ExtendedRegexEscape().  Used as a
# SET of characters, so each one appears once.
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
123
124
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Regex-escape a quoted part, e.g. the "{" in [[ $a =~ "{" ]].

    I don't think libc has a function to do this.  The characters in
    ERE_META_CHARS are escaped via pyutil.BackslashEscape().  See:

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    result = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return result
133
134
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.

    - A backslash followed by a byte in GLOB_META_CHARS yields just that byte,
      e.g. \* becomes *
    - \@ yields pyos.BACKSLASH_CH (the escape written by
      GlobEscapeBackslash)
    - A backslash that is the LAST byte of s is copied through unchanged
    - Everything else is copied through unchanged

    TODO: Can probably get rid of this, as long as you save the original word.
    In word_eval _JoinElideEscape and EvalWordToString you have to build two
    'parallel' strings -- one escaped and one not.

    Raises:
      AssertionError if a backslash is followed by a byte that is neither a
      glob metacharacter nor @.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # TODO: GlobEscape() turns \ into \\, so a string should never end
        # with a single backslash.  The trailing-backslash assertion was
        # suppressed to fix bug #698 (#628 is still there), so a trailing
        # backslash is deliberately tolerated: 'i != n - 1' sends it to the
        # literal branch below.  Check those bugs again.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            elif mylib.ByteEquals(c2, '@'):
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
172
173
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash - turns \@ back into a backslash.

    - \@ yields a single pyos.BACKSLASH_CH
    - A backslash followed by any other byte is copied through as both bytes
    - A backslash that is the LAST byte of s is copied through unchanged
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # Note: GlobEscapeBackslash() doesn't turn \ into \\, so a string
        # could end with a single backslash?  'i != n - 1' sends that case to
        # the literal branch below.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            # The backslash is emitted in both cases; only the @ marker is
            # dropped.
            unescaped.append(pyos.BACKSLASH_CH)
            if not mylib.ByteEquals(c2, '@'):
                unescaped.append(c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
200
201
202	# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
203	# positions of matches. So we convert globs to regexps.
204
205	# Problems:
206	# - What about unicode? Do we have to set any global variables? We want it to
207	# always use utf-8?
208
209
class _GlobParser(object):
    """Parses the tokens of a glob pattern into a list of glob_part_t.

    The current token is kept in self.token_type / self.token_val, and
    syntax warnings accumulate in self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance to the next token from the lexer."""
        tok_id, tok_str = self.lexer.Next()
        self.token_type = tok_id
        self.token_val = tok_str

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a character class; the current token is the opening [.

        Returns:
          A one-element list with a CharClass if the parse succeeds.  If the
          input ends before the brackets balance, returns the tokens seen so
          far as Literals (starting with the [), and appends a warning.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # We already saw a [
        seen = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while depth != 0:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (lit_id, lit_str) in seen:
                    literals.append(glob_part.Literal(lit_id, lit_str))
                return literals

            if tok_id == Id.Glob_LBracket:
                depth += 1
            elif tok_id == Id.Glob_RBracket:
                depth -= 1

            if depth != 0:
                # Don't append the last ]
                seen.append((tok_id, self.token_val))

        negated = False
        if len(seen):
            first_id, _ = seen[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id in (Id.Glob_Bang, Id.Glob_Caret):
                negated = True
                seen = seen[1:]

        strs = []  # type: List[str]
        for (_, tok_str) in seen:
            strs.append(tok_str)
        return [glob_part.CharClass(negated, strs)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Parse the whole pattern.

        Returns:
          The list of parsed glob parts
          A list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        self._Next()
        while self.token_type != Id.Eol_Tok:
            id_ = self.token_type
            s = self.token_val
            #log('%s %r', self.token_type, self.token_val)

            if id_ == Id.Glob_LBracket:
                # Could return a Literal or a CharClass
                parts.extend(self._ParseCharClass())

            elif id_ in (Id.Glob_Star, Id.Glob_QMark):
                parts.append(glob_part.Operator(id_))

            else:
                # Glob_{Bang,Caret,CleanLiterals,OtherLiteral,RBracket,
                # EscapedChar,BadBackslash}
                parts.append(glob_part.Literal(id_, s))

                # Also check for warnings.  TODO: location info.
                if id_ == Id.Glob_RBracket:
                    self.warnings.append('Got unescaped right bracket')
                if id_ == Id.Glob_BadBackslash:
                    self.warnings.append('Got unescaped trailing backslash')

            self._Next()

        return parts, self.warnings
311
312
# Characters that _GenerateERE() must backslash-escape when they appear as
# glob literals.  Used as a SET of characters.
_REGEX_CHARS_TO_ESCAPE = '.|^$()+*?[]{}\\'
314
315
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Translate parsed glob parts into an extended regular expression.

    - Literal parts are emitted as-is, with a backslash added before any
      character in _REGEX_CHARS_TO_ESCAPE
    - The ? operator becomes .  and the * operator becomes .*
    - CharClass parts are emitted as [...] or [^...], with their contents
      preserved except for the two exceptions described below

    Raises:
      AssertionError on a Literal id or Operator id this function doesn't
      know about.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it's a literal, and
            # it gets escaped here because ^ is in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
413
414
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    The pattern is lexed with match.GlobLexer(), parsed by _GlobParser, and
    the resulting parts are translated by _GenerateERE().

    Returns:
      The regex string
      A list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # NOTE: if there is nothing like * ? or [abc], then the whole string is a
    # literal, and we could use a more efficient mechanism.  But we would
    # have to DEQUOTE before doing that.

    if 0:  # debugging aid
        log('GlobToERE()')
        for part in parts:
            log(' %s', part)

    regex = _GenerateERE(parts)
    #log('pat %s -> regex %s', pat, regex)
    return regex, warnings
437
438
# Notes for implementing extglob
# - libc glob() doesn't have any extension!
# - Nix stdenv uses !(foo) and @(foo|bar)
#   - can we special case these for now?
# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
#   result!
#   - Actually I guess we can do that for all of them.  That seems fine.
#   - But we have to get the statically parsed arg in here?
#     - or do dynamic parsing
#   - LooksLikeGlob() would have to respect extglob!  ugh!
#     - See 2 calls in osh/word_eval.py
450
451
def _StringMatchesAnyPattern(s, patterns):
    # type: (str, List[str]) -> bool
    """Return True if s matches at least one pattern in the list.

    Each pattern is tried with libc.fnmatch() and flags 0; an empty list
    never matches.

    NOTE: . and .. get no special treatment here.  A caller that must always
    drop them has to filter them itself.
    """
    for pat in patterns:
        matched = libc.fnmatch(pat, s, 0)
        if matched:
            return True
    return False
465
466
class Globber(object):
    """Performs glob expansion with libc glob(), applying shell options.

    Reads options from exec_opts (noglob, dotglob, failglob, nullglob,
    no_dash_glob, globskipdots) and the GLOBIGNORE variable from mem.
    """

    def __init__(self, exec_opts, mem):
        # type: (optview.Exec, state.Mem) -> None
        self.exec_opts = exec_opts
        self.mem = mem
        # Cache for parsed GLOBIGNORE patterns to avoid re-parsing
        self._globignore_cache = {}  # type: Dict[str, List[str]]

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by
        #                    default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with
        #                    fnmatch(), but not glob().

    def _GetGlobIgnorePatterns(self):
        # type: () -> Optional[List[str]]
        """Get GLOBIGNORE patterns as a list, or None if not set.

        None is also returned when GLOBIGNORE is not a string, or is the
        empty string.  Parsed lists are cached by the raw GLOBIGNORE value.
        """
        val = self.mem.GetValue('GLOBIGNORE', scope_e.GlobalOnly)
        if val.tag() != value_e.Str:
            return None

        globignore = cast(value.Str, val).s  # type: str
        if len(globignore) == 0:
            return None

        if globignore in self._globignore_cache:
            return self._globignore_cache[globignore]

        # Split by colon to get individual patterns, but don't split colons
        # inside bracket expressions like [[:alnum:]]
        patterns = []  # type: List[str]
        current = []  # type: List[str]
        in_bracket = False

        for c in globignore:
            if c == '[':
                in_bracket = True
                current.append(c)
            elif c == ']':
                in_bracket = False
                current.append(c)
            elif c == ':' and not in_bracket:
                # Empty patterns, e.g. from '::', are dropped
                if len(current):
                    patterns.append(''.join(current))
                    del current[:]
            else:
                current.append(c)

        if len(current):
            patterns.append(''.join(current))

        self._globignore_cache[globignore] = patterns

        return patterns

    def _LibcGlob(self, arg, flags, blame_loc):
        # type: (str, int, loc_t) -> List[str]
        """Call libc.glob(), turning its RuntimeError into error.Structured.

        Raises:
          error.Structured with status error.CODEC_STATUS, blamed on
          blame_loc.
        """
        try:
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # Rare glob errors, like GLOB_NOSPACE
            # Note: dash has a fatal sh_error() on GLOB_NOSPACE

            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            raise error.Structured(error.CODEC_STATUS, msg, blame_loc)
        return results

    def DoLibcGlob(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> None
        """For the io.libcGlob() API

        Appends the results of libc.glob(arg, 0) to 'out', with no filtering.

        Raises:
          error.Structured on a libc glob error.
        """
        results = self._LibcGlob(arg, 0, blame_loc)
        out.extend(results)

    def DoShellGlob(self, arg, out, blame_loc=loc.Missing):
        # type: (str, List[str], loc_t) -> int
        """For word evaluation and the io.glob() API

        Respects these filters:
        - GLOBIGNORE
        - dotglob turns into C GLOB_PERIOD
        - no_dash_glob
        - globskipdots

        But NOT these; they are done at a higher level
        - noglob
        - failglob
        - nullglob - ditto

        Returns:
          The number of items appended to 'out' after filtering.  0 means
          nothing matched, or everything was filtered out.
        Raises:
          error.Structured on a libc glob error.

        TODO:
        - ysh globbing should not respect globals like GLOBIGNORE?
          - only no_dash_glob by default?
        - split into two functions:
          - compatible io.glob()
          - controlled io.libcGlob()
        """
        globignore_patterns = self._GetGlobIgnorePatterns()

        flags = 0
        # shopt -u dotglob (default): echo * does not return say .gitignore
        # If GLOBIGNORE is set, then dotglob is NOT respected - we return ..
        if self.exec_opts.dotglob() or globignore_patterns is not None:
            # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
            # GLOB_PERIOD as 0, a no-op
            flags |= GLOB_PERIOD

        results = self._LibcGlob(arg, flags, blame_loc)
        #log('glob %r -> %r', arg, results)

        if len(results) == 0:
            return 0  # nothing matched

        # Something matched

        if globignore_patterns is not None:  # Handle GLOBIGNORE
            # When GLOBIGNORE is set, bash doesn't respect shopt -u
            # globskipdots!  The entries . and .. are skipped, even if they
            # do NOT match GLOBIGNORE
            tmp = [
                s for s in results
                if not _StringMatchesAnyPattern(s, globignore_patterns) and
                os_path.basename(s) not in ('.', '..')
            ]
            results = tmp  # idiom to work around mycpp limitation

        else:  # Do filtering that's NOT GLOBIGNORE
            # no_dash_glob: Omit files starting with -
            # (part of shopt --set ysh:upgrade)
            if self.exec_opts.no_dash_glob():
                tmp = [s for s in results if not s.startswith('-')]
                results = tmp

            # globskipdots: Remove . and .. entries returned by libc.
            if self.exec_opts.globskipdots():
                tmp = [s for s in results if s not in ('.', '..')]
                results = tmp

        out.extend(results)
        return len(results)

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended, or -1 when glob expansion did not happen.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
          error.Structured on a libc glob error
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        # Pass blame_loc so that a libc glob error points at the word
        n = self.DoShellGlob(arg, out, blame_loc)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with glob_pat, filter by fnmatch_pat.

        Returns:
          The number of items appended, or -1 when glob expansion did not
          happen.  Unlike Expand(), -1 is also returned when nothing matched
          and failglob is on; no exception is raised here.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self.DoShellGlob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0

        # Expand to fnmatch_pat, as above
        out.append(GlobUnescape(fnmatch_pat))
        return 1