osh/glob

OILS / osh / glob_.py View on Github | oils.pub

652 lines, 330 significant

1	"""Glob_.py."""
2
3	import libc
4
5	from _devbuild.gen.id_kind_asdl import Id, Id_t
6	from _devbuild.gen.syntax_asdl import (CompoundWord, Token, word_part_e,
7	glob_part, glob_part_e, glob_part_t,
8	loc_t)
9	from _devbuild.gen.value_asdl import value
10	from core import pyos, pyutil, error
11	from frontend import match
12	from mycpp import mylib
13	from mycpp.mylib import log, print_stderr
14	from pylib import os_path
15
16	from libc import GLOB_PERIOD
17	from _devbuild.gen.value_asdl import value_e
18	from _devbuild.gen.runtime_asdl import scope_e
19
20	from typing import Dict, List, Tuple, cast, Optional, TYPE_CHECKING
21	if TYPE_CHECKING:
22	from core import optview
23	from core import state
24	from frontend.match import SimpleLexer
25
26	_ = log
27
28
def LooksLikeGlob(s):
    # type: (str) -> bool
    """Cheap syntactic check: could this string be a glob pattern?

    Like other shells, OSH avoids calling glob() unless the string contains
    glob metacharacters.  The scan is byte-oriented:

    - a backslash causes the byte after it to be skipped
    - an unescaped * or ? answers True immediately
    - an unescaped ] answers True if an unescaped [ was seen earlier; stray
      brackets are not checked beyond that

    TODO: Reference lib/glob / glob_pattern functions in bash
      $ grep glob_pattern lib/glob/*

    Used:
    1. in Globber below
    2. for the slow path / fast path of prefix/suffix/patsub ops.
    """
    seen_open = False
    length = len(s)
    pos = 0
    while pos < length:
        byte = mylib.ByteAt(s, pos)
        pos += 1

        if mylib.ByteEquals(byte, '\\'):
            # The next byte is escaped, so it can't be a metacharacter.
            pos += 1
            continue

        if mylib.ByteEquals(byte, '*'):
            return True
        if mylib.ByteEquals(byte, '?'):
            return True

        if mylib.ByteEquals(byte, '['):
            seen_open = True
            continue

        if seen_open and mylib.ByteEquals(byte, ']'):
            # At least one [ precedes this ], which is good enough.
            return True

    return False
65
66
def LooksLikeStaticGlob(w):
    # type: (CompoundWord) -> bool
    """Token-level counterpart of LooksLikeGlob, for static words.

    Only Literal parts are inspected.  A Lit_Star or Lit_QMark token answers
    True, and so does a Lit_RBracket that comes after a Lit_LBracket.
    """
    saw_lbracket = False
    for word_part in w.parts:
        if word_part.tag() != word_part_e.Literal:
            continue  # non-literal parts never contribute glob operators here

        tok_id = cast(Token, word_part).id
        if tok_id == Id.Lit_Star or tok_id == Id.Lit_QMark:
            return True

        if tok_id == Id.Lit_LBracket:
            saw_lbracket = True
        elif tok_id == Id.Lit_RBracket and saw_lbracket:
            return True

    return False
82
83
# Glob Helpers for WordParts.
# NOTE: Escaping / doesn't work, because it's not a filename character.
# ! : - are metachars within character classes
# ( ) | are extended glob characters, and it's OK to add extra \ when the
# underlying library doesn't support extended globs
# we don't need to escape the @ in @(cc), because escaping ( is enough
#
# This string is used as a SET of characters (membership tests only).
GLOB_META_CHARS = r'\*?[]-:!()|'

# Check invariant needed to escape literal \ as \@
# (raw string: '\@' is not a valid escape sequence in a normal literal)
assert '@' not in GLOB_META_CHARS, r'\@ is used to escape backslash'
94
95
def GlobEscape(s):
    # type: (str) -> str
    """Escape a string so it is not treated as glob syntax.

    For SingleQuoted, DoubleQuoted, and EscapedLiteral parts.  Delegates to
    pyutil.BackslashEscape(), passing GLOB_META_CHARS as the characters to
    escape.
    """
    escaped = pyutil.BackslashEscape(s, GLOB_META_CHARS)
    return escaped
100
101
def GlobEscapeBackslash(s):
    # type: (str) -> str
    r"""Glob escape a string for an unquoted var sub.

    Used to evaluate something like $v with v='a\b.txt'

    We escape \ as \@, which is OK because @ is not in GLOB_META_CHARS.

    See test cases in spec/glob.test.sh

    - If globbing is performed, then \* evaluates to literal '*'
      - that is, \ is an escape for the *
    - If globbing is NOT performed (set -o noglob or no matching files), then
      \* evaluates to '\*'
      - that is, the \ is preserved literally

    Args:
      s: the string to escape

    Returns:
      A copy of s in which every backslash is replaced by the two characters
      backslash + '@'.  No other character is changed.
    """
    # NOTE: the docstring is a raw string on purpose.  Otherwise '\b' above
    # would be a backspace character, and '\@' / '\*' are invalid escapes.
    return s.replace('\\', r'\@')
119
120
# Characters that ExtendedRegexEscape() backslash-escapes.  Used as a SET.
# Bug fix: add [] so [[:space:]] is not special, etc.
ERE_META_CHARS = r'\?*+{}^$.()|[]'
123
124
def ExtendedRegexEscape(s):
    # type: (str) -> str
    """Escape a string for literal use inside an extended regex (ERE).

    Quoted parts of a regex operand must be regex-escaped, e.g. the "{" in
    [[ $a =~ "{" ]].  There does not appear to be a libc function for this,
    so we delegate to pyutil.BackslashEscape() with ERE_META_CHARS.

    https://www.gnu.org/software/sed/manual/html_node/ERE-syntax.html
    """
    escaped = pyutil.BackslashEscape(s, ERE_META_CHARS)
    return escaped
133
134
def GlobUnescape(s):
    # type: (str) -> str
    r"""Remove glob escaping from a string.

    Used when there is no glob match.
    TODO: Can probably get rid of this, as long as you save the original word.

    Example: a quoted '*' is escaped to \* before globbing, so when nothing
    matches, the \* has to be turned back into *.  (The alternative is for
    word_eval _JoinElideEscape and EvalWordToString to build two 'parallel'
    strings -- one escaped and one not.)

    Byte-by-byte rules:
    - backslash + a byte in GLOB_META_CHARS  ->  that byte
    - backslash + '@'                        ->  a single backslash
    - a backslash that is the LAST byte      ->  copied through unchanged
    - any other byte                         ->  copied through unchanged

    Raises:
      AssertionError: if a backslash is followed by a byte that is neither in
        GLOB_META_CHARS nor '@'.
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        # TODO: GlobEscape() turns \ into \\, so a string should never end
        # with a single backslash.  The 'i != n - 1' guard tolerates it anyway
        # (fix for bug #698); #628 is still there.  Check them again.
        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            if mylib.ByteInSet(c2, GLOB_META_CHARS):
                unescaped.append(c2)
            elif mylib.ByteEquals(c2, '@'):
                unescaped.append(pyos.BACKSLASH_CH)
            else:
                raise AssertionError("Unexpected escaped character %r" % c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
172
173
def GlobUnescapeBackslash(s):
    # type: (str) -> str
    r"""Inverse of GlobEscapeBackslash - turns \@ back into \

    Byte-by-byte rules:
    - backslash + '@'             ->  a single backslash
    - backslash + any other byte  ->  both bytes, unchanged
    - a backslash that is the LAST byte is copied through unchanged
      (GlobEscapeBackslash() doesn't turn \ into \\, so the input may
      legitimately end with a single backslash)
    """
    unescaped = []  # type: List[int]
    i = 0
    n = len(s)
    while i < n:
        c = mylib.ByteAt(s, i)

        if mylib.ByteEquals(c, '\\') and i != n - 1:
            i += 1
            c2 = mylib.ByteAt(s, i)

            # The backslash is emitted either way; only the '@' marker that
            # GlobEscapeBackslash() added is dropped.
            unescaped.append(pyos.BACKSLASH_CH)
            if not mylib.ByteEquals(c2, '@'):
                unescaped.append(c2)
        else:
            unescaped.append(c)
        i += 1
    return mylib.JoinBytes(unescaped)
200
201
202	# For ${x//foo*/y}, we need to glob patterns, but fnmatch doesn't give you the
203	# positions of matches. So we convert globs to regexps.
204
205	# Problems:
206	# - What about unicode? Do we have to set any global variables? We want it to
207	# always use utf-8?
208
209
class _GlobParser(object):
    """Turns the token stream of a glob lexer into a list of glob_part_t.

    Syntax problems never raise; they are recorded as strings in
    self.warnings.
    """

    def __init__(self, lexer):
        # type: (SimpleLexer) -> None
        self.lexer = lexer
        # The current token, updated by _Next()
        self.token_type = Id.Undefined_Tok
        self.token_val = ''
        self.warnings = []  # type: List[str]

    def _Next(self):
        # type: () -> None
        """Advance: pull the next (type, value) pair from the lexer."""
        self.token_type, self.token_val = self.lexer.Next()

    def _ParseCharClass(self):
        # type: () -> List[glob_part_t]
        """Parse a bracket expression; the current token is the opening [.

        Returns:
          A one-element list holding a CharClass if a balancing ] is found.
          Otherwise, a list of Literal parts (the [ and every token after
          it), and a warning is appended to self.warnings.
        """
        opening = glob_part.Literal(self.token_type, self.token_val)
        depth = 1  # the [ we were called on
        seen = []  # type: List[Tuple[Id_t, str]]

        # NOTE: There is a special rule where []] and [[] are valid globs.
        # Also [^[] and sometimes [^]], although that one is ambiguous!
        # And [[:space:]] and [[.class.]] has to be taken into account too.
        # I'm punting on this now because the rule isn't clear and consistent
        # between shells.

        while True:
            self._Next()
            tok_id = self.token_type

            if tok_id == Id.Eol_Tok:
                # Ran out of input before the brackets balanced.
                # TODO: location info
                self.warnings.append(
                    'Malformed character class; treating as literal')
                literals = [opening]  # type: List[glob_part_t]
                for (lit_id, lit_s) in seen:
                    literals.append(glob_part.Literal(lit_id, lit_s))
                return literals

            if tok_id == Id.Glob_RBracket:
                depth -= 1
                if depth == 0:
                    break  # Don't record the closing ]
            elif tok_id == Id.Glob_LBracket:
                depth += 1

            seen.append((tok_id, self.token_val))

        negated = False
        if len(seen):
            first_id, unused_s = seen[0]
            # NOTE: Both ! and ^ work for negation in globs
            # https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
            # TODO: Warn about the one that's not recommended?
            if first_id == Id.Glob_Bang or first_id == Id.Glob_Caret:
                negated = True
                seen = seen[1:]

        members = []  # type: List[str]
        for (unused_id, text) in seen:
            members.append(text)
        return [glob_part.CharClass(negated, members)]

    def Parse(self):
        # type: () -> Tuple[List[glob_part_t], List[str]]
        """Consume the whole token stream.

        Returns:
          A tuple of
          - the parsed parts (Operator, CharClass, or Literal)
          - self.warnings, the list of warnings about the syntax
        """
        parts = []  # type: List[glob_part_t]

        while True:
            self._Next()
            tok_id = self.token_type

            #log('%s %r', self.token_type, self.token_val)
            if tok_id == Id.Eol_Tok:
                break

            if tok_id == Id.Glob_LBracket:
                # Could yield a CharClass or a run of Literals
                parts.extend(self._ParseCharClass())
                continue

            if tok_id == Id.Glob_Star or tok_id == Id.Glob_QMark:
                parts.append(glob_part.Operator(tok_id))
                continue

            # Everything else is literal: Glob_{Bang,Caret,CleanLiterals,
            # OtherLiteral,RBracket,EscapedChar,BadBackslash}
            parts.append(glob_part.Literal(tok_id, self.token_val))

            # Two of those deserve a warning.  TODO: location info.
            if tok_id == Id.Glob_RBracket:
                self.warnings.append('Got unescaped right bracket')
            elif tok_id == Id.Glob_BadBackslash:
                self.warnings.append('Got unescaped trailing backslash')

        return parts, self.warnings
311
312
313	_REGEX_CHARS_TO_ESCAPE = '.\|^$()+*?[]{}\\'
314
315
def _GenerateERE(parts):
    # type: (List[glob_part_t]) -> str
    """Render parsed glob parts as an extended regular expression string.

    Args:
      parts: glob parts, as returned by _GlobParser.Parse()

    Returns:
      The regex, built by concatenating the translation of each part:
      - Operator ? becomes '.', and * becomes '.*'
      - Literal text is copied, with a backslash added in front of any
        character in _REGEX_CHARS_TO_ESCAPE
      - CharClass is copied between [ and ], see the comment below

    Raises:
      AssertionError: on a Literal or Operator with an unexpected Id.
    """
    out = []  # type: List[str]

    for part in parts:
        tag = part.tag()
        UP_part = part

        if tag == glob_part_e.Literal:
            part = cast(glob_part.Literal, UP_part)
            if part.id == Id.Glob_EscapedChar:
                assert len(part.s) == 2, part.s
                # The user could have escaped a char that doesn't need regex
                # escaping, like \b or something.
                c = part.s[1]
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # ! is only for char class
            elif part.id in (Id.Glob_CleanLiterals, Id.Glob_Bang):
                out.append(part.s)  # e.g. 'py' doesn't need to be escaped

            # ^ is only for char class.  Outside of one it's a literal, and
            # it gets a backslash because it is in _REGEX_CHARS_TO_ESCAPE.
            elif part.id in (Id.Glob_OtherLiteral, Id.Glob_Caret):
                assert len(part.s) == 1, part.s
                c = part.s
                if c in _REGEX_CHARS_TO_ESCAPE:
                    out.append('\\')
                out.append(c)

            # These are UNMATCHED ones not parsed in a glob class
            elif part.id == Id.Glob_LBracket:
                out.append('\\[')

            elif part.id == Id.Glob_RBracket:
                out.append('\\]')

            elif part.id == Id.Glob_BadBackslash:
                out.append('\\\\')

            else:
                raise AssertionError(part.id)

        elif tag == glob_part_e.Operator:
            part = cast(glob_part.Operator, UP_part)
            if part.op_id == Id.Glob_QMark:
                out.append('.')
            elif part.op_id == Id.Glob_Star:
                out.append('.*')
            else:
                raise AssertionError()

        elif tag == glob_part_e.CharClass:
            part = cast(glob_part.CharClass, UP_part)
            out.append('[')
            if part.negated:
                out.append('^')

            # Important: the character class is LITERALLY preserved, because
            # we assume glob char classes are EXACTLY the same as regex char
            # classes, including the escaping rules.
            #
            # TWO WEIRD EXCEPTIONS:
            # \- is moved to the end as '-'.
            #   In GNU libc, [0\-9] ODDLY has a range starting with \ !  But
            #   we want a literal, and the POSIX way to do that is to put it
            #   at the end.
            # \] is moved to the FRONT as ]

            good = []  # type: List[str]

            literal_hyphen = False
            literal_rbracket = False

            for s in part.strs:
                if s == '\\-':
                    literal_hyphen = True
                    continue
                if s == '\\]':
                    literal_rbracket = True
                    continue
                good.append(s)

            if literal_rbracket:
                out.append(']')

            out.extend(good)

            if literal_hyphen:
                out.append('-')

            out.append(']')

    return ''.join(out)
413
414
def GlobToERE(pat):
    # type: (str) -> Tuple[str, List[str]]
    """Convert a glob pattern to an extended regular expression.

    Lexes pat with match.GlobLexer, parses it with _GlobParser, and renders
    the result with _GenerateERE().

    Args:
      pat: the glob pattern

    Returns:
      A tuple of
      - the regex string
      - a list of warnings about the glob syntax
    """
    lexer = match.GlobLexer(pat)
    p = _GlobParser(lexer)
    parts, warnings = p.Parse()

    # Possible optimization (not implemented): if there is nothing like * ?
    # or [abc], then the whole string is a literal, and we could use a more
    # efficient mechanism.  But we would have to DEQUOTE before doing that.

    regex = _GenerateERE(parts)
    return regex, warnings
437
438
# Notes for implementing extglob
# - libc glob() doesn't have any extension!
# - Nix stdenv uses !(foo) and @(foo|bar)
#   - can we special case these for now?
# - !(foo|bar) -- change it to *, and then just do fnmatch() to filter the
#   result!
#   - Actually I guess we can do that for all of them.  That seems fine.
#   - But we have to get the statically parsed arg in here?
#     - or do dynamic parsing
#   - LooksLikeGlob() would have to respect extglob!  ugh!
#   - See 2 calls in osh/word_eval.py
450
451
def _StringMatchesAnyPattern(s, patterns):
    # type: (str, List[str]) -> bool
    """Decide whether s should be filtered out by a list of patterns.

    Returns:
      True if the basename of s is . or .. (these are always filtered when
      GLOBIGNORE is set), or if libc.fnmatch() matches the whole of s against
      any of the patterns.  False otherwise.
    """
    last_component = os_path.basename(s)
    if last_component == '.' or last_component == '..':
        return True

    fnmatch_flags = 0
    for pat in patterns:
        matched = libc.fnmatch(pat, s, fnmatch_flags)
        if matched:
            return True

    return False
469
470
class Globber(object):
    """Performs filename expansion, honoring shell options and GLOBIGNORE."""

    def __init__(self, exec_opts, mem):
        # type: (optview.Exec, state.Mem) -> None
        self.exec_opts = exec_opts
        self.mem = mem
        # Cache for parsed GLOBIGNORE patterns to avoid re-parsing.  Keyed by
        # the raw value of the variable.
        self._globignore_cache = {}  # type: Dict[str, List[str]]

        # Other unimplemented bash options:
        #
        # globstar           ** for directories
        # globasciiranges    ascii or unicode char classes (unicode by default)
        # nocaseglob
        # extglob            the @() !() syntax -- libc helps us with
        #                    fnmatch(), but not glob().

    def _GetGlobIgnorePatterns(self):
        # type: () -> Optional[List[str]]
        """Get GLOBIGNORE patterns as a list, or None if not set.

        None is also returned when GLOBIGNORE is not a string, or is the
        empty string.
        """

        val = self.mem.GetValue('GLOBIGNORE', scope_e.GlobalOnly)
        if val.tag() != value_e.Str:
            return None

        globignore = cast(value.Str, val).s  # type: str
        if len(globignore) == 0:
            return None

        if globignore in self._globignore_cache:
            return self._globignore_cache[globignore]

        # Split by colon to get individual patterns, but don't split colons
        # inside bracket expressions like [[:alnum:]]
        patterns = []  # type: List[str]
        current = []  # type: List[str]
        in_bracket = False

        for c in globignore:
            if c == '[':
                in_bracket = True
                current.append(c)
            elif c == ']':
                in_bracket = False
                current.append(c)
            elif c == ':' and not in_bracket:
                # Separator.  Empty patterns (e.g. from '::') are dropped.
                if len(current):
                    patterns.append(''.join(current))
                    del current[:]
            else:
                current.append(c)

        if len(current):
            patterns.append(''.join(current))

        self._globignore_cache[globignore] = patterns

        return patterns

    def DoGlob(self, arg, out):
        # type: (str, List[str]) -> int
        """Run libc.glob() on arg, filter the results, and append to out.

        Respects:
        - GLOBIGNORE
        - dotglob
        - no_dash_glob

        But NOT
        - noglob - done at the word level
        - nullglob - ditto

        Args:
          arg: the glob pattern
          out: list that surviving results are appended to

        Returns:
          The number of items appended to out.  0 if libc.glob() returned
          nothing.

        Raises:
          RuntimeError: re-raised from libc.glob(), after printing a message
            to stderr.

        TODO:
        - ysh globbing should not respect globals like GLOBIGNORE?
          - only no_dash_glob by default?
        - split into pure io.glob() and legacyGlob() function?
          - this respects GLOBIGNORE
        """
        globignore_patterns = self._GetGlobIgnorePatterns()

        flags = 0
        # shopt -u dotglob (default): echo * does not return say .gitignore
        # If GLOBIGNORE is set, then dotglob is NOT consulted: GLOB_PERIOD is
        # always passed, and the results are filtered below instead.
        if self.exec_opts.dotglob() or globignore_patterns is not None:
            # If HAVE_GLOB_PERIOD is false, then ./configure stubs out
            # GLOB_PERIOD as 0, a no-op
            flags |= GLOB_PERIOD

        try:
            results = libc.glob(arg, flags)
        except RuntimeError as e:
            # These errors should be rare: I/O error, out of memory, or
            # unknown.  There are no syntax errors.  (But see comment about
            # globerr() in native/libc.c.)
            # note: MyPy doesn't know RuntimeError has e.message (and e.args)
            msg = e.message  # type: str
            print_stderr("Error expanding glob %r: %s" % (arg, msg))
            raise

        if len(results):  # Something matched
            # Omit files starting with -
            # no_dash_glob is part of shopt --set ysh:upgrade
            if self.exec_opts.no_dash_glob():
                tmp = [s for s in results if not s.startswith('-')]
                results = tmp  # idiom to work around mycpp limitation

            if globignore_patterns is not None:
                # Handle GLOBIGNORE
                tmp = [
                    s for s in results
                    if not _StringMatchesAnyPattern(s, globignore_patterns)
                ]

                results = tmp  # idiom to work around mycpp limitation
            else:
                # Remove . and .. entries returned by libc.
                # This is 'shopt -s globskipdots'.  TODO: support it fully?
                tmp = [s for s in results if not s in ('.', '..')]
                results = tmp  # idiom to work around mycpp limitation

            out.extend(results)
            return len(results)

        return 0

    def Expand(self, arg, out, blame_loc):
        # type: (str, List[str], loc_t) -> int
        """Given a string that MAY be a glob, perform glob expansion

        If files on disk match the glob pattern, we append to the list 'out',
        and return the number of items.

        Returns:
          Number of items appended (0 under nullglob when nothing matched),
          or -1 when glob expansion did not happen.  On -1, the caller should
          use the original string.
        Raises:
          error.FailGlob when nothing matched, and shopt -s failglob
        """
        if self.exec_opts.noglob():
            # The caller should use the original string
            return -1

        n = self.DoGlob(arg, out)
        if n:
            return n

        # Nothing matched
        if self.exec_opts.failglob():
            raise error.FailGlob('Pattern %r matched no files' % arg,
                                 blame_loc)

        if self.exec_opts.nullglob():
            return 0

        # The caller should use the original string
        return -1

    def ExpandExtended(self, glob_pat, fnmatch_pat, out):
        # type: (str, str, List[str]) -> int
        """Expand an extended glob: glob with one pattern, filter with another.

        Args:
          glob_pat: pattern passed to DoGlob() to produce candidates
          fnmatch_pat: pattern that each candidate must match, via
            libc.fnmatch()
          out: list that results are appended to

        Returns:
          Number of items appended to out, or -1 when nothing matched and
          failglob is on.  Unlike Expand(), no exception is raised here in
          that case.
        """
        if self.exec_opts.noglob():
            # Return the fnmatch_pat.  Note: this means we turn ,() into @(),
            # and there is extra \ escaping compared with bash and mksh.  OK
            # for now
            out.append(fnmatch_pat)
            return 1

        tmp = []  # type: List[str]
        self.DoGlob(glob_pat, tmp)
        filtered = [s for s in tmp if libc.fnmatch(fnmatch_pat, s)]
        n = len(filtered)

        if n:
            out.extend(filtered)
            return n

        if self.exec_opts.failglob():
            return -1  # nothing matched

        if self.exec_opts.nullglob():
            return 0
        else:
            # Same caveat as the noglob case above, except that the pattern
            # is glob-unescaped first
            out.append(GlobUnescape(fnmatch_pat))
            return 1