builtin/method

OILS / builtin / method_str.py View on Github | oilshell.org

587 lines, 353 significant

1	"""YSH Str methods"""
2
3	from __future__ import print_function
4
5	from _devbuild.gen.syntax_asdl import loc_t
6	from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
7	eggex_ops_t, RegexMatch)
8	from core import error
9	from core import state
10	from core import vm
11	from frontend import typed_args
12	from mycpp import mops
13	from mycpp.mylib import log, tagswitch
14	from osh import string_ops
15	from ysh import expr_eval
16	from ysh import regex_translate
17	from ysh import val_ops
18
19	import libc
20	from libc import REG_NOTBOL
21
22	from typing import cast, Dict, List, Tuple
23
24	_ = log
25
26
27	def _StrMatchStart(s, p):
28	# type: (str, str) -> Tuple[bool, int, int]
29	"""Returns the range of bytes in 's' that match string pattern `p`. the
30	pattern matches if 's' starts with all the characters in 'p'.
31
32	The returned match result is the tuple "(matched, begin, end)". 'matched'
33	is true if the pattern matched. 'begin' and 'end' give the half-open range
34	"[begin, end)" of byte indices from 's' for the match, and are a valid but
35	empty range if 'match' is false.
36
37	Used for shell functions like 'trimStart' when trimming a prefix string.
38	"""
39	if s.startswith(p):
40	return (True, 0, len(p))
41	else:
42	return (False, 0, 0)
43
44
45	def _StrMatchEnd(s, p):
46	# type: (str, str) -> Tuple[bool, int, int]
47	"""Returns a match result for the bytes in 's' that match string pattern
48	`p`. the pattern matches if 's' ends with all the characters in 'p'.
49
50	The returned match result is the tuple "(matched, begin, end)". 'matched'
51	is true if the pattern matched. 'begin' and 'end' give the half-open range
52	"[begin, end)" of byte indices from 's' for the match, and are a valid but
53	empty range if 'match' is false.
54
55	Used for shell functions like 'trimEnd' when trimming a suffix string.
56	"""
57	len_s = len(s)
58	if s.endswith(p):
59	return (True, len_s - len(p), len_s)
60	else:
61	return (False, len_s, len_s)
62
63
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search `s` with the already-translated regex `ere`.

    `p` only supplies the regex compile flags (via its canonical_flags);
    the pattern text that is actually searched for is `ere`.

    Returns "(matched, begin, end)".  On a match, 'begin' and 'end' are the
    first two indices reported by libc.regex_search.  Otherwise the empty
    range "[empty_p, empty_p)" is returned, so the caller chooses where the
    empty range sits.
    """
    flags = regex_translate.LibcFlags(p.canonical_flags)
    found = libc.regex_search(ere, flags, s, 0)
    if found is not None:
        return (True, found[0], found[1])

    return (False, empty_p, empty_p)
76
77
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[0, 0)" if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)

    # Anchor the WHOLE pattern.  In POSIX ERE, alternation has the lowest
    # precedence, so a bare '^' prefix would turn 'a|b' into '^a|b', where
    # the 'b' branch can still match in the middle of the string.  Grouping
    # first is safe here because only the whole-match indices are used, so
    # the extra group never shifts anything the caller looks at.
    if len(ere) == 0:
        # Avoid emitting an empty group '()', whose meaning POSIX leaves
        # undefined.
        anchored = '^'
    else:
        anchored = '^(' + ere + ')'
    return _EggexMatchCommon(s, p, anchored, 0)
97
98
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the end of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". When
    'matched' is false, the range is the valid but empty
    "[len(s), len(s))".

    Used for shell functions like 'trimEnd' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)

    # Always group and append '$':
    # - Checking `ere.endswith('$')` cannot tell an anchor from an escaped
    #   literal dollar ('\$'), which would leave the pattern unanchored.
    # - Alternation has the lowest precedence in POSIX ERE, so 'a|b$' would
    #   only anchor the 'b' branch.
    # Only the whole-match indices are used, so the extra group is harmless.
    if len(ere) == 0:
        # Avoid emitting an empty group '()', whose meaning POSIX leaves
        # undefined.
        anchored = '$'
    else:
        anchored = '(' + ere + ')$'
    return _EggexMatchCommon(s, p, anchored, len(s))
108
109
# Anchor bit flags taken by HasAffix and Trim.  They are distinct bits so
# that Trim can also be given the combination START | END.
START = 0b01
END = 0b10
112
113
class HasAffix(vm._Callable):
    """Implements `startsWith()` and `endsWith()`.

    The instance is configured with START or END to select which end of the
    string the pattern is tested against.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)   # => bool
        string => startsWith(pattern_eggex) # => bool
        string => endsWith(pattern_str)     # => bool
        string => endsWith(pattern_eggex)   # => bool

        Raises error.TypeErr when the pattern is neither a Str nor an Eggex.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.PosValue()

        # Exactly one of these is set once the tag switch has run.
        pattern_str = None  # type: str
        pattern_eggex = None  # type: value.Eggex
        with tagswitch(pattern_val) as case:
            if case(value_e.Str):
                pattern_str = cast(value.Str, pattern_val).s
            elif case(value_e.Eggex):
                pattern_eggex = cast(value.Eggex, pattern_val)
            else:
                raise error.TypeErr(pattern_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        matched = False
        try:
            if pattern_eggex is not None:
                if self.anchor & START:
                    matched, _, _ = _EggexMatchStart(string, pattern_eggex)
                else:
                    matched, _, _ = _EggexMatchEnd(string, pattern_eggex)
            else:
                assert pattern_str is not None
                if self.anchor & START:
                    matched, _, _ = _StrMatchStart(string, pattern_str)
                else:
                    matched, _, _ = _StrMatchEnd(string, pattern_str)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(matched)
163
164
class Trim(vm._Callable):
    """Implements `trimStart()`, `trimEnd()`, and `trim()`.

    The instance is configured with START, END, or START | END to select
    which end(s) of the string are trimmed.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Without a pattern, the whitespace ranges reported by string_ops are
        trimmed.  Raises error.TypeErr when a pattern is given that is
        neither a Str nor an Eggex.  An error.Strict raised while matching is
        re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.OptionalValue()

        # At most one of these is set; both stay None when no pattern is
        # passed.
        pattern_str = None  # type: str
        pattern_eggex = None  # type: value.Eggex
        if pattern_val:
            with tagswitch(pattern_val) as case:
                if case(value_e.Str):
                    pattern_str = cast(value.Str, pattern_val).s
                elif case(value_e.Eggex):
                    pattern_eggex = cast(value.Eggex, pattern_val)
                else:
                    raise error.TypeErr(pattern_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = self.anchor & START
        trim_right = self.anchor & END

        # [lo, hi) is the byte range of `string` that survives trimming.
        lo = 0
        hi = len(string)
        try:
            if pattern_str is not None:
                if trim_left:
                    # The kept part starts where the prefix match ends
                    _, _, lo = _StrMatchStart(string, pattern_str)
                if trim_right:
                    # The kept part ends where the suffix match begins
                    _, hi, _ = _StrMatchEnd(string, pattern_str)
            elif pattern_eggex is not None:
                if trim_left:
                    _, _, lo = _EggexMatchStart(string, pattern_eggex)
                if trim_right:
                    _, hi, _ = _EggexMatchEnd(string, pattern_eggex)
            else:
                if trim_left:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(string)
                if trim_right:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(string)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(string[lo:hi])
227
228
class Upper(vm._Callable):
    """Implements `upper()`: returns the receiver string upper-cased."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => upper()  # => Str
        """
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        uppered = text.upper()
        return value.Str(uppered)
243
244
class Lower(vm._Callable):
    """Implements `lower()`: returns the receiver string lower-cased."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => lower()  # => Str
        """
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        lowered = text.lower()
        return value.Str(lowered)
259
260
# Values for SearchMatch's `which_method`.  With LEFT_MATCH the pattern is
# anchored with a leading '^'; with SEARCH it is used as given.
SEARCH = 0
LEFT_MATCH = 1
263
264
class SearchMatch(vm._Callable):
    """Regex search over a string, in one of two modes.

    `which_method` is SEARCH or LEFT_MATCH.  LEFT_MATCH additionally anchors
    the pattern with a leading '^'.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern may be an Eggex or a Str holding an ERE.  Returns
        value.Null when nothing matches, otherwise a RegexMatch.  Raises
        error.TypeErr for any other pattern type.
        """
        string = rd.PosStr()

        pattern = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                eggex_val = cast(value.Eggex, pattern)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex_val)
                cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex_val.convert_funcs, eggex_val.convert_toks,
                    eggex_val.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain string is used as an ERE directly, with no flags
                # and no capture metadata.
                ere = cast(value.Str, pattern).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pattern, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            # NOTE(review): '^' binds tighter than a top-level '|', so an
            # alternation may only have its first branch anchored -- confirm
            # against what regex_translate emits.
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, string, eflags, pos)
        if indices is None:
            return value.Null

        return RegexMatch(string, indices, capture)
320
321
class Replace(vm._Callable):
    """Implements `replace()` for Str and Eggex patterns.

    The substitution may be a Str, or an Expr that is evaluated for each
    match with $0, $1, ... and the named captures bound.
    """

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression, which must yield a Str.

        Raises error.TypeErr (blamed on blame_loc) for any other result type.
        """
        res = self.expr_ev.EvalExprClosure(expr, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, or the
        substitution is not a Str or Expr.  Raises error.Structured when
        replacing by eggex on a string containing NUL bytes, or when the eggex
        matches the empty string at the current search position.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with state.ctx_Eval(self.mem, string_val.s, None, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            # A negative count makes str.replace() replace every occurrence
            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            if '\0' in string:
                raise error.Structured(
                    3, "cannot replace by eggex on a string with NUL bytes",
                    rd.LeftParenToken())

            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < len(string):
                # Once the search resumes past the first byte, `pos` is not
                # the real beginning of the string, so '^' must not match
                # there.  This is the same rule SearchMatch applies for
                # pos != 0.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = {}  # type: Dict[str, value_t]
                # indices holds a (start, end) pair per group, group 0 being
                # the whole match
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    # convert_funcs / convert_toks have no entry for group 0,
                    # so group N lives at index N - 1
                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken(), '')
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # Same N - 1 indexing as convert_funcs above.  An
                        # offset of 2 would read index -1 for group 1 and
                        # attach every name to the wrong group.
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars[name] = val

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with state.ctx_Eval(self.mem, arg0, argv, named_vars):
                        s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                if pos == end:
                    # An empty match at `pos` would never advance the loop
                    raise error.Structured(
                        3, "eggex should never match the empty string",
                        rd.LeftParenToken())

                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()
490
491
class Split(vm._Callable):
    """Implements `split()` with a Str or Eggex separator."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s.split(string_sep, count=-1)
        s.split(eggex_sep, count=-1)

        Count behaves like in replace() in that:
        - `count` < 0 -> ignore
        - `count` >= 0 -> there will be at most `count` splits

        An empty input string yields an empty List.

        Raises error.TypeErr when the separator is neither an Eggex nor a
        Str.  Raises error.Structured when a Str separator is empty, when an
        Eggex separator matches the empty string, or when splitting by Eggex
        on a string containing a NUL byte.
        """
        string = rd.PosStr()

        string_sep = None  # type: str
        eggex_sep = None  # type: value.Eggex

        sep = rd.PosValue()
        with tagswitch(sep) as case:
            if case(value_e.Eggex):
                eggex_sep_ = cast(value.Eggex, sep)
                eggex_sep = eggex_sep_

            elif case(value_e.Str):
                string_sep_ = cast(value.Str, sep)
                string_sep = string_sep_.s

            else:
                raise error.TypeErr(sep,
                                    'expected separator to be Eggex or Str',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if len(string) == 0:
            return value.List([])

        if string_sep is not None:
            if len(string_sep) == 0:
                raise error.Structured(3, "separator must be non-empty",
                                       rd.LeftParenToken())

            cursor = 0
            chunks = []  # type: List[value_t]
            # A negative count never reaches 0, so it means "no limit"
            while cursor < len(string) and count != 0:
                sep_pos = string.find(string_sep, cursor)
                if sep_pos == -1:
                    break

                chunks.append(value.Str(string[cursor:sep_pos]))
                cursor = sep_pos + len(string_sep)
                count -= 1

            # Whatever follows the last separator, possibly empty
            chunks.append(value.Str(string[cursor:]))

            return value.List(chunks)

        if eggex_sep is not None:
            if '\0' in string:
                raise error.Structured(
                    3, "cannot split a string with a NUL byte",
                    rd.LeftParenToken())

            regex = regex_translate.AsPosixEre(eggex_sep)
            cflags = regex_translate.LibcFlags(eggex_sep.canonical_flags)

            cursor = 0
            chunks = []
            while cursor < len(string) and count != 0:
                # Once the search resumes past the first byte, `cursor` is
                # not the real beginning of the string, so '^' must not match
                # there.  This is the same rule SearchMatch applies for
                # pos != 0.
                eflags = 0 if cursor == 0 else REG_NOTBOL
                m = libc.regex_search(regex, cflags, string, eflags, cursor)
                if m is None:
                    break

                start = m[0]
                end = m[1]
                if start == end:
                    raise error.Structured(
                        3,
                        "eggex separators should never match the empty string",
                        rd.LeftParenToken())

                chunks.append(value.Str(string[cursor:start]))
                cursor = end

                count -= 1

            # Whatever follows the last separator, possibly empty
            chunks.append(value.Str(string[cursor:]))

            return value.List(chunks)

        raise AssertionError()