osh/split.py

OILS / osh / split.py View on Github | oils.pub

449 lines, 257 significant

1	"""
2	split.py - Word Splitting
3
4	Nice blog post on the complexity/corner cases/differing intuition of splitting
5	strings:
6
7	https://chriszetter.com/blog/2017/10/29/splitting-strings/
8
9	python-dev doesn't want to touch it anymore!
10
11	Other possible splitters:
12
13	- AwkSplitter -- how does this compare to awk -F?
14	- RegexSplitter
15	- CsvSplitter
16	- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
17	a pure slice, but neither is IFS splitting because of backslashes.
18	- Perl?
19	- does perl have a split context?
20
21	with SPLIT_REGEX = / digit+ / {
22	echo $#
23	echo $len(argv)
24	echo $1 $2
25	echo @argv
26	}
27	"""
28
29	from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30	state_i)
31	from _devbuild.gen.value_asdl import (value, value_e, value_t)
32	from mycpp.mylib import log
33	from core import pyutil, pyos
34	from frontend import consts
35	from mycpp import mylib
36	from mycpp.mylib import tagswitch
37	from osh import glob_
38
39	from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
40	if TYPE_CHECKING:
41	from core.state import Mem
42	from _devbuild.gen.runtime_asdl import span_t
43	Span = Tuple[span_t, int]
44
# IFS value used by SplitContext._GetSplitter() when $IFS is undefined:
# space, tab, newline.
DEFAULT_IFS = ' \t\n'
46
47
def _SpansToParts(s, spans):
    # type: (str, List[Span]) -> List[str]
    """Turn the spans computed for a string into the list of its parts.

    Each span is a (span type, end index) pair.  A span covers s from the end
    index of the previous span (0 for the first span) up to its own end index.

    - A Black span contributes its slice of s to a part.
    - A Backslash span that directly follows a Black span makes the next Black
      span extend that same part instead of starting a new one.
    - Any other span type contributes nothing.

    Args:
      s: the string the spans refer to
      spans: list of (span type, end index) pairs

    Returns:
      The parts, in order.
    """
    bufs = []  # type: List[mylib.BufWriter]
    prev_end = 0

    # Set when a backslash follows a black span: the next black span is then
    # merged into the most recent part.
    merge_pending = False
    prev_was_black = False

    for kind, end in spans:
        is_black = (kind == span_e.Black)

        if is_black:
            chunk = s[prev_end:end]
            if len(bufs) and merge_pending:
                bufs[-1].write(chunk)
                merge_pending = False
            else:
                buf = mylib.BufWriter()
                buf.write(chunk)
                bufs.append(buf)

        elif kind == span_e.Backslash and prev_was_black:
            merge_pending = True

        prev_was_black = is_black
        prev_end = end

    result = [buf.getvalue() for buf in bufs]
    return result
83
84
class SplitContext(object):
    """Entry point for field splitting, driven by the current value of IFS.

    IFS is looked up with dynamic scope whenever a splitter is needed, so a
    temporary binding is honored and later dropped again:

        echo $x          # uses default shell IFS
        IFS=':' myfunc   # new splitter
        echo $x          # uses default shell IFS again.

    One IfsSplitter is created per distinct IFS string and then reused.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache: IFS value -> splitter instance.  Each splitter holds the IFS
        # characters split into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Return the splitter for an IFS value, creating it on first use.

        Args:
          ifs: IFS string to use.  None means: read IFS from self.mem with
               dynamic scope, using DEFAULT_IFS when it is undefined.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        cached = self.splitters.get(ifs)  # cache lookup
        if cached is not None:
            return cached

        # Cache miss: partition the IFS characters into whitespace and the
        # rest, then build a splitter from the two sets.
        white = mylib.BufWriter()
        other = mylib.BufWriter()
        for c in ifs:
            if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                white.write(c)
            else:
                # TODO: \ not supported
                other.write(c)

        sp = IfsSplitter(white.getvalue(), other.getvalue())

        # NOTE: Technically, we could make the key more precise.  IFS=$' \t' is
        # the same as IFS=$'\t '.  But most programs probably don't do that,
        # and everything should work in any case.
        self.splitters[ifs] = sp
        return sp

    def GetJoinChar(self):
        # type: () -> str
        """Return the character used when decaying an array by joining, e.g.

        "$@" -> $@. array

        The result is the first character of IFS, a space when IFS is
        undefined, and the empty string when IFS is set to ''.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable,
        # or by a <space> if IFS is unset. If IFS is set to a null string, this
        # is not equivalent to unsetting it; its first character does not
        # exist, so the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s) == 0:
                    return ''
                return val.s[0]
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape s with the splitter selected by the current IFS."""
        splitter = self._GetSplitter()
        return splitter.Escape(s)

    def CreateSplitterState(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitterState
        """Create a fresh IfsSplitterState for the given IFS.

        Args:
          ifs: IFS string, or None to consult the current IFS.
        """
        splitter = self._GetSplitter(ifs=ifs)
        return IfsSplitterState(splitter.ifs_whitespace, splitter.ifs_other)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by the explicit shSplit() function.

        Backslash escapes are enabled for this split.

        Args:
          s: string to split
          ifs: IFS string, or None to consult the current IFS.
        """
        st = self.CreateSplitterState(ifs=ifs)
        st.SetAllowEscape(True)
        st.PushFragment(s)
        return st.PushTerminator()

    def SplitForRead(self, line, allow_escape, do_split, max_parts):
        # type: (str, bool, bool, int) -> List[str]
        """Split a line into parts, with a limit on the number of splits.

        Args:
          line: string to split; an empty line gives an empty list
          allow_escape: passed to IfsSplitterState.SetAllowEscape()
          do_split: if False, split as if IFS=''
          max_parts: the split limit is set to max_parts - 1
        """
        if len(line) == 0:
            return []

        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None  # type: Optional[str]
        if not do_split:
            ifs = ''

        st = self.CreateSplitterState(ifs=ifs)
        st.SetAllowEscape(allow_escape)
        st.SetMaxSplit(max_parts - 1)
        st.PushFragment(line)
        return st.PushTerminator()
208
209
210	class _BaseSplitter(object):
211
212	def __init__(self, escape_chars):
213	# type: (str) -> None
214	self.escape_chars = escape_chars + '\\' # Backslash is always escaped
215
216	def Escape(self, s):
217	# type: (str) -> str
218	# Note the characters here are DYNAMIC, unlike other usages of
219	# BackslashEscape().
220	return pyutil.BackslashEscape(s, self.escape_chars)
221
222
class IfsSplitter(_BaseSplitter):
    """Splitter for an IFS given as two sets: whitespace and other chars.

    Both sets together are also the characters that Escape() handles.
    """

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def __repr__(self):
        # type: () -> str
        fields = (self.ifs_whitespace, self.ifs_other)
        return '<IfsSplitter whitespace=%r other=%r>' % fields

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Compute the spans of s according to the IFS character sets.

        Each byte is classified (IFS whitespace, other IFS char, backslash, or
        anything else) and fed to the consts.IfsEdge() transition function,
        which decides what span, if any, ends at the current index.

        Args:
          s: string to split
          allow_escape: False for read -r, this means a backslash doesn't do
                        anything.

        Returns:
          List of (runtime.span, end_index) pairs

        Raises:
          AssertionError: on an invalid state transition or unknown action.
        """
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        n = len(s)
        if n == 0:
            return spans  # empty

        white = self.ifs_whitespace
        gray = self.ifs_other

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the
        # input"
        # This can't really be handled by the state machine.

        # 2025-03: This causes a bug with splitting ""$A"" when there's no IFS

        i = 0
        while i < n and mylib.ByteInSet(mylib.ByteAt(s, i), white):
            i += 1

        # Append an ignored span.
        if i != 0:
            spans.append((span_e.Delim, i))

        # String is ONLY whitespace.  We want to skip the last span after the
        # while loop.
        if i == n:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            if i > n:
                raise AssertionError()  # shouldn't happen

            if i == n:
                # one more iteration for the end of string
                ch = char_kind_i.Sentinel
            else:
                byte = mylib.ByteAt(s, i)

                if mylib.ByteInSet(byte, white):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, gray):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            next_state, action = consts.IfsEdge(state, ch)
            if next_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', i, byte, ch,
                    state, next_state, action)

            if action == emit_i.Nothing:
                pass
            elif action == emit_i.Part:
                spans.append((span_e.Black, i))
            elif action == emit_i.Delim:
                spans.append((span_e.Delim, i))  # ignored delimiter
            elif action == emit_i.Empty:
                spans.append((span_e.Delim, i))  # ignored delimiter
                # EMPTY part that is NOT ignored
                spans.append((span_e.Black, i))
            elif action == emit_i.Escape:
                spans.append((span_e.Backslash, i))  # the backslash itself
            else:
                raise AssertionError()

            state = next_state
            i += 1

        return spans
324
325
class IfsSplitterState(object):
    """Incremental splitter: push fragments in, collect words in self.args.

    Bytes of the word being built go to char_buff, get flushed into frags as
    strings, and frags are joined into one entry of args when a word ends.
    """

    def __init__(self, ifs_space, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_space: IFS whitespace characters
          ifs_other: the remaining IFS characters
        """
        self.ifs_space = ifs_space
        self.ifs_other = ifs_other

        # Options; see the Set*() methods.
        self.glob_escape = False
        self.allow_escape = False
        self.max_split = -1

        self.state = state_i.Start
        self.args = []  # type: List[str]  # generated words
        self.frags = []  # type: List[str]  # str fragments of the current word
        self.char_buff = []  # type: List[int]  # chars in the current fragment
        # chars for max_split space; allocated by SetMaxSplit()
        self.white_buff = None  # type: Optional[List[int]]

    def SetGlobEscape(self, glob_escape):
        # type: (bool) -> None
        """If set, flushed fragments go through glob_.GlobEscapeBackslash()."""
        self.glob_escape = glob_escape

    def SetAllowEscape(self, allow_escape):
        # type: (bool) -> None
        """If set, PushFragment() treats a backslash as an escape."""
        self.allow_escape = allow_escape

    def SetMaxSplit(self, max_split):
        # type: (int) -> None
        """Set the split limit; a non-negative limit allocates white_buff."""
        self.max_split = max_split
        if max_split < 0:
            return
        if self.white_buff is None:
            self.white_buff = []

    def _FlushCharBuff(self):
        # type: () -> None
        """Move the buffered bytes, if any, into frags as one string."""
        if len(self.char_buff) == 0:
            return

        piece = mylib.JoinBytes(self.char_buff)
        if self.glob_escape:
            piece = glob_.GlobEscapeBackslash(piece)
        self.frags.append(piece)
        del self.char_buff[:]

    def _GenerateWord(self):
        # type: () -> None
        """Finish the current word and append it to args."""
        self._FlushCharBuff()
        word = ''.join(self.frags)
        self.args.append(word)
        del self.frags[:]

        # With a split limit, delimiter bytes saved in white_buff become the
        # start of the next word's buffer.
        if self.max_split >= 0 and len(self.white_buff) >= 1:
            self.char_buff.extend(self.white_buff)
            del self.white_buff[:]

    def PushLiteral(self, s):
        # type: (str) -> None
        """
        Args:
          s: word fragment that should be literally added
        """
        # Pending IFS whitespace ends the previous word first; otherwise the
        # literal continues the current word.
        if self.state == state_i.DE_White1:
            self._GenerateWord()
        else:
            self._FlushCharBuff()

        self.frags.append(s)
        self.state = state_i.Black

    def PushFragment(self, s):
        # type: (str) -> None
        """
        Args:
          s: word fragment to split
        """
        space_chars = self.ifs_space
        other_chars = self.ifs_other
        allow_escape = self.allow_escape
        max_split = self.max_split
        n = len(s)

        for i in xrange(n):
            byte = mylib.ByteAt(s, i)

            # A byte that follows a backslash skips classification and is
            # added to the word as is (bottom of the loop).
            if self.state != state_i.Backslash:
                is_escape = allow_escape and byte == pyos.BACKSLASH_CH

                if max_split >= 0 and len(self.args) == max_split + 1:
                    # When max_split is reached, the processing is modified.
                    if is_escape:
                        self.state = state_i.Backslash
                        continue
                    if (self.state == state_i.Start and
                            mylib.ByteInSet(byte, space_chars)):
                        # Buffer the whitespace, but stay in Start.
                        self.char_buff.append(byte)
                        continue

                elif is_escape:
                    if self.state == state_i.DE_White1:
                        self._GenerateWord()
                    self.state = state_i.Backslash
                    continue

                elif mylib.ByteInSet(byte, space_chars):
                    # IFS whitespace is ignored in Start; otherwise it marks a
                    # pending word boundary.
                    if self.state != state_i.Start:
                        if len(self.args) == max_split:
                            self.white_buff.append(byte)
                        self.state = state_i.DE_White1
                    continue

                elif mylib.ByteInSet(byte, other_chars):
                    # A non-whitespace IFS char always ends a word.
                    if len(self.args) == max_split:
                        self.white_buff.append(byte)
                    self._GenerateWord()
                    self.state = state_i.Start
                    continue

            # Ordinary byte: part of a word.
            if self.state == state_i.DE_White1:
                self._GenerateWord()
            self.char_buff.append(byte)
            self.state = state_i.Black

    def PushTerminator(self):
        # type: () -> List[str]
        """Signal the end of input and return the list of generated words.

        The returned list is self.args itself, not a copy.
        """
        # NOTE(review): when the input ends in the Backslash state, bytes still
        # in char_buff / frags are not turned into a word -- confirm that this
        # is intended.
        st = self.state
        if st == state_i.DE_White1 or st == state_i.Black:
            self._GenerateWord()

        if self.max_split >= 0 and len(self.args) == self.max_split + 2:
            # TODO: is there an algorithm without this "fix up"?
            # Fold the extra word back into the one before it, minus trailing
            # IFS whitespace.
            extra = self.args.pop()
            self.args[-1] = self.args[-1] + extra.rstrip(self.ifs_space)

        self.state = state_i.Start
        return self.args