OILS / osh / split.py View on Github | oils.pub

449 lines, 257 significant
1"""
2split.py - Word Splitting
3
4Nice blog post on the complexity/corner cases/differing intuition of splitting
5strings:
6
7https://chriszetter.com/blog/2017/10/29/splitting-strings/
8
9python-dev doesn't want to touch it anymore!
10
11Other possible splitters:
12
13- AwkSplitter -- how does this compare to awk -F?
14- RegexSplitter
15- CsvSplitter
16- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
17 a pure slice, but neither is IFS splitting because of backslashes.
18- Perl?
19 - does perl have a split context?
20
21with SPLIT_REGEX = / digit+ / {
22 echo $#
23 echo $len(argv)
24 echo $1 $2
25 echo @argv
26}
27"""
28
29from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30 state_i)
31from _devbuild.gen.value_asdl import (value, value_e, value_t)
32from mycpp.mylib import log
33from core import pyutil, pyos
34from frontend import consts
35from mycpp import mylib
36from mycpp.mylib import tagswitch
37from osh import glob_
38
39from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
40if TYPE_CHECKING:
41 from core.state import Mem
42 from _devbuild.gen.runtime_asdl import span_t
43 Span = Tuple[span_t, int]
44
# Field separators used when $IFS is unset: space, tab, newline.
DEFAULT_IFS = ' \t\n'
46
47
48def _SpansToParts(s, spans):
49 # type: (str, List[Span]) -> List[str]
50 """Helper for SplitForWordEval."""
51 parts = [] # type: List[mylib.BufWriter]
52 start_index = 0
53
54 # If the last span was black, and we get a backslash, set join_next to merge
55 # two black spans.
56 join_next = False
57 last_span_was_black = False
58
59 for span_type, end_index in spans:
60 if span_type == span_e.Black:
61 if len(parts) and join_next:
62 parts[-1].write(s[start_index:end_index])
63 join_next = False
64 else:
65 buf = mylib.BufWriter()
66 buf.write(s[start_index:end_index])
67 parts.append(buf)
68
69 last_span_was_black = True
70
71 elif span_type == span_e.Backslash:
72 if last_span_was_black:
73 join_next = True
74 last_span_was_black = False
75
76 else:
77 last_span_was_black = False
78
79 start_index = end_index
80
81 result = [buf.getvalue() for buf in parts]
82 return result
83
84
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

        echo $x  # uses default shell IFS
        IFS=':' myfunc  # new splitter
        echo $x  # uses default shell IFS again.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache of IFS value -> splitter instance.  Each splitter stores the
        # IFS value split into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: IFS value to use.  If None, the value of $IFS is looked up
            with dynamic scope, and DEFAULT_IFS is used when it's undefined.

        Returns:
          The cached IfsSplitter for that IFS value, created on first use.

        Raises:
          AssertionError: if $IFS is neither undefined nor a string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Figure out what kind of splitter we should instantiate.

            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: backslash not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.  IFS=$' \t' is
            # the same as IFS=$'\t '.  But most programs probably don't do that, and
            # everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """For decaying arrays by joining, eg.

        "$@" -> $@. array

        Returns:
          ' ' if IFS is undefined, the first character of IFS if it's a
          non-empty string, and '' if it's the empty string.

        Raises:
          AssertionError: if $IFS is neither undefined nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable, or
        # by a <space> if IFS is unset. If IFS is set to a null string, this is
        # not equivalent to unsetting it; its first character does not exist, so
        # the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars, using the splitter for the current $IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def CreateSplitterState(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitterState
        """Create a fresh IfsSplitterState for the given IFS.

        Args:
          ifs: IFS value to use, or None to consult $IFS.
        """
        sp = self._GetSplitter(ifs=ifs)
        return IfsSplitterState(sp.ifs_whitespace, sp.ifs_other)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by the explicit shSplit() function.

        Backslash escapes are honored.

        Args:
          s: string to split
          ifs: IFS value to use, or None to consult $IFS.
        """
        sp = self.CreateSplitterState(ifs=ifs)
        sp.SetAllowEscape(True)
        sp.PushFragment(s)
        return sp.PushTerminator()

    def SplitForRead(self, line, allow_escape, do_split, max_parts):
        # type: (str, bool, bool, int) -> List[str]
        """Split a line of input into fields.

        Args:
          line: the line to split; an empty line yields an empty list
          allow_escape: whether a backslash escapes the following byte
          do_split: if False, IFS='' behavior is forced instead of
            consulting $IFS
          max_parts: the splitter's max split is set to max_parts - 1
        """
        if len(line) == 0:
            return []

        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self.CreateSplitterState(ifs=ifs)
        sp.SetAllowEscape(allow_escape)
        sp.SetMaxSplit(max_parts - 1)
        sp.PushFragment(line)
        return sp.PushTerminator()
208
209
210class _BaseSplitter(object):
211
212 def __init__(self, escape_chars):
213 # type: (str) -> None
214 self.escape_chars = escape_chars + '\\' # Backslash is always escaped
215
216 def Escape(self, s):
217 # type: (str) -> str
218 # Note the characters here are DYNAMIC, unlike other usages of
219 # BackslashEscape().
220 return pyutil.BackslashEscape(s, self.escape_chars)
221
222
class IfsSplitter(_BaseSplitter):
    """Split a string when IFS has non-whitespace characters."""

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_whitespace: the IFS characters that are whitespace
          ifs_other: the remaining IFS characters
        """
        # Every IFS character is escaped by Escape().
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def __repr__(self):
        # type: () -> str
        return '<IfsSplitter whitespace=%r other=%r>' % (self.ifs_whitespace,
                                                         self.ifs_other)

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Compute the spans of s by running the IFS state machine over it.

        Args:
          s: string to split
          allow_escape: False for read -r, this means a backslash doesn't do
            anything.

        Returns:
          List of (runtime.span, end_index) pairs
        """
        ws_chars = self.ifs_whitespace
        other_chars = self.ifs_other

        n = len(s)
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        if n == 0:
            return spans  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the input"
        # This can't really be handled by the state machine.

        # 2025-03: This causes a bug with splitting ""$A"" when there's no IFS

        i = 0
        while i < n and mylib.ByteInSet(mylib.ByteAt(s, i), ws_chars):
            i += 1

        # Append an ignored span.
        if i != 0:
            spans.append((span_e.Delim, i))

        # String is ONLY whitespace.  We want to skip the last span after the
        # while loop.
        if i == n:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            # Classify the current byte, or use the sentinel at end of string.
            if i < n:
                byte = mylib.ByteAt(s, i)

                if mylib.ByteInSet(byte, ws_chars):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, other_chars):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            elif i == n:
                ch = char_kind_i.Sentinel  # one more iteration for the end of string

            else:
                raise AssertionError()  # shouldn't happen

            new_state, action = consts.IfsEdge(state, ch)
            if new_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', i, byte, ch,
                    state, new_state, action)

            # Each emitted span ends at the current index i.
            if action == emit_i.Part:
                spans.append((span_e.Black, i))
            elif action == emit_i.Delim:
                spans.append((span_e.Delim, i))  # ignored delimiter
            elif action == emit_i.Empty:
                spans.append((span_e.Delim, i))  # ignored delimiter
                # EMPTY part that is NOT ignored
                spans.append((span_e.Black, i))
            elif action == emit_i.Escape:
                spans.append((span_e.Backslash, i))  # the backslash itself
            elif action == emit_i.Nothing:
                pass
            else:
                raise AssertionError()

            state = new_state
            i += 1

        return spans
324
325
class IfsSplitterState(object):
    """Incremental IFS splitter.

    Push string fragments with PushFragment() (split on IFS) and
    PushLiteral() (never split), then call PushTerminator() to get the list
    of generated words.
    """

    def __init__(self, ifs_space, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_space: the IFS characters that are whitespace
          ifs_other: the remaining IFS characters
        """
        self.ifs_space = ifs_space
        self.ifs_other = ifs_other
        # If set, flushed fragments go through glob_.GlobEscapeBackslash()
        self.glob_escape = False
        # If set, a backslash makes the following byte literal
        self.allow_escape = False
        # Negative means no limit
        self.max_split = -1

        self.state = state_i.Start
        self.args = []  # type: List[str]  # generated words
        self.frags = []  # type: List[str]  # str fragments of the current word
        self.char_buff = []  # type: List[int]  # chars in the current fragment
        self.white_buff = None  # type: Optional[List[int]]  # chars for max_split space

    def SetGlobEscape(self, glob_escape):
        # type: (bool) -> None
        self.glob_escape = glob_escape

    def SetAllowEscape(self, allow_escape):
        # type: (bool) -> None
        self.allow_escape = allow_escape

    def SetMaxSplit(self, max_split):
        # type: (int) -> None
        """Set the split limit, allocating white_buff if the limit is active."""
        self.max_split = max_split
        if max_split >= 0 and self.white_buff is None:
            self.white_buff = []

    def _FlushCharBuff(self):
        # type: () -> None
        """Move the buffered bytes, if any, to frags as one string fragment."""

        if len(self.char_buff) >= 1:
            frag = mylib.JoinBytes(self.char_buff)
            if self.glob_escape:
                frag = glob_.GlobEscapeBackslash(frag)
            self.frags.append(frag)
            del self.char_buff[:]

    def _GenerateWord(self):
        # type: () -> None
        """Finish the current word and append it to args."""
        self._FlushCharBuff()
        self.args.append(''.join(self.frags))
        del self.frags[:]

        # With a split limit, the saved delimiter bytes start the next word's
        # buffer.
        if self.max_split >= 0 and len(self.white_buff) >= 1:
            self.char_buff.extend(self.white_buff)
            del self.white_buff[:]

    def PushLiteral(self, s):
        # type: (str) -> None
        """
        Args:
          s: word fragment that should be literally added
        """
        if self.state == state_i.DE_White1:
            # Pending whitespace delimiter: the previous word is done.
            self._GenerateWord()
        else:
            self._FlushCharBuff()
        self.frags.append(s)
        self.state = state_i.Black

    def PushFragment(self, s):
        # type: (str) -> None
        """
        Args:
          s: word fragment to split
        """
        ifs_space = self.ifs_space
        ifs_other = self.ifs_other
        allow_escape = self.allow_escape
        max_split = self.max_split
        n = len(s)

        for i in xrange(n):
            byte = mylib.ByteAt(s, i)

            if self.state == state_i.Backslash:
                # The byte after a backslash is taken literally below.
                pass

            elif max_split >= 0 and len(self.args) == max_split + 1:
                # When max_split is reached, the processing is modified.
                if allow_escape and byte == pyos.BACKSLASH_CH:
                    self.state = state_i.Backslash
                    continue
                elif mylib.ByteInSet(byte, ifs_space):
                    if self.state == state_i.Start:
                        # Buffer the byte without leaving the Start state.
                        self.char_buff.append(byte)
                        continue

            elif allow_escape and byte == pyos.BACKSLASH_CH:
                if self.state == state_i.DE_White1:
                    self._GenerateWord()
                self.state = state_i.Backslash
                continue
            elif mylib.ByteInSet(byte, ifs_space):
                # IFS whitespace in the Start state is skipped.
                if self.state != state_i.Start:
                    if len(self.args) == max_split:
                        self.white_buff.append(byte)
                    self.state = state_i.DE_White1
                continue
            elif mylib.ByteInSet(byte, ifs_other):
                if len(self.args) == max_split:
                    self.white_buff.append(byte)
                # Every non-whitespace IFS char ends a word, possibly an
                # empty one.
                self._GenerateWord()
                self.state = state_i.Start
                continue

            # Ordinary byte: it belongs to the current word.
            if self.state == state_i.DE_White1:
                self._GenerateWord()
            self.char_buff.append(byte)
            self.state = state_i.Black

    def PushTerminator(self):
        # type: () -> List[str]
        """Finish the word in progress, if any, and return the words.

        Returns:
          self.args, the list of generated words.  The state is reset to
          Start, but args is not cleared.
        """
        if self.state in (state_i.DE_White1, state_i.Black):
            self._GenerateWord()
        if self.max_split >= 0 and len(self.args) == self.max_split + 2:
            # TODO: is there an algorithm without this "fix up"?
            # Merge the extra word back into the last allowed one, dropping
            # trailing IFS whitespace.
            last = self.args.pop()
            self.args[-1] = self.args[-1] + last.rstrip(self.ifs_space)
        self.state = state_i.Start
        return self.args