OILS / osh / split.py View on Github | oils.pub

319 lines, 162 significant
1"""
2split.py - Word Splitting
3
4Nice blog post on the complexity/corner cases/differing intuition of splitting
5strings:
6
7https://chriszetter.com/blog/2017/10/29/splitting-strings/
8
9python-dev doesn't want to touch it anymore!
10
11Other possible splitters:
12
13- AwkSplitter -- how does this compare to awk -F?
14- RegexSplitter
15- CsvSplitter
16- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
17 a pure slice, but neither is IFS splitting because of backslashes.
18- Perl?
19 - does perl have a split context?
20
21with SPLIT_REGEX = / digit+ / {
22 echo $#
23 echo $len(argv)
24 echo $1 $2
25 echo @argv
26}
27"""
28
29from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30 state_i)
31from _devbuild.gen.value_asdl import (value, value_e, value_t)
32from mycpp.mylib import log
33from core import pyutil
34from frontend import consts
35from mycpp import mylib
36from mycpp.mylib import tagswitch
37
38from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
39if TYPE_CHECKING:
40 from core.state import Mem
41 from _devbuild.gen.runtime_asdl import span_t
42 Span = Tuple[span_t, int]
43
# IFS value used when the shell variable IFS is unset: space, tab, newline.
DEFAULT_IFS = ' \t\n'
45
46
def _SpansToParts(s, spans):
    # type: (str, List[Span]) -> List[str]
    """Turn the spans computed for s into the list of split parts.

    Helper for SplitForWordEval.

    Args:
      s: the string that was split
      spans: (span type, end_index) pairs; each span starts where the previous
        one ended, and the first one starts at index 0.

    Returns:
      One string per part.  Every Black span contributes its text.  A Black
      span that follows the sequence (Black, Backslash) is appended to the
      previous part rather than starting a new one.  The text of all other
      spans is not copied.
    """
    bufs = []  # type: List[mylib.BufWriter]

    # Set when a backslash directly follows a black span: the next black span
    # must then be merged into the part already being built.
    merge_pending = False
    prev_was_black = False
    begin = 0

    for kind, end in spans:
        is_black = (kind == span_e.Black)

        if is_black:
            chunk = s[begin:end]
            if merge_pending and len(bufs) > 0:
                bufs[-1].write(chunk)
                merge_pending = False
            else:
                w = mylib.BufWriter()
                w.write(chunk)
                bufs.append(w)

        elif kind == span_e.Backslash and prev_was_black:
            merge_pending = True

        prev_was_black = is_black
        begin = end

    parts = []  # type: List[str]
    for w in bufs:
        parts.append(w.getvalue())
    return parts
82
83
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

        echo $x  # uses default shell IFS
        IFS=':' myfunc  # new splitter
        echo $x  # uses default shell IFS again.

    The IFS variable is looked up with dynamic scope each time a splitter is
    needed, and one splitter instance is cached per distinct IFS string.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache: IFS value -> splitter instance.  Filled lazily by
        # _GetSplitter().
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: IFS string to split with.  If None, the IFS variable is looked
            up with dynamic scope; an unset IFS means DEFAULT_IFS.

        Returns:
          The IfsSplitter for that IFS string, created on first use and cached
          afterwards.

        Raises:
          AssertionError: if the IFS variable is neither unset nor a string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Partition the IFS characters into whitespace and everything
            # else; the splitter treats the two classes differently.
            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: backslash in IFS is not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.
            # IFS=$' \t' is the same as IFS=$'\t '.  But most programs probably
            # don't do that, and everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """Return the separator used when an array decays by joining.

        For example, when "$@" has to become a single string.

        Returns:
          ' ' if IFS is unset, '' if IFS is the empty string, otherwise the
          first character of IFS.

        Raises:
          AssertionError: if the IFS variable is neither unset nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable,
        # or by a <space> if IFS is unset. If IFS is set to a null string, this
        # is not equivalent to unsetting it; its first character does not
        # exist, so the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars in s, using the splitter for the current IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by word evaluation.

        Also used by the explicit shSplit() function.

        Args:
          s: string to split
          ifs: IFS string to use, or None to consult the IFS variable.

        Returns:
          The list of parts that s splits into.
        """
        sp = self._GetSplitter(ifs=ifs)

        # allow_escape=True: backslash is special here.  Passing False instead
        # would make backslash an ordinary character.
        spans = sp.Split(s, True)

        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape, do_split):
        # type: (str, bool, bool) -> List[Span]
        """Compute the spans of a line, without turning them into parts.

        Args:
          line: string to split
          allow_escape: passed through to the splitter's Split()
          do_split: if False, split as if IFS were the empty string instead of
            consulting the IFS variable.

        Returns:
          List of (span type, end_index) pairs.
        """
        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self._GetSplitter(ifs=ifs)
        return sp.Split(line, allow_escape)
204
205
206class _BaseSplitter(object):
207
208 def __init__(self, escape_chars):
209 # type: (str) -> None
210 self.escape_chars = escape_chars + '\\' # Backslash is always escaped
211
212 def Escape(self, s):
213 # type: (str) -> str
214 # Note the characters here are DYNAMIC, unlike other usages of
215 # BackslashEscape().
216 return pyutil.BackslashEscape(s, self.escape_chars)
217
218
class IfsSplitter(_BaseSplitter):
    """Splitter driven by two classes of IFS characters.

    ifs_whitespace holds the whitespace separators and ifs_other holds the
    remaining separators.  Characters of both classes are registered with the
    base class as characters to escape.
    """

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        all_ifs = ifs_whitespace + ifs_other
        _BaseSplitter.__init__(self, all_ifs)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def __repr__(self):
        # type: () -> str
        return '<IfsSplitter whitespace=%r other=%r>' % (self.ifs_whitespace,
                                                         self.ifs_other)

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Scan s and describe it as a sequence of spans.

        Args:
          s: string to split
          allow_escape: if True, a backslash is classified as an escape
            character.  False for read -r, which makes backslash an ordinary
            character.

        Returns:
          List of (span type, end_index) pairs.  Empty if s is empty.

        Raises:
          AssertionError: if consts.IfsEdge reports an invalid transition or
            an unknown action.
        """
        white = self.ifs_whitespace
        gray = self.ifs_other

        # NOTE: in C, could reserve() this to len(s)
        out = []  # type: List[Span]

        length = len(s)
        if length == 0:
            return out  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the
        # input"
        # This can't really be handled by the state machine.
        #
        # 2025-03: This causes a bug with splitting ""$A"" when there's no IFS
        pos = 0
        while pos < length:
            if not mylib.ByteInSet(mylib.ByteAt(s, pos), white):
                break
            pos += 1

        if pos > 0:
            # The leading whitespace becomes one ignored span.
            out.append((span_e.Delim, pos))

        if pos == length:
            # String is ONLY whitespace.  Return now to skip the span that the
            # loop below would emit at the end.
            return out

        cur = state_i.Start
        while cur != state_i.Done:
            if pos > length:
                raise AssertionError()  # shouldn't happen

            if pos == length:
                # One more iteration, past the last byte, for end of string.
                kind = char_kind_i.Sentinel
            else:
                byte = mylib.ByteAt(s, pos)

                if mylib.ByteInSet(byte, white):
                    kind = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, gray):
                    kind = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    kind = char_kind_i.Backslash
                else:
                    kind = char_kind_i.Black

            nxt, action = consts.IfsEdge(cur, kind)
            if nxt == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (cur, kind))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', pos, byte,
                    kind, cur, nxt, action)

            # Each emitted span ends at the current position.
            if action == emit_i.Nothing:
                pass
            elif action == emit_i.Escape:
                out.append((span_e.Backslash, pos))  # backslash
            elif action == emit_i.Empty:
                out.append((span_e.Delim, pos))  # ignored delimiter
                # EMPTY part that is NOT ignored
                out.append((span_e.Black, pos))
            elif action == emit_i.Delim:
                out.append((span_e.Delim, pos))  # ignored delimiter
            elif action == emit_i.Part:
                out.append((span_e.Black, pos))
            else:
                raise AssertionError()

            cur = nxt
            pos += 1

        return out