OILS / osh / split.py View on Github | oils.pub

320 lines, 162 significant
"""
split.py - Word Splitting

Nice blog post on the complexity/corner cases/differing intuition of splitting
strings:

https://chriszetter.com/blog/2017/10/29/splitting-strings/

python-dev doesn't want to touch it anymore!

Other possible splitters:

- AwkSplitter -- how does this compare to awk -F?
- RegexSplitter
- CsvSplitter
- TSV2Splitter -- Data is transformed because of # \u0065 in JSON.  So it's not
  a pure slice, but neither is IFS splitting because of backslashes.
- Perl?
  - does perl have a split context?

with SPLIT_REGEX = / digit+ / {
  echo $#
  echo $len(argv)
  echo $1 $2
  echo @argv
}
"""
28
29from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30 state_i)
31from _devbuild.gen.value_asdl import (value, value_e, value_t)
32from mycpp.mylib import log
33from core import pyutil
34from frontend import consts
35from mycpp import mylib
36from mycpp.mylib import tagswitch
37
38from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
39if TYPE_CHECKING:
40 from core.state import Mem
41 from _devbuild.gen.runtime_asdl import span_t
42 Span = Tuple[span_t, int]
43
# IFS value used when the IFS variable is unset: space, tab, newline.
DEFAULT_IFS = ' \t\n'
45
46
def _SpansToParts(s, spans):
    # type: (str, List[Span]) -> List[str]
    """Helper for SplitForWordEval: turn spans over `s` into a list of parts.

    Args:
      s: the string that was split
      spans: (span type, end index) pairs.  Each span covers the slice of `s`
        from the previous span's end index (0 for the first span) up to its
        own end index.

    Returns:
      One string per part.  Only Black spans contribute text.  Once a
      Backslash span directly follows a Black span, the next Black span is
      appended to the most recent part instead of starting a new one.
    """
    parts = []  # type: List[mylib.BufWriter]
    start_index = 0

    # If the last span was black, and we get a backslash, set join_next to
    # merge two black spans.
    join_next = False
    last_span_was_black = False

    for span_type, end_index in spans:
        if span_type == span_e.Black:
            if len(parts) and join_next:
                # Continue the previous part across the backslash.
                parts[-1].write(s[start_index:end_index])
                join_next = False
            else:
                buf = mylib.BufWriter()
                buf.write(s[start_index:end_index])
                parts.append(buf)

            last_span_was_black = True

        elif span_type == span_e.Backslash:
            if last_span_was_black:
                join_next = True
            last_span_was_black = False

        else:
            # Any other span type contributes no text to the result.
            last_span_was_black = False

        # The next span starts where this one ended.
        start_index = end_index

    result = [buf.getvalue() for buf in parts]
    return result
82
83
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

        echo $x  # uses default shell IFS
        IFS=':' myfunc  # new splitter
        echo $x  # uses default shell IFS again.

    Splitters are looked up by IFS value and cached, so each distinct IFS
    string only builds one IfsSplitter.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache of IFS value -> splitter instance.  Each splitter holds the
        # IFS value split into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (str) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: explicit IFS value to use.  If None, the value of the IFS
            variable is looked up with dynamic scope; an undefined IFS means
            DEFAULT_IFS.

        Returns:
          The cached IfsSplitter for that IFS value, creating it on first use.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Figure out what kind of splitter we should instantiate.

            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: \ not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.
            # IFS=$' \t' is the same as IFS=$'\t '.  But most programs
            # probably don't do that, and everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """For decaying arrays by joining, eg.

        "$@" -> $@. array

        Returns:
          ' ' if IFS is undefined, the first character of IFS if it is a
          non-empty string, or '' if IFS is the empty string.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable,
        # or by a <space> if IFS is unset. If IFS is set to a null string, this
        # is not equivalent to unsetting it; its first character does not
        # exist, so the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars in `s`, using the splitter for the current IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by word evaluation.

        Also used by the explicit shSplit() function.

        Args:
          s: string to split
          ifs: explicit IFS value, or None to consult the IFS variable

        Returns:
          The list of parts that `s` splits into.
        """
        sp = self._GetSplitter(ifs=ifs)
        spans = sp.Split(s, True)

        # Note: pass allow_escape=False so \ isn't special
        #spans = sp.Split(s, False)

        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape, do_split):
        # type: (str, bool, bool) -> List[Span]
        """Split a line into spans; the spans are returned as-is.

        Args:
          line: string to split
          allow_escape: passed through to the splitter's Split()
          do_split: if False, split as though IFS were the empty string

        Returns:
          The list of spans produced by the splitter.
        """
        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self._GetSplitter(ifs=ifs)
        return sp.Split(line, allow_escape)
204
205
class _BaseSplitter(object):
    """Base class for splitters; holds the set of characters to escape."""

    def __init__(self, escape_chars):
        # type: (str) -> None
        """
        Args:
          escape_chars: characters that Escape() should backslash-escape, in
            addition to backslash itself.
        """
        self.escape_chars = escape_chars + '\\'  # Backslash is always escaped

    def Escape(self, s):
        # type: (str) -> str
        """Return `s` with each of self.escape_chars backslash-escaped."""
        # Note the characters here are DYNAMIC, unlike other usages of
        # BackslashEscape().
        return pyutil.BackslashEscape(s, self.escape_chars)
217
218
class IfsSplitter(_BaseSplitter):
    """Split a string on IFS whitespace and IFS non-whitespace characters.

    The two sets of delimiter bytes are kept apart because Split() classifies
    them differently (DE_White vs. DE_Gray) before consulting the state
    machine.
    """

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_whitespace: the whitespace characters of IFS
          ifs_other: the remaining (non-whitespace) characters of IFS
        """
        # Every IFS character is escaped by Escape().
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def __repr__(self):
        # type: () -> str
        return '<IfsSplitter whitespace=%r other=%r>' % (self.ifs_whitespace,
                                                         self.ifs_other)

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Split a string into spans by running the IFS state machine.

        Args:
          s: string to split
          allow_escape: False for read -r, which means a backslash has no
            special meaning and is treated like any other character.

        Returns:
          List of (runtime.span, end_index) pairs, where end_index is the
          index in `s` at which the span ends.  Empty if `s` is empty.

        Raises:
          AssertionError: if the state machine reports an invalid transition
            or an unknown action.

        TODO: This should be (frag, do_split) pairs, to avoid the
        double-escaping issue when IFS is set to a backslash.
        """
        ws_chars = self.ifs_whitespace
        other_chars = self.ifs_other

        n = len(s)
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        if n == 0:
            return spans  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the
        # input".  This can't really be handled by the state machine.

        i = 0
        while i < n and mylib.ByteInSet(mylib.ByteAt(s, i), ws_chars):
            i += 1

        # Append an ignored span.
        if i != 0:
            spans.append((span_e.Delim, i))

        # String is ONLY whitespace.  We want to skip the last span after the
        # while loop.
        if i == n:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            # Classify the current byte, or the end of the string.
            if i < n:
                byte = mylib.ByteAt(s, i)

                if mylib.ByteInSet(byte, ws_chars):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, other_chars):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            elif i == n:
                # one more iteration for the end of the string
                ch = char_kind_i.Sentinel

            else:
                raise AssertionError()  # shouldn't happen

            new_state, action = consts.IfsEdge(state, ch)
            if new_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', i, byte, ch,
                    state, new_state, action)

            # Record the span that ends at index i, if the transition emits
            # one.
            if action == emit_i.Part:
                spans.append((span_e.Black, i))
            elif action == emit_i.Delim:
                spans.append((span_e.Delim, i))  # ignored delimiter
            elif action == emit_i.Empty:
                spans.append((span_e.Delim, i))  # ignored delimiter
                # EMPTY part that is NOT ignored
                spans.append((span_e.Black, i))
            elif action == emit_i.Escape:
                spans.append((span_e.Backslash, i))  # the backslash itself
            elif action == emit_i.Nothing:
                pass
            else:
                raise AssertionError()

            state = new_state
            i += 1

        return spans