"""
split.py - Word Splitting

Nice blog post on the complexity/corner cases/differing intuition of splitting
strings:

https://chriszetter.com/blog/2017/10/29/splitting-strings/

python-dev doesn't want to touch it anymore!

Other possible splitters:

- AwkSplitter -- how does this compare to awk -F?
- RegexSplitter
- CsvSplitter
- TSV2Splitter -- Data is transformed because of # \u0065 in JSON.  So it's not
  a pure slice, but neither is IFS splitting because of backslashes.
- Perl?
  - does perl have a split context?

with SPLIT_REGEX = / digit+ / {
  echo $#
  echo $len(argv)
  echo $1 $2
  echo @argv
}
"""
|
28 |
|
29 | from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
|
30 | state_i)
|
31 | from _devbuild.gen.value_asdl import (value, value_e, value_t)
|
32 | from mycpp.mylib import log
|
33 | from core import pyutil
|
34 | from frontend import consts
|
35 | from mycpp import mylib
|
36 | from mycpp.mylib import tagswitch
|
37 |
|
38 | from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
|
39 | if TYPE_CHECKING:
|
40 | from core.state import Mem
|
41 | from _devbuild.gen.runtime_asdl import span_t
|
42 | Span = Tuple[span_t, int]
|
43 |
|
# IFS value used for splitting when the shell variable IFS is unset: space, tab
# and newline.  These are also exactly the characters that are classified as
# "IFS whitespace" when a splitter is built.
DEFAULT_IFS = ' \t\n'
|
45 |
|
46 |
|
47 | def _SpansToParts(s, spans):
|
48 | # type: (str, List[Span]) -> List[str]
|
49 | """Helper for SplitForWordEval."""
|
50 | parts = [] # type: List[mylib.BufWriter]
|
51 | start_index = 0
|
52 |
|
53 | # If the last span was black, and we get a backslash, set join_next to merge
|
54 | # two black spans.
|
55 | join_next = False
|
56 | last_span_was_black = False
|
57 |
|
58 | for span_type, end_index in spans:
|
59 | if span_type == span_e.Black:
|
60 | if len(parts) and join_next:
|
61 | parts[-1].write(s[start_index:end_index])
|
62 | join_next = False
|
63 | else:
|
64 | buf = mylib.BufWriter()
|
65 | buf.write(s[start_index:end_index])
|
66 | parts.append(buf)
|
67 |
|
68 | last_span_was_black = True
|
69 |
|
70 | elif span_type == span_e.Backslash:
|
71 | if last_span_was_black:
|
72 | join_next = True
|
73 | last_span_was_black = False
|
74 |
|
75 | else:
|
76 | last_span_was_black = False
|
77 |
|
78 | start_index = end_index
|
79 |
|
80 | result = [buf.getvalue() for buf in parts]
|
81 | return result
|
82 |
|
83 |
|
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

        echo $x  # uses default shell IFS
        IFS=':' myfunc  # new splitter
        echo $x  # uses default shell IFS again.

    $IFS is looked up with dynamic scope whenever a splitter is needed, and one
    IfsSplitter is cached per distinct IFS string.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        """
        Args:
          mem: variable storage that $IFS is read from
        """
        self.mem = mem
        # Cache of IFS value -> splitter instance.  Each splitter holds the IFS
        # string split into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetIfsValue(self):
        # type: () -> Optional[str]
        """Look up $IFS with dynamic scope.

        Shared by _GetSplitter() and GetJoinChar() so the lookup and its error
        handling live in one place.

        Returns:
          The string value of IFS, or None if IFS is unset.

        Raises:
          AssertionError: if IFS is neither unset nor a string.
        """
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return None
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                return val.s
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: IFS string to split with.  None means consult $IFS, falling back
            to DEFAULT_IFS when it is unset.

        Returns:
          The IfsSplitter for that IFS string.  It is created on first use and
          cached in self.splitters afterwards.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            ifs = self._GetIfsValue()
            if ifs is None:  # IFS is unset
                ifs = DEFAULT_IFS

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Figure out what kind of splitter we should instantiate.

            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: \ not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.  IFS=$' \t' is
            # the same as IFS=$'\t '.  But most programs probably don't do that, and
            # everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """For decaying arrays by joining, eg.

        "$@" -> $@.  array

        Returns:
          The first character of IFS, ' ' if IFS is unset, or '' if IFS is set
          to the empty string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable, or
        # by a <space> if IFS is unset. If IFS is set to a null string, this is
        # not equivalent to unsetting it; its first character does not exist, so
        # the parameter values are concatenated."
        ifs = self._GetIfsValue()
        if ifs is None:  # unset: join with a space
            return ' '
        if len(ifs):
            return ifs[0]
        return ''  # set but empty: values are concatenated

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars, using the splitter for the current $IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by word evaluation.

        Also used by the explicit shSplit() function.

        Args:
          s: string to split
          ifs: IFS string to split with, or None to consult $IFS

        Returns:
          The fields of s.
        """
        sp = self._GetSplitter(ifs=ifs)
        spans = sp.Split(s, True)

        # Note: pass allow_escape=False so \ isn't special
        #spans = sp.Split(s, False)

        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape, do_split):
        # type: (str, bool, bool) -> List[Span]
        """Split a line into spans, without assembling them into fields.

        Args:
          line: string to split
          allow_escape: passed through to the splitter; False means \\ isn't
            special
          do_split: if False, split as if IFS='' regardless of $IFS

        Returns:
          List of (span type, end index) pairs.
        """
        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self._GetSplitter(ifs=ifs)
        return sp.Split(line, allow_escape)
|
204 |
|
205 |
|
206 | class _BaseSplitter(object):
|
207 |
|
208 | def __init__(self, escape_chars):
|
209 | # type: (str) -> None
|
210 | self.escape_chars = escape_chars + '\\' # Backslash is always escaped
|
211 |
|
212 | def Escape(self, s):
|
213 | # type: (str) -> str
|
214 | # Note the characters here are DYNAMIC, unlike other usages of
|
215 | # BackslashEscape().
|
216 | return pyutil.BackslashEscape(s, self.escape_chars)
|
217 |
|
218 |
|
class IfsSplitter(_BaseSplitter):
    """Split a string according to one particular IFS value.

    The IFS characters are held in two groups: IFS whitespace, and all other
    IFS characters.  Either group may be empty.
    """

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_whitespace: the whitespace characters of IFS
          ifs_other: the remaining characters of IFS
        """
        # Every IFS character is escaped by Escape().
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def __repr__(self):
        # type: () -> str
        return '<IfsSplitter whitespace=%r other=%r>' % (self.ifs_whitespace,
                                                         self.ifs_other)

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        r"""Break s into typed spans by running the IFS state machine over it.

        Args:
          s: string to split
          allow_escape: False for read -r, this means \ doesn't do anything.

        Returns:
          List of (runtime.span, end_index) pairs.  The list is empty when s is
          empty.  Leading IFS whitespace, if any, is reported as a single Delim
          span.

        Raises:
          AssertionError: if the state machine reports an invalid transition or
            an unknown action.

        TODO: This should be (frag, do_split) pairs, to avoid IFS='\'
        double-escaping issue.
        """
        ws_chars = self.ifs_whitespace
        other_chars = self.ifs_other

        n = len(s)
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        if n == 0:
            return spans  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the input"
        # This can't really be handled by the state machine.

        i = 0
        while i < n and mylib.ByteInSet(mylib.ByteAt(s, i), ws_chars):
            i += 1

        # Append an ignored span.
        if i != 0:
            spans.append((span_e.Delim, i))

        # String is ONLY whitespace.  We want to skip the last span after the
        # while loop.
        if i == n:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            # Classify the current byte.  The order matters: IFS characters are
            # recognized before the backslash check.
            if i < n:
                byte = mylib.ByteAt(s, i)

                if mylib.ByteInSet(byte, ws_chars):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, other_chars):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            elif i == n:
                ch = char_kind_i.Sentinel  # one more iterations for the end of string

            else:
                raise AssertionError()  # shouldn't happen

            new_state, action = consts.IfsEdge(state, ch)
            if new_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', i, byte, ch,
                    state, new_state, action)

            # Every emitted span ends at the current index i.
            if action == emit_i.Part:
                spans.append((span_e.Black, i))
            elif action == emit_i.Delim:
                spans.append((span_e.Delim, i))  # ignored delimiter
            elif action == emit_i.Empty:
                spans.append((span_e.Delim, i))  # ignored delimiter
                # EMPTY part that is NOT ignored
                spans.append((span_e.Black, i))
            elif action == emit_i.Escape:
                spans.append((span_e.Backslash, i))  # backslash span
            elif action == emit_i.Nothing:
                pass
            else:
                raise AssertionError()

            state = new_state
            i += 1

        return spans
|