| 1 | """
|
| 2 | split.py - Word Splitting
|
| 3 |
|
| 4 | Nice blog post on the complexity/corner cases/differing intuition of splitting
|
| 5 | strings:
|
| 6 |
|
| 7 | https://chriszetter.com/blog/2017/10/29/splitting-strings/
|
| 8 |
|
| 9 | python-dev doesn't want to touch it anymore!
|
| 10 |
|
| 11 | Other possible splitters:
|
| 12 |
|
| 13 | - AwkSplitter -- how does this compare to awk -F?
|
| 14 | - RegexSplitter
|
| 15 | - CsvSplitter
|
| 16 | - TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
|
| 17 | a pure slice, but neither is IFS splitting because of backslashes.
|
| 18 | - Perl?
|
| 19 | - does perl have a split context?
|
| 20 |
|
| 21 | with SPLIT_REGEX = / digit+ / {
|
| 22 | echo $#
|
| 23 | echo $len(argv)
|
| 24 | echo $1 $2
|
| 25 | echo @argv
|
| 26 | }
|
| 27 | """
|
| 28 |
|
| 29 | from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
|
| 30 | state_i)
|
| 31 | from _devbuild.gen.value_asdl import (value, value_e, value_t)
|
| 32 | from mycpp.mylib import log
|
| 33 | from core import pyutil, pyos
|
| 34 | from frontend import consts
|
| 35 | from mycpp import mylib
|
| 36 | from mycpp.mylib import tagswitch
|
| 37 | from osh import glob_
|
| 38 |
|
| 39 | from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
|
| 40 | if TYPE_CHECKING:
|
| 41 | from core.state import Mem
|
| 42 | from _devbuild.gen.runtime_asdl import span_t
|
| 43 | Span = Tuple[span_t, int]
|
| 44 |
|
# IFS value used when the IFS variable is undefined: space, tab, newline.
DEFAULT_IFS = ' \t\n'
|
| 46 |
|
| 47 |
|
def _SpansToParts(s, spans):
    # type: (str, List[Span]) -> List[str]
    """Turn a list of (span type, end index) pairs over `s` into string parts.

    Each Black span contributes the slice of `s` it covers.  Two Black spans
    separated only by a Backslash span are merged into a single part.  All
    other span types contribute nothing.

    Args:
      s: the string the spans refer to
      spans: (span type, end index) pairs; each span starts where the
        previous one ended, and the first one starts at index 0

    Returns:
      The parts, in order of appearance.
    """
    writers = []  # type: List[mylib.BufWriter]
    begin = 0

    # Set when a Backslash span directly follows a Black span: the next Black
    # span is then appended to the previous part instead of starting a new one.
    merge_pending = False
    prev_black = False

    for kind, end in spans:
        is_black = (kind == span_e.Black)

        if is_black:
            chunk = s[begin:end]
            if merge_pending and len(writers):
                writers[-1].write(chunk)
                merge_pending = False
            else:
                w = mylib.BufWriter()
                w.write(chunk)
                writers.append(w)

        elif kind == span_e.Backslash and prev_black:
            merge_pending = True

        prev_black = is_black
        begin = end

    return [w.getvalue() for w in writers]
|
| 83 |
|
| 84 |
|
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

    echo $x  # uses default shell IFS
    IFS=':' myfunc  # new splitter
    echo $x  # uses default shell IFS again.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache of splitters, keyed by IFS value.  Each splitter stores the
        # IFS characters partitioned into (ifs_whitespace, ifs_other).
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: explicit IFS value to use.  When None, the value of the IFS
            variable is looked up with dynamic scope, and DEFAULT_IFS is used
            if it is undefined.

        Returns:
          The cached IfsSplitter for that IFS value, created on first use.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        if ifs is None:
            # Like _ESCAPER, this has dynamic scope!
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)  # cache lookup
        if sp is None:
            # Figure out what kind of splitter we should instantiate.

            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: \ not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.  IFS=$' \t'
            # is the same as IFS=$'\t '.  But most programs probably don't do
            # that, and everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """Return the separator for decaying arrays by joining.

        Returns:
          ' ' if IFS is undefined, the first character of IFS if it is a
          non-empty string, and '' if it is the empty string.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable, or
        # by a <space> if IFS is unset. If IFS is set to a null string, this is
        # not equivalent to unsetting it; its first character does not exist, so
        # the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars in `s`, using the splitter for the current IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def CreateSplitterState(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitterState
        """Create a fresh incremental splitter state.

        Args:
          ifs: explicit IFS value, or None to consult the IFS variable.
        """
        sp = self._GetSplitter(ifs=ifs)
        return IfsSplitterState(sp.ifs_whitespace, sp.ifs_other)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by the explicit shSplit() function.

        Backslash escapes are honored.

        Args:
          s: string to split
          ifs: explicit IFS value, or None to consult the IFS variable.

        Returns:
          The list of words.
        """
        sp = self.CreateSplitterState(ifs=ifs)
        sp.SetAllowEscape(True)
        sp.PushFragment(s)
        return sp.PushTerminator()

    def SplitForRead(self, line, allow_escape, do_split, max_parts):
        # type: (str, bool, bool, int) -> List[str]
        """Split a line into parts.

        Args:
          line: the line to split
          allow_escape: whether a backslash escapes the following character
          do_split: if False, split as if IFS were the empty string
          max_parts: the splitter is limited to max_parts - 1 splits; a value
            of 0 therefore yields -1, which the splitter state treats as
            "no limit"

        Returns:
          The list of parts; empty if `line` is empty.
        """
        if len(line) == 0:
            return []

        # None: use the default splitter, consulting $IFS
        # ''  : forces IFS='' behavior
        ifs = None if do_split else ''

        sp = self.CreateSplitterState(ifs=ifs)
        sp.SetAllowEscape(allow_escape)
        sp.SetMaxSplit(max_parts - 1)
        sp.PushFragment(line)
        return sp.PushTerminator()
|
| 208 |
|
| 209 |
|
class _BaseSplitter(object):
    """Base class that knows which characters Escape() must backslash-escape."""

    def __init__(self, escape_chars):
        # type: (str) -> None
        # Backslash is always escaped, in addition to the given characters.
        backslash = '\\'
        self.escape_chars = escape_chars + backslash

    def Escape(self, s):
        # type: (str) -> str
        """Return `s` with every character in self.escape_chars escaped."""
        # Note the characters here are DYNAMIC, unlike other usages of
        # BackslashEscape().
        to_escape = self.escape_chars
        return pyutil.BackslashEscape(s, to_escape)
|
| 221 |
|
| 222 |
|
class IfsSplitter(_BaseSplitter):
    """Split a string when IFS has non-whitespace characters."""

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        # Every IFS character, whitespace or not, is subject to escaping.
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_whitespace = ifs_whitespace
        self.ifs_other = ifs_other

    def __repr__(self):
        # type: () -> str
        fields = (self.ifs_whitespace, self.ifs_other)
        return '<IfsSplitter whitespace=%r other=%r>' % fields

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Run the IFS state machine over `s` and return the spans it emits.

        Args:
          s: string to split
          allow_escape: False for read -r, which means a backslash doesn't do
            anything.

        Returns:
          List of (runtime.span, end_index) pairs
        """
        white = self.ifs_whitespace
        gray = self.ifs_other

        length = len(s)
        # NOTE: in C, could reserve() this to len(s)
        result = []  # type: List[Span]

        if length == 0:
            return result  # empty

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the
        # input".  This can't really be handled by the state machine.

        # 2025-03: This causes a bug with splitting ""$A"" when there's no IFS

        pos = 0
        while pos < length and mylib.ByteInSet(mylib.ByteAt(s, pos), white):
            pos += 1

        # The skipped leading whitespace becomes one ignored span.
        if pos != 0:
            result.append((span_e.Delim, pos))

        # String is ONLY whitespace.  Return before the state machine runs, so
        # that no span is emitted for the end of the string.
        if pos == length:
            return result

        cur = state_i.Start
        while cur != state_i.Done:
            if pos > length:
                raise AssertionError()  # shouldn't happen

            if pos == length:
                # One more iteration for the end of the string
                kind = char_kind_i.Sentinel
            else:
                byte = mylib.ByteAt(s, pos)

                if mylib.ByteInSet(byte, white):
                    kind = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, gray):
                    kind = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    kind = char_kind_i.Backslash
                else:
                    kind = char_kind_i.Black

            nxt, action = consts.IfsEdge(cur, kind)
            if nxt == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (cur, kind))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', pos, byte,
                    kind, cur, nxt, action)

            if action == emit_i.Nothing:
                pass
            elif action == emit_i.Part:
                result.append((span_e.Black, pos))
            elif action == emit_i.Escape:
                result.append((span_e.Backslash, pos))  # \
            elif action == emit_i.Delim or action == emit_i.Empty:
                result.append((span_e.Delim, pos))  # ignored delimiter
                if action == emit_i.Empty:
                    # EMPTY part that is NOT ignored
                    result.append((span_e.Black, pos))
            else:
                raise AssertionError()

            cur = nxt
            pos += 1

        return result
|
| 324 |
|
| 325 |
|
class IfsSplitterState(object):
    """Incremental IFS splitter.

    Text is fed in with PushFragment() (subject to splitting) and
    PushLiteral() (never split); PushTerminator() ends the input and returns
    the words.  The state persists between pushes, so a word may be built
    from several fragments.
    """

    def __init__(self, ifs_space, ifs_other):
        # type: (str, str) -> None
        """
        Args:
          ifs_space: the whitespace characters of IFS
          ifs_other: the remaining characters of IFS
        """
        self.ifs_space = ifs_space
        self.ifs_other = ifs_other
        self.glob_escape = False
        self.allow_escape = False
        self.max_split = -1  # negative: no limit on the number of splits

        self.state = state_i.Start
        # Only meaningful in state Backslash: True if the backslash was seen
        # in state Black, i.e. it interrupted a word that is still buffered.
        self.backslash_in_word = False
        self.args = []  # type: List[str]  # generated words
        self.frags = []  # type: List[str]  # str fragments of the current word
        self.char_buff = []  # type: List[int]  # chars in the current fragment
        self.white_buff = None  # type: Optional[List[int]]  # chars for max_split space

    def SetGlobEscape(self, glob_escape):
        # type: (bool) -> None
        """If True, flushed fragments go through glob_.GlobEscapeBackslash()."""
        self.glob_escape = glob_escape

    def SetAllowEscape(self, allow_escape):
        # type: (bool) -> None
        """If True, a backslash makes the next byte an ordinary word byte."""
        self.allow_escape = allow_escape

    def SetMaxSplit(self, max_split):
        # type: (int) -> None
        """Limit the number of splits; a negative value means no limit."""
        self.max_split = max_split
        if max_split >= 0 and self.white_buff is None:
            # Buffer for the delimiter bytes seen at the last allowed split.
            self.white_buff = []

    def _FlushCharBuff(self):
        # type: () -> None
        """Move buffered bytes, if any, into self.frags as one fragment."""
        if len(self.char_buff) >= 1:
            frag = mylib.JoinBytes(self.char_buff)
            if self.glob_escape:
                frag = glob_.GlobEscapeBackslash(frag)
            self.frags.append(frag)
            del self.char_buff[:]

    def _GenerateWord(self):
        # type: () -> None
        """Finish the current word and append it to self.args."""
        self._FlushCharBuff()
        self.args.append(''.join(self.frags))
        del self.frags[:]

        # Delimiter bytes recorded at the last allowed split are carried over
        # into the next word; PushTerminator() glues that word back on.
        if self.max_split >= 0 and len(self.white_buff) >= 1:
            self.char_buff.extend(self.white_buff)
            del self.white_buff[:]

    def PushLiteral(self, s):
        # type: (str) -> None
        """
        Args:
          s: word fragment that should be literally added
        """
        if self.state == state_i.DE_White1:
            # Pending whitespace delimiter: the previous word is complete.
            self._GenerateWord()
        else:
            self._FlushCharBuff()
        self.frags.append(s)
        self.state = state_i.Black

    def PushFragment(self, s):
        # type: (str) -> None
        """
        Args:
          s: word fragment to split
        """
        ifs_space = self.ifs_space
        ifs_other = self.ifs_other
        allow_escape = self.allow_escape
        max_split = self.max_split
        n = len(s)

        for i in xrange(n):
            byte = mylib.ByteAt(s, i)

            if self.state == state_i.Backslash:
                # Escaped byte: falls through and is added as a word byte.
                pass

            elif max_split >= 0 and len(self.args) == max_split + 1:
                # When max_split is reached, the processing is modified.
                if allow_escape and byte == pyos.BACKSLASH_CH:
                    self.backslash_in_word = (self.state == state_i.Black)
                    self.state = state_i.Backslash
                    continue
                elif mylib.ByteInSet(byte, ifs_space):
                    if self.state == state_i.Start:
                        # Buffered without starting a word, so it is dropped
                        # if nothing else follows.
                        self.char_buff.append(byte)
                        continue

            elif allow_escape and byte == pyos.BACKSLASH_CH:
                if self.state == state_i.DE_White1:
                    self._GenerateWord()
                self.backslash_in_word = (self.state == state_i.Black)
                self.state = state_i.Backslash
                continue
            elif mylib.ByteInSet(byte, ifs_space):
                if self.state != state_i.Start:
                    if len(self.args) == max_split:
                        self.white_buff.append(byte)
                    self.state = state_i.DE_White1
                continue
            elif mylib.ByteInSet(byte, ifs_other):
                if len(self.args) == max_split:
                    self.white_buff.append(byte)
                self._GenerateWord()
                self.state = state_i.Start
                continue

            # Ordinary word byte
            if self.state == state_i.DE_White1:
                self._GenerateWord()
            self.char_buff.append(byte)
            self.state = state_i.Black

    def PushTerminator(self):
        # type: () -> List[str]
        """End the input and return the generated words.

        A dangling backslash at the very end of the input is dropped, but the
        word it interrupted is still emitted.

        Returns:
          self.args, the list of words generated so far.
        """
        # Without this check, input ending in a backslash would silently
        # discard the word that was being built.
        dangling_in_word = (self.state == state_i.Backslash and
                            self.backslash_in_word)
        if dangling_in_word or self.state in (state_i.DE_White1,
                                              state_i.Black):
            self._GenerateWord()
        if self.max_split >= 0 and len(self.args) == self.max_split + 2:
            # TODO: is there an algorithm without this "fix up"?
            # The extra word is the rest of the input including its leading
            # delimiters; glue it onto the last real word.
            last = self.args.pop()
            self.args[-1] = self.args[-1] + last.rstrip(self.ifs_space)
        self.backslash_in_word = False
        self.state = state_i.Start
        return self.args
|