OILS / ysh / regex_translate.py View on Github | oils.pub

410 lines, 250 significant
1#!/usr/bin/env python2
2"""regex_translate.py."""
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import (
6 PosixClass,
7 PerlClass,
8 CharCode,
9 CharRange,
10 char_class_term_e,
11 char_class_term_t,
12 re,
13 re_e,
14 re_repeat,
15 re_repeat_e,
16 EggexFlag,
17 Token,
18)
19from _devbuild.gen.id_kind_asdl import Id
20from _devbuild.gen.value_asdl import value
21from core.error import e_die, p_die
22from data_lang import j8
23from frontend import lexer
24from mycpp.mylib import log, tagswitch, switch
25from osh import glob_ # for ExtendedRegexEscape
26
27from typing import List, Optional, TYPE_CHECKING, cast
28
29if TYPE_CHECKING:
30 from _devbuild.gen.syntax_asdl import re_t
31
32from libc import REG_ICASE, REG_NEWLINE
33
34_ = log
35
36PERL_CLASS = {
37 'd': '[:digit:]',
38 # Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
39 'w': '[:alpha:][:digit:]_',
40 # Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
41 's': '[:space:]',
42}
43
44# ERE's in POSIX:
45# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
46#
47# NOTE: They don't support \x00 or \u1234 as in Perl/Python!
48#
49# It's hard to grep for tabs with BRE or ERE syntax. You have to use:
50#
51# (1) a literal tab with Ctrl-V, or
52# (2) bash syntax: grep $'\t' foo.txt
53# (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
54#
55# I ran into this in test/lint.sh !!!
56#
57# https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
58
59# Algorithm:
60# - Unicode escapes in BREs disallowed
61# - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
62#
63# What about utf-8 encoded characters? Those work within literals, but are
64# problematic within character sets. There's no way to disallow those in
65# general though.
66
67CH_RBRACKET = 0x5d
68CH_BACKSLASH = 0x5c
69CH_CARET = 0x5e
70CH_HYPHEN = 0x2d
71
72FLAG_RBRACKET = 0b0001
73FLAG_BACKSLASH = 0b0010
74FLAG_CARET = 0b0100
75FLAG_HYPHEN = 0b1000
76
77
78def _CharCodeToEre(term, parts, special_char_flags):
79 # type: (CharCode, List[str], List[int]) -> None
80 """Translate CharCode like 'a' or \\x7f or \\u{7f} to ERE
81
82 \\x7f has u_braced == false
83 \\u{7f} has u_braced == true
84
85 Are these code points? TODO: it depends on the libc ERE semantics
86
87 special_char_flags: a List of ONE int that is mutated.
88 """
89
90 char_int = term.i
91 if char_int == 0:
92 e_die("ERE can't express char code %d" % char_int, term.blame_tok)
93
94 if term.u_braced: # \u{3bc}
95 # max is \u{10ffff} is checked at PARSE time
96 pass
97 else:
98 if char_int >= 128: # 128 is 0x80
99 # \yff can't be represented in ERE if LC_ALL=utf-8
100 # It should be \u{ff} instead
101 e_die("ERE can't express high bytes %d" % char_int, term.blame_tok)
102
103 # note: mycpp doesn't handle
104 # special_char_flags[0] |= FLAG_HYPHEN
105 mask = special_char_flags[0]
106
107 if char_int == CH_HYPHEN:
108 mask |= FLAG_HYPHEN
109 elif char_int == CH_CARET:
110 mask |= FLAG_CARET
111 elif char_int == CH_RBRACKET:
112 mask |= FLAG_RBRACKET
113 elif char_int == CH_BACKSLASH:
114 mask |= FLAG_BACKSLASH
115 else:
116 s = j8.Utf8Encode(char_int)
117 parts.append(s)
118
119 special_char_flags[0] = mask
120
121
122def _CharClassTermToEre(term, parts, special_char_flags):
123 # type: (char_class_term_t, List[str], List[int]) -> None
124 """special_char_flags: list of single int that is mutated."""
125
126 UP_term = term
127 with tagswitch(term) as case:
128 if case(char_class_term_e.CharRange):
129 term = cast(CharRange, UP_term)
130
131 # Create our own flags
132 range_no_special = [0]
133
134 _CharCodeToEre(term.start, parts, range_no_special)
135 if range_no_special[0] != 0:
136 e_die(
137 "Can't use char %d as start of range in ERE syntax" %
138 term.start.i, term.start.blame_tok)
139
140 parts.append('-') # a-b
141
142 _CharCodeToEre(term.end, parts, range_no_special)
143 if range_no_special[0] != 0:
144 e_die(
145 "Can't use char %d as end of range in ERE syntax" %
146 term.end.i, term.end.blame_tok)
147
148 elif case(char_class_term_e.CharCode):
149 term = cast(CharCode, UP_term)
150
151 _CharCodeToEre(term, parts, special_char_flags)
152
153 elif case(char_class_term_e.PerlClass):
154 term = cast(PerlClass, UP_term)
155 n = term.name
156 chars = PERL_CLASS[term.name] # looks like '[:digit:]'
157 if term.negated:
158 e_die("Perl classes can't be negated in ERE", term.negated)
159 else:
160 pat = '%s' % chars
161 parts.append(pat)
162
163 elif case(char_class_term_e.PosixClass):
164 term = cast(PosixClass, UP_term)
165 n = term.name # looks like 'digit'
166 if term.negated:
167 e_die("POSIX classes can't be negated in ERE", term.negated)
168 else:
169 pat = '[:%s:]' % n
170 parts.append(pat)
171
172 else:
173 raise AssertionError(term)
174
175
176def _AsPosixEre(node, parts, capture_names):
177 # type: (re_t, List[str], List[Optional[str]]) -> None
178 """Translate an Eggex to a POSIX ERE.
179
180 Appends to a list of parts that you have to join.
181 """
182 UP_node = node
183 tag = node.tag()
184
185 if tag == re_e.Primitive:
186 node = cast(re.Primitive, UP_node)
187 if node.id == Id.Eggex_Dot:
188 parts.append('.')
189 elif node.id == Id.Eggex_Start:
190 parts.append('^')
191 elif node.id == Id.Eggex_End:
192 parts.append('$')
193 else:
194 raise AssertionError(node.id)
195 return
196
197 if tag == re_e.LiteralChars:
198 node = cast(re.LiteralChars, UP_node)
199 # The bash [[ x =~ "." ]] construct also has to do this
200
201 # TODO: What about \0 and unicode escapes?
202 # Those won't be as LiteralChars I don't think?
203 # Unless you put them there through \0
204 # Maybe DISALLOW those.
205 # "Unprintable chars should be written as \0 or \x00 or \u0000"
206
207 parts.append(glob_.ExtendedRegexEscape(node.s))
208 return
209
210 if tag == re_e.Seq:
211 node = cast(re.Seq, UP_node)
212 for c in node.children:
213 _AsPosixEre(c, parts, capture_names)
214 return
215
216 if tag == re_e.Alt:
217 node = cast(re.Alt, UP_node)
218 for i, c in enumerate(node.children):
219 if i != 0:
220 parts.append('|')
221 _AsPosixEre(c, parts, capture_names)
222 return
223
224 if tag == re_e.Repeat:
225 node = cast(re.Repeat, UP_node)
226 # 'foo' or "foo" or $x or ${x} evaluated to too many chars
227 if node.child.tag() == re_e.LiteralChars:
228 child = cast(re.LiteralChars, node.child)
229 if len(child.s) > 1:
230 # Note: Other regex dialects have non-capturing groups since we don't
231 # need this.
232 e_die(
233 "POSIX EREs don't have groups without capture, so this node "
234 "needs () around it.", child.blame_tok)
235
236 _AsPosixEre(node.child, parts, capture_names)
237 op = node.op
238 op_tag = op.tag()
239 UP_op = op
240
241 if op_tag == re_repeat_e.Op:
242 op = cast(Token, UP_op)
243 with switch(op.id) as case:
244 if case(Id.Arith_Plus):
245 parts.append('+')
246 elif case(Id.Arith_Star):
247 parts.append('*')
248 elif case(Id.Arith_QMark):
249 parts.append('?')
250 elif case(Id.Expr_DecInt):
251 parts.append('{%s}' % lexer.LazyStr(op))
252 else:
253 raise AssertionError(op.id)
254 return
255
256 if op_tag == re_repeat_e.Range:
257 op = cast(re_repeat.Range, UP_op)
258 parts.append('{%s,%s}' % (op.lower, op.upper))
259 return
260
261 raise NotImplementedError(op_tag)
262
263 # Special case for familiarity: () is acceptable as a group in ERE
264 if tag == re_e.Group:
265 node = cast(re.Group, UP_node)
266
267 # placeholder so we know this group is numbered, but not named
268 capture_names.append(None)
269
270 parts.append('(')
271 _AsPosixEre(node.child, parts, capture_names)
272 parts.append(')')
273 return
274
275 if tag == re_e.Capture:
276 node = cast(re.Capture, UP_node)
277
278 # Collect in order of ( appearance
279 # TODO: get the name string, and type string
280
281 capture_str = lexer.TokenVal(node.name) if node.name else None
282 capture_names.append(capture_str)
283
284 parts.append('(')
285 _AsPosixEre(node.child, parts, capture_names)
286 parts.append(')')
287 return
288
289 if tag == re_e.PerlClass:
290 node = cast(PerlClass, UP_node)
291 n = node.name
292 chars = PERL_CLASS[node.name] # looks like [:digit:]
293 if node.negated:
294 pat = '[^%s]' % chars
295 else:
296 pat = '[%s]' % chars
297 parts.append(pat)
298 return
299
300 if tag == re_e.PosixClass:
301 node = cast(PosixClass, UP_node)
302 n = node.name # looks like 'digit'
303 if node.negated:
304 pat = '[^[:%s:]]' % n
305 else:
306 pat = '[[:%s:]]' % n
307 parts.append(pat)
308 return
309
310 if tag == re_e.CharClass:
311 node = cast(re.CharClass, UP_node)
312
313 # HYPHEN CARET RBRACKET BACKSLASH
314 special_char_flags = [0]
315 non_special_parts = [] # type: List[str]
316
317 for term in node.terms:
318 _CharClassTermToEre(term, non_special_parts, special_char_flags)
319
320 parts.append('[')
321 if node.negated:
322 parts.append('^')
323
324 # Help the user with some of terrible corner cases
325
326 # - move literal - to end [ab-] not [a-b]
327 # - move literal ^ to end [x^-] not [^x-]
328 # - move literal ] to beginning: []x] not [x]]
329 # - double up \\ because of Gawk extension [\\]
330
331 if special_char_flags[0] & FLAG_RBRACKET:
332 parts.append(']')
333
334 parts.extend(non_special_parts)
335
336 if special_char_flags[0] & FLAG_BACKSLASH:
337 parts.append('\\\\') # TWO backslashes
338
339 if special_char_flags[0] & FLAG_CARET:
340 parts.append('^')
341
342 if special_char_flags[0] & FLAG_HYPHEN:
343 parts.append('-')
344
345 parts.append(']')
346 return
347
348 raise NotImplementedError(tag)
349
350
351def AsPosixEre(eggex):
352 # type: (value.Eggex) -> str
353 """
354 Lazily fills in fields on the value.Eggex argument.
355 """
356 if eggex.as_ere is not None:
357 return eggex.as_ere
358
359 parts = [] # type: List[str]
360 _AsPosixEre(eggex.spliced, parts, eggex.capture_names)
361
362 # These are both indexed by group number, with None for the holes
363 # List[str?] vs. List[value?]
364 assert len(eggex.capture_names) == len(eggex.convert_funcs)
365
366 eggex.as_ere = ''.join(parts)
367
368 return eggex.as_ere
369
370
371def CanonicalFlags(flags):
372 # type: (List[EggexFlag]) -> str
373 """
374 Raises PARSE error on invalid flags.
375
376 In theory we could encode directly to integers like REG_ICASE, but a string
377 like like 'i' makes the error message slightly more legible.
378 """
379 letters = [] # type: List[str]
380 for flag in flags:
381 if flag.negated:
382 p_die("Flag can't be negated", flag.flag)
383 flag_name = lexer.TokenVal(flag.flag)
384 if flag_name in ('i', 'reg_icase'):
385 letters.append('i')
386 elif flag_name == 'reg_newline':
387 letters.append('n')
388 else:
389 p_die("Invalid regex flag %r" % flag_name, flag.flag)
390
391 # Normalize for comparison
392 letters.sort()
393 return ''.join(letters)
394
395
396def LibcFlags(canonical_flags):
397 # type: (Optional[str]) -> int
398 if canonical_flags is None:
399 return 0
400
401 libc_flags = 0
402 for ch in canonical_flags:
403 if ch == 'i':
404 libc_flags |= REG_ICASE
405 elif ch == 'n':
406 libc_flags |= REG_NEWLINE
407 else:
408 # regex_translate should prevent this
409 raise AssertionError()
410 return libc_flags