ysh/regex_translate.py

OILS / ysh / regex_translate.py View on Github | oils.pub

410 lines, 250 significant

1	#!/usr/bin/env python2
2	"""regex_translate.py."""
3	from __future__ import print_function
4
5	from _devbuild.gen.syntax_asdl import (
6	PosixClass,
7	PerlClass,
8	CharCode,
9	CharRange,
10	char_class_term_e,
11	char_class_term_t,
12	re,
13	re_e,
14	re_repeat,
15	re_repeat_e,
16	EggexFlag,
17	Token,
18	)
19	from _devbuild.gen.id_kind_asdl import Id
20	from _devbuild.gen.value_asdl import value
21	from core.error import e_die, p_die
22	from data_lang import j8
23	from frontend import lexer
24	from mycpp.mylib import log, tagswitch, switch
25	from osh import glob_ # for ExtendedRegexEscape
26
27	from typing import List, Optional, TYPE_CHECKING, cast
28
29	if TYPE_CHECKING:
30	from _devbuild.gen.syntax_asdl import re_t
31
32	from libc import REG_ICASE, REG_NEWLINE
33
34	_ = log
35
36	PERL_CLASS = {
37	'd': '[:digit:]',
38	# Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
39	'w': '[:alpha:][:digit:]_',
40	# Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
41	's': '[:space:]',
42	}
43
44	# ERE's in POSIX:
45	# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
46	#
47	# NOTE: They don't support \x00 or \u1234 as in Perl/Python!
48	#
49	# It's hard to grep for tabs with BRE or ERE syntax. You have to use:
50	#
51	# (1) a literal tab with Ctrl-V, or
52	# (2) bash syntax: grep $'\t' foo.txt
53	# (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
54	#
55	# I ran into this in test/lint.sh !!!
56	#
57	# https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
58
59	# Algorithm:
60	# - Unicode escapes in BREs disallowed
61	# - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
62	#
63	# What about utf-8 encoded characters? Those work within literals, but are
64	# problematic within character sets. There's no way to disallow those in
65	# general though.
66
67	CH_RBRACKET = 0x5d
68	CH_BACKSLASH = 0x5c
69	CH_CARET = 0x5e
70	CH_HYPHEN = 0x2d
71
72	FLAG_RBRACKET = 0b0001
73	FLAG_BACKSLASH = 0b0010
74	FLAG_CARET = 0b0100
75	FLAG_HYPHEN = 0b1000
76
77
78	def _CharCodeToEre(term, parts, special_char_flags):
79	# type: (CharCode, List[str], List[int]) -> None
80	"""Translate CharCode like 'a' or \\x7f or \\u{7f} to ERE
81
82	\\x7f has u_braced == false
83	\\u{7f} has u_braced == true
84
85	Are these code points? TODO: it depends on the libc ERE semantics
86
87	special_char_flags: a List of ONE int that is mutated.
88	"""
89
90	char_int = term.i
91	if char_int == 0:
92	e_die("ERE can't express char code %d" % char_int, term.blame_tok)
93
94	if term.u_braced: # \u{3bc}
95	# max is \u{10ffff} is checked at PARSE time
96	pass
97	else:
98	if char_int >= 128: # 128 is 0x80
99	# \yff can't be represented in ERE if LC_ALL=utf-8
100	# It should be \u{ff} instead
101	e_die("ERE can't express high bytes %d" % char_int, term.blame_tok)
102
103	# note: mycpp doesn't handle
104	# special_char_flags[0] \|= FLAG_HYPHEN
105	mask = special_char_flags[0]
106
107	if char_int == CH_HYPHEN:
108	mask \|= FLAG_HYPHEN
109	elif char_int == CH_CARET:
110	mask \|= FLAG_CARET
111	elif char_int == CH_RBRACKET:
112	mask \|= FLAG_RBRACKET
113	elif char_int == CH_BACKSLASH:
114	mask \|= FLAG_BACKSLASH
115	else:
116	s = j8.Utf8Encode(char_int)
117	parts.append(s)
118
119	special_char_flags[0] = mask
120
121
122	def _CharClassTermToEre(term, parts, special_char_flags):
123	# type: (char_class_term_t, List[str], List[int]) -> None
124	"""special_char_flags: list of single int that is mutated."""
125
126	UP_term = term
127	with tagswitch(term) as case:
128	if case(char_class_term_e.CharRange):
129	term = cast(CharRange, UP_term)
130
131	# Create our own flags
132	range_no_special = [0]
133
134	_CharCodeToEre(term.start, parts, range_no_special)
135	if range_no_special[0] != 0:
136	e_die(
137	"Can't use char %d as start of range in ERE syntax" %
138	term.start.i, term.start.blame_tok)
139
140	parts.append('-') # a-b
141
142	_CharCodeToEre(term.end, parts, range_no_special)
143	if range_no_special[0] != 0:
144	e_die(
145	"Can't use char %d as end of range in ERE syntax" %
146	term.end.i, term.end.blame_tok)
147
148	elif case(char_class_term_e.CharCode):
149	term = cast(CharCode, UP_term)
150
151	_CharCodeToEre(term, parts, special_char_flags)
152
153	elif case(char_class_term_e.PerlClass):
154	term = cast(PerlClass, UP_term)
155	n = term.name
156	chars = PERL_CLASS[term.name] # looks like '[:digit:]'
157	if term.negated:
158	e_die("Perl classes can't be negated in ERE", term.negated)
159	else:
160	pat = '%s' % chars
161	parts.append(pat)
162
163	elif case(char_class_term_e.PosixClass):
164	term = cast(PosixClass, UP_term)
165	n = term.name # looks like 'digit'
166	if term.negated:
167	e_die("POSIX classes can't be negated in ERE", term.negated)
168	else:
169	pat = '[:%s:]' % n
170	parts.append(pat)
171
172	else:
173	raise AssertionError(term)
174
175
176	def _AsPosixEre(node, parts, capture_names):
177	# type: (re_t, List[str], List[Optional[str]]) -> None
178	"""Translate an Eggex to a POSIX ERE.
179
180	Appends to a list of parts that you have to join.
181	"""
182	UP_node = node
183	tag = node.tag()
184
185	if tag == re_e.Primitive:
186	node = cast(re.Primitive, UP_node)
187	if node.id == Id.Eggex_Dot:
188	parts.append('.')
189	elif node.id == Id.Eggex_Start:
190	parts.append('^')
191	elif node.id == Id.Eggex_End:
192	parts.append('$')
193	else:
194	raise AssertionError(node.id)
195	return
196
197	if tag == re_e.LiteralChars:
198	node = cast(re.LiteralChars, UP_node)
199	# The bash [[ x =~ "." ]] construct also has to do this
200
201	# TODO: What about \0 and unicode escapes?
202	# Those won't be as LiteralChars I don't think?
203	# Unless you put them there through \0
204	# Maybe DISALLOW those.
205	# "Unprintable chars should be written as \0 or \x00 or \u0000"
206
207	parts.append(glob_.ExtendedRegexEscape(node.s))
208	return
209
210	if tag == re_e.Seq:
211	node = cast(re.Seq, UP_node)
212	for c in node.children:
213	_AsPosixEre(c, parts, capture_names)
214	return
215
216	if tag == re_e.Alt:
217	node = cast(re.Alt, UP_node)
218	for i, c in enumerate(node.children):
219	if i != 0:
220	parts.append('\|')
221	_AsPosixEre(c, parts, capture_names)
222	return
223
224	if tag == re_e.Repeat:
225	node = cast(re.Repeat, UP_node)
226	# 'foo' or "foo" or $x or ${x} evaluated to too many chars
227	if node.child.tag() == re_e.LiteralChars:
228	child = cast(re.LiteralChars, node.child)
229	if len(child.s) > 1:
230	# Note: Other regex dialects have non-capturing groups since we don't
231	# need this.
232	e_die(
233	"POSIX EREs don't have groups without capture, so this node "
234	"needs () around it.", child.blame_tok)
235
236	_AsPosixEre(node.child, parts, capture_names)
237	op = node.op
238	op_tag = op.tag()
239	UP_op = op
240
241	if op_tag == re_repeat_e.Op:
242	op = cast(Token, UP_op)
243	with switch(op.id) as case:
244	if case(Id.Arith_Plus):
245	parts.append('+')
246	elif case(Id.Arith_Star):
247	parts.append('*')
248	elif case(Id.Arith_QMark):
249	parts.append('?')
250	elif case(Id.Expr_DecInt):
251	parts.append('{%s}' % lexer.LazyStr(op))
252	else:
253	raise AssertionError(op.id)
254	return
255
256	if op_tag == re_repeat_e.Range:
257	op = cast(re_repeat.Range, UP_op)
258	parts.append('{%s,%s}' % (op.lower, op.upper))
259	return
260
261	raise NotImplementedError(op_tag)
262
263	# Special case for familiarity: () is acceptable as a group in ERE
264	if tag == re_e.Group:
265	node = cast(re.Group, UP_node)
266
267	# placeholder so we know this group is numbered, but not named
268	capture_names.append(None)
269
270	parts.append('(')
271	_AsPosixEre(node.child, parts, capture_names)
272	parts.append(')')
273	return
274
275	if tag == re_e.Capture:
276	node = cast(re.Capture, UP_node)
277
278	# Collect in order of ( appearance
279	# TODO: get the name string, and type string
280
281	capture_str = lexer.TokenVal(node.name) if node.name else None
282	capture_names.append(capture_str)
283
284	parts.append('(')
285	_AsPosixEre(node.child, parts, capture_names)
286	parts.append(')')
287	return
288
289	if tag == re_e.PerlClass:
290	node = cast(PerlClass, UP_node)
291	n = node.name
292	chars = PERL_CLASS[node.name] # looks like [:digit:]
293	if node.negated:
294	pat = '[^%s]' % chars
295	else:
296	pat = '[%s]' % chars
297	parts.append(pat)
298	return
299
300	if tag == re_e.PosixClass:
301	node = cast(PosixClass, UP_node)
302	n = node.name # looks like 'digit'
303	if node.negated:
304	pat = '[^[:%s:]]' % n
305	else:
306	pat = '[[:%s:]]' % n
307	parts.append(pat)
308	return
309
310	if tag == re_e.CharClass:
311	node = cast(re.CharClass, UP_node)
312
313	# HYPHEN CARET RBRACKET BACKSLASH
314	special_char_flags = [0]
315	non_special_parts = [] # type: List[str]
316
317	for term in node.terms:
318	_CharClassTermToEre(term, non_special_parts, special_char_flags)
319
320	parts.append('[')
321	if node.negated:
322	parts.append('^')
323
324	# Help the user with some of terrible corner cases
325
326	# - move literal - to end [ab-] not [a-b]
327	# - move literal ^ to end [x^-] not [^x-]
328	# - move literal ] to beginning: []x] not [x]]
329	# - double up \\ because of Gawk extension [\\]
330
331	if special_char_flags[0] & FLAG_RBRACKET:
332	parts.append(']')
333
334	parts.extend(non_special_parts)
335
336	if special_char_flags[0] & FLAG_BACKSLASH:
337	parts.append('\\\\') # TWO backslashes
338
339	if special_char_flags[0] & FLAG_CARET:
340	parts.append('^')
341
342	if special_char_flags[0] & FLAG_HYPHEN:
343	parts.append('-')
344
345	parts.append(']')
346	return
347
348	raise NotImplementedError(tag)
349
350
351	def AsPosixEre(eggex):
352	# type: (value.Eggex) -> str
353	"""
354	Lazily fills in fields on the value.Eggex argument.
355	"""
356	if eggex.as_ere is not None:
357	return eggex.as_ere
358
359	parts = [] # type: List[str]
360	_AsPosixEre(eggex.spliced, parts, eggex.capture_names)
361
362	# These are both indexed by group number, with None for the holes
363	# List[str?] vs. List[value?]
364	assert len(eggex.capture_names) == len(eggex.convert_funcs)
365
366	eggex.as_ere = ''.join(parts)
367
368	return eggex.as_ere
369
370
371	def CanonicalFlags(flags):
372	# type: (List[EggexFlag]) -> str
373	"""
374	Raises PARSE error on invalid flags.
375
376	In theory we could encode directly to integers like REG_ICASE, but a string
377	like like 'i' makes the error message slightly more legible.
378	"""
379	letters = [] # type: List[str]
380	for flag in flags:
381	if flag.negated:
382	p_die("Flag can't be negated", flag.flag)
383	flag_name = lexer.TokenVal(flag.flag)
384	if flag_name in ('i', 'reg_icase'):
385	letters.append('i')
386	elif flag_name == 'reg_newline':
387	letters.append('n')
388	else:
389	p_die("Invalid regex flag %r" % flag_name, flag.flag)
390
391	# Normalize for comparison
392	letters.sort()
393	return ''.join(letters)
394
395
396	def LibcFlags(canonical_flags):
397	# type: (Optional[str]) -> int
398	if canonical_flags is None:
399	return 0
400
401	libc_flags = 0
402	for ch in canonical_flags:
403	if ch == 'i':
404	libc_flags \|= REG_ICASE
405	elif ch == 'n':
406	libc_flags \|= REG_NEWLINE
407	else:
408	# regex_translate should prevent this
409	raise AssertionError()
410	return libc_flags