OILS / frontend / lexer_gen.py View on Github | oilshell.org

500 lines, 252 significant
1#!/usr/bin/env python2
2"""Lex_gen.py."""
3from __future__ import print_function
4
5from _devbuild.gen.id_kind import (TEST_UNARY_LOOKUP, TEST_BINARY_LOOKUP,
6 TEST_OTHER_LOOKUP)
7from _devbuild.gen.id_kind_asdl import Id_str
8from _devbuild.gen.types_asdl import lex_mode_str
9
10import cStringIO
11import sys
12import sre_parse
13import sre_constants
14
15from frontend import lexer_def
16
17
18def PrintTree(re_tree, depth=2):
19 """
20 re_tree: List of children
21 """
22 for child in re_tree:
23 name, arg = child
24 sys.stdout.write(depth * '\t')
25 sys.stdout.write(name)
26 sys.stdout.write(' ')
27 if name == 'in': # character class
28 print('{')
29 PrintTree(arg, depth=depth + 1)
30 sys.stdout.write(depth * '\t')
31 print('}')
32 elif name == 'max_repeat': # repetition
33 min_, max_, children = arg
34 # min = 0 means *, min = 1 means +
35 assert min_ in (0, 1), min_
36 print(min_, max_, '{')
37 PrintTree(children, depth=depth + 1)
38 sys.stdout.write(depth * '\t')
39 print()
40 elif name == 'negate': # Oh this is a ^. It doesn't form a node.
41 assert arg is None
42 print()
43 elif name == 'literal': # Quote \ and " in re2c syntax
44 print(repr(chr(arg)))
45 elif name == 'not_literal': # ditto
46 print(repr(chr(arg)))
47 elif name == 'range': # ascii range
48 begin, end = arg
49 print(repr(chr(begin)), repr(chr(end)))
50 elif name == 'any': # This is the '.' character
51 assert arg is None
52 print()
53 else:
54 raise AssertionError(name)
55
56 # NOTE: negate and not_literal are sort of duplicated
57
58
59def PrintRegex(pat):
60 re_tree = sre_parse.parse(pat)
61 print('\t\t[')
62 PrintTree(re_tree)
63 print('\t\t]')
64
65
66# re2c literals are inside double quotes, so we need to escape \ and "
67# But we don't need to do anything with the quoted literal ^
68LITERAL_META = ['\\', '"']
69LITERAL_META_CODES = [ord(c) for c in LITERAL_META]
70
71
72def _Literal(arg, char_escapes=LITERAL_META_CODES):
73 if arg == ord('\n'):
74 s = r'\n'
75 elif arg == ord('\r'):
76 s = r'\r'
77 elif arg == ord('\t'):
78 s = r'\t'
79 elif arg >= 0x80 or arg < 0x20: # 0x1f and below
80 s = '\\x%02x' % arg # for \x80-\xff
81 elif arg in char_escapes:
82 s = '\\' + chr(arg)
83 else:
84 s = chr(arg)
85 return s
86
87
88# - means range. Note that re2c gives an error we uselessly escape \^.
89CHAR_CLASS_META = ['\\', '-', ']']
90CHAR_CLASS_META_CODES = [ord(c) for c in CHAR_CLASS_META]
91
92
93def _CharClassLiteral(arg):
94 return _Literal(arg, char_escapes=CHAR_CLASS_META_CODES)
95
96
97def TranslateConstant(pat):
98 return '"' + ''.join(_Literal(ord(c)) for c in pat) + '"'
99
100
101def TranslateTree(re_tree, f, in_char_class=False):
102 """
103 re_tree: List of children
104 """
105 for child in re_tree:
106 name, arg = child
107 if name == 'in': # character class
108 f.write('[')
109 TranslateTree(arg, f,
110 in_char_class=True) # list of literals/ranges
111 f.write(']')
112
113 elif name == 'branch': # |
114 _, branches = arg
115 for i, branch in enumerate(branches):
116 if i != 0:
117 f.write(' | ')
118 TranslateTree(branch, f)
119
120 elif name == 'max_repeat': # repetition
121 min_, max_, children = arg
122 # min = 0 means *, min = 1 means +
123 #assert min_ in (0, 1), min_
124 TranslateTree(children, f)
125
126 if min_ == 0 and max_ == 1:
127 f.write('? ')
128
129 elif max_ == sre_constants.MAXREPEAT:
130 if min_ == 0:
131 f.write('* ')
132 elif min_ == 1:
133 f.write('+ ')
134 else:
135 assert 0, min_
136
137 else: # re2c also supports [0-7]{1,2} syntax
138 # note: might generated {2,2}
139 f.write('{%d,%d} ' % (min_, max_))
140
141 elif name == 'negate': # ^ in [^a-z]
142 assert arg is None
143 f.write('^')
144
145 elif name == 'literal': # Quote \ and " in re2c syntax
146 # TODO: it matters if we're inside a character class
147 #print("literal ARG %r" % arg)
148
149 if in_char_class:
150 s = _CharClassLiteral(arg)
151 else:
152 s = '"%s" ' % _Literal(arg)
153 f.write(s)
154
155 elif name == 'not_literal': # ditto
156 assert not in_char_class
157 f.write('[^%s]' % _CharClassLiteral(arg))
158
159 elif name == 'range': # ascii range
160 begin, end = arg
161 f.write('%s-%s' %
162 (_CharClassLiteral(begin), _CharClassLiteral(end)))
163
164 elif name == 'any': # This is the '.' character
165 assert arg is None
166 f.write('.')
167
168 elif name == 'subpattern':
169 _, children = arg # Not sure what the _ is, but it works
170 f.write('(')
171 TranslateTree(children, f)
172 f.write(')')
173
174 else:
175 if 0:
176 from mycpp.mylib import log
177 log('child %s', child)
178 raise RuntimeError("I don't understand regex construct: %r" % name)
179
180 # NOTE: negate and not_literal are sort of duplicated
181
182
183def TranslateRegex(pat):
184 re_tree = sre_parse.parse(pat)
185 # For debugging
186 #import pprint
187 #print(pprint.pformat(re_tree), file=sys.stderr)
188 f = cStringIO.StringIO()
189 try:
190 TranslateTree(re_tree, f)
191 except RuntimeError:
192 print('Error translating %r' % pat, file=sys.stderr)
193 raise
194 return f.getvalue()
195
196
197# This explains the sentinel method, which we will use.
198# http://re2c.org/examples/example_01.html
199#
200# TODO: Change ParseTuple to use 's' rather than '#s' ?
201
202# I don't think we need this YYFILL mechanism, because we lex a line at a
203# time.
204# http://re2c.org/examples/example_03.html
205
206
207def TranslateSimpleLexer(func_name, lexer_def):
208 print(r"""
209static inline void %s(const unsigned char* line, int line_len,
210 int start_pos, int* id, int* end_pos) {
211 assert(start_pos <= line_len); /* caller should have checked */
212
213 const unsigned char* p = line + start_pos; /* modified by re2c */
214
215 /* Echo and History lexer apparently need this, but others don't */
216 __attribute__((unused)) const unsigned char* YYMARKER;
217
218 for (;;) {
219 /*!re2c
220""" % func_name)
221
222 for is_regex, pat, id_ in lexer_def:
223 if is_regex:
224 re2c_pat = TranslateRegex(pat)
225 else:
226 re2c_pat = TranslateConstant(pat)
227 id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
228 print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
229
230 # EARLY RETURN: Do NOT advance past the NUL terminator.
231 print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
232 r'"\x00"')
233
234 print("""
235 */
236 }
237 *end_pos = p - line; /* relative */
238}
239""")
240
241
242def TranslateBracket(func_name, token_dict):
243 print(r"""
244static inline int %s(const unsigned char* s, int len) {
245 const unsigned char* p = s; /* modified by re2c */
246 const unsigned char* end = s + len;
247
248 __attribute__((unused)) const unsigned char* YYMARKER;
249 int id;
250
251 for (;;) {
252 /*!re2c
253""" % func_name)
254
255 for pat in sorted(token_dict):
256 id_ = token_dict[pat]
257 re2c_pat = TranslateConstant(pat)
258 id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
259 print(' %-30s { id = id__%s; break; }' % (re2c_pat, id_name))
260
261 # EARLY RETURN: Do NOT advance past other chars, including the NUL
262 # terminator.
263 print(' %-30s { return id__Undefined_Tok; }' % '*')
264
265 print("""
266 */
267 }
268 // must be an exact match
269 return (p == end) ? id : id__Undefined_Tok;
270}
271""")
272
273
274def StringToInt(func_name, name_def):
275 print(r"""
276static inline void %s(const unsigned char* s, int len, int* id) {
277 const unsigned char* p = s; /* modified by re2c */
278 const unsigned char* end = s + len;
279
280 //fprintf(stderr, "*** s = %%s\n", s);
281
282 for (;;) {
283 /*!re2c
284""" % func_name)
285
286 for name, enum in name_def:
287 re2c_pat = TranslateConstant(name)
288 print(' %-30s { *id = %s; break; }' % (re2c_pat, enum))
289
290 # Not found. * matches anything else.
291 print(' %-30s { *id = 0; return; }' % \
292 r'*')
293
294 print(r"""
295 */
296 }
297 if (p != end) {
298 //fprintf(stderr, "EXTRA CHARS\n", s);
299 *id = 0; // Not an exact match
300 }
301}
302""")
303
304 # TODO: Check that we're at the END OF THE STRING
305
306
307def TranslateOshLexer(lexer_def):
308 # https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
309 # Has to be 'static inline' rather than 'inline', otherwise the
310 # _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
311 # Since we reference this function in exactly one translation unit --
312 # fastlex.c, the difference is moot, and we just satisfy the compiler.
313
314 print(r"""
315/* Common stuff */
316
317/*!re2c
318 re2c:define:YYCTYPE = "unsigned char";
319 re2c:define:YYCURSOR = p;
320 re2c:yyfill:enable = 0; // generated code doesn't ask for more input
321*/
322
323static inline void MatchOshToken(int lex_mode, const unsigned char* line, int line_len,
324 int start_pos, int* id, int* end_pos) {
325 assert(start_pos <= line_len); /* caller should have checked */
326
327 const unsigned char* p = line + start_pos; /* modified by re2c */
328 //printf("p: %p q: %p\n", p, q);
329
330 __attribute__((unused)) const unsigned char* YYMARKER; /* why do we need this? */
331 switch (lex_mode) {
332""")
333
334 # TODO: Should be ordered by most common? Or will profile-directed feedback
335 # help?
336 for state, pat_list in lexer_def.iteritems():
337 # e.g. lex_mode.DQ => lex_mode__DQ
338 print(' case %s:' % lex_mode_str(state).replace('.', '__'))
339 print(' for (;;) {')
340 print(' /*!re2c')
341
342 for is_regex, pat, id_ in pat_list:
343 if is_regex:
344 re2c_pat = TranslateRegex(pat)
345 else:
346 re2c_pat = TranslateConstant(pat)
347 id_name = Id_str(id_).split('.')[-1] # e.g. Undefined_Tok
348 print(' %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))
349
350 # EARLY RETURN: Do NOT advance past the NUL terminator.
351 print(' %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
352 r'"\x00"')
353
354 print(' */')
355 print(' }')
356 print(' break;')
357 print()
358
359 # This is literal code without generation:
360 """
361 case lex_mode__OUTER:
362 for (;;) {
363 /*!re2c
364 literal_chunk = [a-zA-Z0-9_/.-]+;
365 var_like = [a-zA-Z_][a-zA-Z0-9_]* "="; // might be NAME=val
366 comment = [ \t\r]* "#" [^\000\r\n]*;
367 space = [ \t\r]+;
368 nul = "\000";
369
370 literal_chunk { *id = id__Lit_Chars; break; }
371 var_like { *id = id__Lit_VarLike; break; }
372
373 [ \t\r]* "\n" { *id = id__Op_Newline; break; }
374 space { *id = id__WS_Space; break; }
375
376 nul { *id = id__Eof_Real; break; }
377
378 // anything else
379 * { *id = id__Lit_Other; break; }
380
381 */
382 }
383
384 *end_pos = p - line;
385 break;
386
387 case lex_mode__COMMENT:
388 *id = id__Lit_Other;
389 *end_pos = 6;
390 break;
391 """
392
393 print("""\
394 default:
395 assert(0);
396
397 }
398 *end_pos = p - line; /* relative */
399}
400""")
401
402
403def TranslateRegexToPredicate(py_regex, func_name):
404 re2c_pat = TranslateRegex(py_regex)
405 print(r"""
406static inline int %s(const unsigned char* s, int len) {
407 const unsigned char* p = s; /* modified by re2c */
408 const unsigned char* end = s + len;
409
410 /* MatchBraceRangeToken needs this, but others don't */
411 __attribute__((unused)) const unsigned char* YYMARKER;
412
413 /*!re2c
414 re2c:define:YYCTYPE = "unsigned char";
415 re2c:define:YYCURSOR = p;
416 %-30s { return p == end; } // Match must be anchored right, like $
417 * { return 0; }
418 */
419}
420""" % (func_name, re2c_pat))
421
422
423 # note: use YYCURSOR and YYLIMIT
424 # limit should be the end of string
425 # line + line_len
426def main(argv):
427 action = argv[1]
428 if action == 'c':
429 # Code is printed to stdout
430
431 TranslateOshLexer(lexer_def.LEXER_DEF)
432
433 TranslateSimpleLexer('MatchEchoToken', lexer_def.ECHO_E_DEF)
434 TranslateSimpleLexer('MatchGlobToken', lexer_def.GLOB_DEF)
435 TranslateSimpleLexer('MatchPS1Token', lexer_def.PS1_DEF)
436 TranslateSimpleLexer('MatchHistoryToken', lexer_def.HISTORY_DEF)
437 TranslateSimpleLexer('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF)
438 TranslateSimpleLexer('MatchJ8Token', lexer_def.J8_DEF)
439 TranslateSimpleLexer('MatchJ8LinesToken', lexer_def.J8_LINES_DEF)
440 TranslateSimpleLexer('MatchJ8StrToken', lexer_def.J8_STR_DEF)
441 TranslateSimpleLexer('MatchJsonStrToken', lexer_def.JSON_STR_DEF)
442 TranslateSimpleLexer('MatchShNumberToken', lexer_def.SH_NUMBER_DEF)
443
444 TranslateRegexToPredicate(lexer_def.VAR_NAME_RE, 'IsValidVarName')
445 TranslateRegexToPredicate(lexer_def.SHOULD_HIJACK_RE, 'ShouldHijack')
446 TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_INTEGER,
447 'LooksLikeInteger')
448 TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_YSH_INT,
449 'LooksLikeYshInt')
450 TranslateRegexToPredicate(lexer_def.LOOKS_LIKE_YSH_FLOAT,
451 'LooksLikeYshFloat')
452
453 TranslateBracket('BracketUnary', TEST_UNARY_LOOKUP)
454 TranslateBracket('BracketBinary', TEST_BINARY_LOOKUP)
455 TranslateBracket('BracketOther', TEST_OTHER_LOOKUP)
456
457 elif action == 'print-all':
458 # Top level is a switch statement.
459 for state, pat_list in lexer_def.LEXER_DEF.iteritems():
460 print(state)
461 # This level is re2c patterns.
462 for is_regex, pat, token_id in pat_list:
463 print('\t%r -> %r' % (pat, token_id))
464 if is_regex:
465 #print re_tree
466 _ = TranslateRegex(pat)
467 #print out_pat
468
469 print()
470
471 elif action == 'print-regex':
472 unique = set()
473
474 num_regexes = 0
475 for state, pat_list in lexer_def.LEXER_DEF.iteritems():
476 print(state)
477 # This level is re2c patterns.
478 for is_regex, pat, token_id in pat_list:
479 #print '\t%r -> %r' % (pat, token_id)
480 if is_regex:
481 print('\t' + pat)
482 print('\t' + TranslateRegex(pat))
483 print()
484 #PrintRegex(pat)
485 num_regexes += 1
486 unique.add(pat)
487 else:
488 print('\t' + TranslateConstant(pat))
489
490 print()
491
492 print('Printed %d regexes (%d unique)' % (num_regexes, len(unique)))
493
494
495if __name__ == '__main__':
496 try:
497 main(sys.argv)
498 except RuntimeError as e:
499 print('FATAL: %s' % e, file=sys.stderr)
500 sys.exit(1)