# OILS / frontend / lexer_gen.py
1#!/usr/bin/env python2
2"""Lex_gen.py."""
3from __future__ import print_function
4
5from _devbuild.gen.id_kind import (TEST_UNARY_LOOKUP, TEST_BINARY_LOOKUP,
6 TEST_OTHER_LOOKUP)
7from _devbuild.gen.id_kind_asdl import Id_str
8from _devbuild.gen.types_asdl import lex_mode_str
9
10import cStringIO
11import sys
12import sre_parse
13import sre_constants
14
15from frontend import lexer_def
16
17
18# re2c literals are inside double quotes, so we need to escape \ and "
19# But we don't need to do anything with the quoted literal ^
20LITERAL_META = ['\\', '"']
21LITERAL_META_CODES = [ord(c) for c in LITERAL_META]
22
23
24def _Literal(arg, char_escapes=LITERAL_META_CODES):
25 if arg == ord('\n'):
26 s = r'\n'
27 elif arg == ord('\r'):
28 s = r'\r'
29 elif arg == ord('\t'):
30 s = r'\t'
31 elif arg >= 0x80 or arg < 0x20: # 0x1f and below
32 s = '\\x%02x' % arg # for \x80-\xff
33 elif arg in char_escapes:
34 s = '\\' + chr(arg)
35 else:
36 s = chr(arg)
37 return s
38
39
# - means range. Note that re2c gives an error we uselessly escape \^.
CHAR_CLASS_META = ['\\', '-', ']']
CHAR_CLASS_META_CODES = [ord(ch) for ch in CHAR_CLASS_META]


def _CharClassLiteral(arg):
    """Render a character code for use inside a re2c [...] class.

    Same as _Literal(), but '-' and ']' (not '"') are the chars that
    need escaping inside a character class.
    """
    return _Literal(arg, char_escapes=CHAR_CLASS_META_CODES)
47
48
def TranslateConstant(pat):
    """Quote a fixed string 'pat' as a double-quoted re2c literal."""
    escaped = [_Literal(ord(ch)) for ch in pat]
    return '"%s"' % ''.join(escaped)
51
52
def TranslateTree(re_tree, f, in_char_class=False):
    """Translate a parsed Python regex tree into re2c syntax, writing to f.

    re_tree: list of (name, arg) children, as produced by sre_parse.parse()
      (Python 2 sre_parse uses lowercase string node names)
    f: file-like object the re2c pattern text is written to
    in_char_class: True while emitting inside [...], which changes how
      literals are quoted

    Raises RuntimeError on a regex construct this translator doesn't handle.
    """
    for child in re_tree:
        name, arg = child
        if name == 'in':  # character class
            f.write('[')
            TranslateTree(arg, f,
                          in_char_class=True)  # list of literals/ranges
            f.write(']')

        elif name == 'branch':  # |
            _, branches = arg
            for i, branch in enumerate(branches):
                if i != 0:
                    f.write(' | ')
                TranslateTree(branch, f)

        elif name == 'max_repeat':  # repetition
            min_, max_, children = arg
            # min = 0 means *, min = 1 means +
            #assert min_ in (0, 1), min_

            # Emit the repeated sub-pattern first, then the quantifier.
            TranslateTree(children, f)

            if min_ == 0 and max_ == 1:
                f.write('? ')

            elif max_ == sre_constants.MAXREPEAT:  # unbounded repeat
                if min_ == 0:
                    f.write('* ')
                elif min_ == 1:
                    f.write('+ ')
                else:
                    assert 0, min_

            else:  # re2c also supports [0-7]{1,2} syntax
                # note: might generate {2,2}
                f.write('{%d,%d} ' % (min_, max_))

        elif name == 'negate':  # ^ in [^a-z]
            assert arg is None
            f.write('^')

        elif name == 'literal':  # Quote \ and " in re2c syntax
            # TODO: it matters if we're inside a character class
            #print("literal ARG %r" % arg)

            # Inside [...] only class metachars are escaped; outside, the
            # char becomes its own double-quoted literal.
            if in_char_class:
                s = _CharClassLiteral(arg)
            else:
                s = '"%s" ' % _Literal(arg)
            f.write(s)

        elif name == 'not_literal':  # ditto
            assert not in_char_class
            f.write('[^%s]' % _CharClassLiteral(arg))

        elif name == 'range':  # ascii range
            begin, end = arg
            f.write('%s-%s' %
                    (_CharClassLiteral(begin), _CharClassLiteral(end)))

        elif name == 'any':  # This is the '.' character
            assert arg is None
            f.write('.')

        elif name == 'subpattern':
            _, children = arg  # Not sure what the _ is, but it works
            f.write('(')
            TranslateTree(children, f)
            f.write(')')

        else:
            if 0:  # flip to 1 for debugging unknown constructs
                from mycpp.mylib import log
                log('child %s', child)
            raise RuntimeError("I don't understand regex construct: %r" % name)

    # NOTE: negate and not_literal are sort of duplicated
133
134
def TranslateRegex(pat):
    """Parse the Python regex 'pat' and return the equivalent re2c pattern.

    Prints a diagnostic to stderr (and re-raises) if the regex uses a
    construct TranslateTree() doesn't handle.
    """
    tree = sre_parse.parse(pat)
    out = cStringIO.StringIO()
    try:
        TranslateTree(tree, out)
    except RuntimeError:
        print('Error translating %r' % pat, file=sys.stderr)
        raise
    return out.getvalue()
147
148
149# This explains the sentinel method, which we will use.
150# http://re2c.org/examples/example_01.html
151#
152# TODO: Change ParseTuple to use 's' rather than '#s' ?
153
154# I don't think we need this YYFILL mechanism, because we lex a line at a
155# time.
156# http://re2c.org/examples/example_03.html
157
158
def TranslateSimpleLexer(func_name, lexer_def):
    """Print a C function 'func_name' that lexes one flat pattern list.

    func_name: name of the generated C function
    lexer_def: list of (is_regex, pattern, token_id) tuples
      (NOTE: this parameter shadows the module-level 'lexer_def' import)

    The generated function matches at line[start_pos] and writes the token
    id and end position through out-params.
    """
    print(r"""
static inline void %s(const unsigned char* line, int line_len,
                      int start_pos, int* id, int* end_pos) {
  assert(start_pos <= line_len);  /* caller should have checked */

  const unsigned char* p = line + start_pos;  /* modified by re2c */

  /* Echo and History lexer apparently need this, but others don't */
  __attribute__((unused)) const unsigned char* YYMARKER;

  for (;;) {
    /*!re2c
""" % func_name)

    # One re2c rule per pattern; regexes go through the sre_parse
    # translator, fixed strings are just quoted.
    for is_regex, pat, id_ in lexer_def:
        if is_regex:
            re2c_pat = TranslateRegex(pat)
        else:
            re2c_pat = TranslateConstant(pat)
        id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
        print('      %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))

    # EARLY RETURN: Do NOT advance past the NUL terminator.
    print('      %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
          r'"\x00"')

    print("""
    */
  }
  *end_pos = p - line;  /* relative */
}
""")
192
193
def TranslateBracket(func_name, token_dict):
    """Print a C function 'func_name' matching a whole string against tokens.

    token_dict: maps a fixed token string (e.g. for [[ operators) to its
      token id.

    Unlike the lexers, the generated function requires the match to consume
    the entire input and returns the id (or id__Undefined_Tok) directly.
    """
    print(r"""
static inline int %s(const unsigned char* s, int len) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  __attribute__((unused)) const unsigned char* YYMARKER;
  int id;

  for (;;) {
    /*!re2c
""" % func_name)

    # Sorted for deterministic output across runs.
    for pat in sorted(token_dict):
        id_ = token_dict[pat]
        re2c_pat = TranslateConstant(pat)
        id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
        print('      %-30s { id = id__%s; break; }' % (re2c_pat, id_name))

    # EARLY RETURN: Do NOT advance past other chars, including the NUL
    # terminator.
    print('      %-30s { return id__Undefined_Tok; }' % '*')

    print("""
    */
  }
  // must be an exact match
  return (p == end) ? id : id__Undefined_Tok;
}
""")
224
225
def TranslateOshLexer(lexer_def):
    """Print MatchOshToken(): a switch over lex modes, one re2c block each.

    lexer_def: dict mapping a lex_mode enum value to its list of
      (is_regex, pattern, token_id) tuples
      (NOTE: this parameter shadows the module-level 'lexer_def' import)
    """
    # https://stackoverflow.com/questions/12836171/difference-between-an-inline-function-and-static-inline-function
    # Has to be 'static inline' rather than 'inline', otherwise the
    # _bin/oil.ovm-dbg build fails (but the _bin/oil.ovm doesn't!).
    # Since we reference this function in exactly one translation unit --
    # fastlex.c, the difference is moot, and we just satisfy the compiler.

    print(r"""
/* Common stuff */

/*!re2c
  re2c:define:YYCTYPE = "unsigned char";
  re2c:define:YYCURSOR = p;
  re2c:yyfill:enable = 0;  // generated code doesn't ask for more input
*/

static inline void MatchOshToken(int lex_mode, const unsigned char* line, int line_len,
                                 int start_pos, int* id, int* end_pos) {
  assert(start_pos <= line_len);  /* caller should have checked */

  const unsigned char* p = line + start_pos;  /* modified by re2c */
  //printf("p: %p q: %p\n", p, q);

  __attribute__((unused)) const unsigned char* YYMARKER;  /* why do we need this? */
  switch (lex_mode)  {
""")

    # TODO: Should be ordered by most common?  Or will profile-directed feedback
    # help?
    for state, pat_list in lexer_def.iteritems():
        # e.g. lex_mode.DQ => lex_mode__DQ
        print('  case %s:' % lex_mode_str(state).replace('.', '__'))
        print('    for (;;) {')
        print('      /*!re2c')

        for is_regex, pat, id_ in pat_list:
            if is_regex:
                re2c_pat = TranslateRegex(pat)
            else:
                re2c_pat = TranslateConstant(pat)
            id_name = Id_str(id_).split('.')[-1]  # e.g. Undefined_Tok
            print('      %-30s { *id = id__%s; break; }' % (re2c_pat, id_name))

        # EARLY RETURN: Do NOT advance past the NUL terminator.
        print('      %-30s { *id = id__Eol_Tok; *end_pos = start_pos; return; }' % \
              r'"\x00"')

        print('      */')
        print('    }')
        print('    break;')
        print()

    # This is literal code without generation:
    """
  case lex_mode__OUTER:
    for (;;) {
      /*!re2c
      literal_chunk = [a-zA-Z0-9_/.-]+;
      var_like      = [a-zA-Z_][a-zA-Z0-9_]* "=";  // might be NAME=val
      comment       = [ \t\r]* "#" [^\000\r\n]*;
      space         = [ \t\r]+;
      nul = "\000";

      literal_chunk { *id = id__Lit_Chars; break; }
      var_like      { *id = id__Lit_VarLike; break; }

      [ \t\r]* "\n" { *id = id__Op_Newline; break; }
      space         { *id = id__WS_Space; break; }

      nul           { *id = id__Eof_Real; break; }

      // anything else
      *             { *id = id__Lit_Other; break; }

      */
    }

    *end_pos = p - line;
    break;

  case lex_mode__COMMENT:
    *id = id__Lit_Other;
    *end_pos = 6;
    break;
    """

    print("""\
  default:
    assert(0);

  }
  *end_pos = p - line;  /* relative */
}
""")
320
321
def TranslateRegexToPredicate(py_regex, func_name):
    """Print a C predicate 'func_name' returning 1 iff a string fully matches.

    py_regex: Python regex source, translated to re2c syntax
    func_name: name of the generated C function

    The pattern is implicitly anchored on the left by the match start; the
    'p == end' check anchors it on the right, like $.
    """
    re2c_pat = TranslateRegex(py_regex)
    print(r"""
static inline int %s(const unsigned char* s, int len) {
  const unsigned char* p = s;  /* modified by re2c */
  const unsigned char* end = s + len;

  /* MatchBraceRangeToken needs this, but others don't */
  __attribute__((unused)) const unsigned char* YYMARKER;

  /*!re2c
  re2c:define:YYCTYPE = "unsigned char";
  re2c:define:YYCURSOR = p;
  %-30s { return p == end; }  // Match must be anchored right, like $
  %-30s { return 0; }
  */
}
""" % (func_name, re2c_pat, '*'))
340
341
342 # note: use YYCURSOR and YYLIMIT
343 # limit should be the end of string
344 # line + line_len
def main(argv):
    """Dispatch on argv[1].

    'c' prints all generated C code to stdout; 'print-all' and
    'print-regex' are debugging aids for the regex translation.
    """
    action = argv[1]

    if action == 'c':
        # Code is printed to stdout

        TranslateOshLexer(lexer_def.LEXER_DEF)

        # (C function name, pattern list) for each simple lexer
        simple_lexers = [
            ('MatchEchoToken', lexer_def.ECHO_E_DEF),
            ('MatchPrintfBToken', lexer_def.PRINTF_B_DEF),
            ('MatchGlobToken', lexer_def.GLOB_DEF),
            ('MatchPS1Token', lexer_def.PS1_DEF),
            ('MatchHistoryToken', lexer_def.HISTORY_DEF),
            ('MatchBraceRangeToken', lexer_def.BRACE_RANGE_DEF),
            ('MatchJ8Token', lexer_def.J8_DEF),
            ('MatchJ8LinesToken', lexer_def.J8_LINES_DEF),
            ('MatchJ8StrToken', lexer_def.J8_STR_DEF),
            ('MatchJsonStrToken', lexer_def.JSON_STR_DEF),
            ('MatchShNumberToken', lexer_def.SH_NUMBER_DEF),
        ]
        for c_name, pat_list in simple_lexers:
            TranslateSimpleLexer(c_name, pat_list)

        # (Python regex, C predicate name) for each whole-string predicate
        predicates = [
            (lexer_def.IS_UTF8_CODESET_RE, 'IsUtf8Codeset'),
            (lexer_def.VAR_NAME_RE, 'IsValidVarName'),
            (lexer_def.SHOULD_HIJACK_RE, 'ShouldHijack'),
            (lexer_def.LOOKS_LIKE_INTEGER, 'LooksLikeInteger'),
            (lexer_def.LOOKS_LIKE_YSH_INT, 'LooksLikeYshInt'),
            (lexer_def.LOOKS_LIKE_YSH_FLOAT, 'LooksLikeYshFloat'),
        ]
        for py_re, c_name in predicates:
            TranslateRegexToPredicate(py_re, c_name)

        TranslateBracket('BracketUnary', TEST_UNARY_LOOKUP)
        TranslateBracket('BracketBinary', TEST_BINARY_LOOKUP)
        TranslateBracket('BracketOther', TEST_OTHER_LOOKUP)

    elif action == 'print-all':
        # Top level is a switch statement.
        for state, pat_list in lexer_def.LEXER_DEF.iteritems():
            print(state)
            # This level is re2c patterns.
            for is_regex, pat, token_id in pat_list:
                print('\t%r -> %r' % (pat, token_id))
                if is_regex:
                    # Translate for the side effect of validating the regex.
                    _ = TranslateRegex(pat)
            print()

    elif action == 'print-regex':
        unique = set()
        num_regexes = 0
        for state, pat_list in lexer_def.LEXER_DEF.iteritems():
            print(state)
            # This level is re2c patterns.
            for is_regex, pat, token_id in pat_list:
                if is_regex:
                    print('\t' + pat)
                    print('\t' + TranslateRegex(pat))
                    print()
                    num_regexes += 1
                    unique.add(pat)
                else:
                    print('\t' + TranslateConstant(pat))
            print()

        print('Printed %d regexes (%d unique)' % (num_regexes, len(unique)))
414
415
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        # RuntimeError is what TranslateTree() raises for regex constructs
        # it can't handle; report it cleanly and exit non-zero rather than
        # dumping a traceback.
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)