OILS / doctools / micro_syntax.re2c.h View on Github | oils.pub

731 lines, 548 significant
1#ifndef MICRO_SYNTAX_H
2#define MICRO_SYNTAX_H
3
4#include <assert.h>
5#include <string.h> // strlen()
6
7#include <vector>
8
// Kinds of token produced by the per-language matchers in this file.
//
// Enumerators are grouped by the language that needs them.  Their order is
// significant only in that it fixes the underlying integer values.
enum class Id {
  // ---- Common to nearly all languages ----
  Comm,          // comment
  MaybeComment,  // for shell, resolved in a fix-up pass

  WS,  // whitespace

  Name,  // Keyword or Identifier
  Str,   // "" and Python r""
         // '' and Python r''
         // ''' """
         // body of here docs

  Other,    // any other text
  Unknown,  // what a default-constructed Token carries; also emitted by the
            // catch-all rule of several matchers (e.g. unclosed quote)

  // ---- C++ ----
  DelimStrBegin,  // for C++ R"zzz(hello)zzz"
  DelimStrEnd,
  Re2c,  // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // ---- Shell ----
  HereBegin,
  HereEnd,

  // ---- Html ----
  StartTag,
  EndTag,
  StartEndTag,

  // Not enabled:
  //
  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  //   StartLine,
  //
  // These are special zero-width tokens for Python
  //   Indent,
  //   Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
58
59struct Token {
60 Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
61 }
62 Token(Id id, int end_col)
63 : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
64 }
65
66 Id id;
67 int end_col; // offset from char* line
68 int submatch_start; // ditto
69 int submatch_end; // ditto
70};
71
72// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
73
// Cursor state for lexing one line at a time.
//
// T is a mode enum (py_mode_e, cpp_mode_e, ...).  It must provide an `Outer`
// enumerator, which is the mode a new Lexer starts in.
template <typename T>
class Lexer {
 public:
  // Starts lexing at the beginning of `line`.  The pointer is stored, not
  // copied, so the buffer must outlive its use by this Lexer.  The matchers in
  // this file stop when they read a NUL byte, so the buffer is expected to be
  // NUL-terminated.
  //
  // Takes const char* (the lexer never writes through it); callers passing a
  // mutable char* keep working unchanged.
  Lexer(const char* line) : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Points the lexer at a new line and rewinds the cursor to its start.
  // line_mode is deliberately left alone, so a multi-line construct (string,
  // comment, ...) continues across lines.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
89
90template <typename T>
91class Matcher {
92 public:
93 // Returns whether EOL was hit. Mutates lexer state, and fills in tok out
94 // param.
95 bool Match(Lexer<T>* lexer, Token* tok);
96};
97
// Macros for semantic actions
//
// They are meant to be expanded inside a loop, in a scope where `tok` and
// `lexer` are pointers to the token being filled in and to the lexer state.
// TOK() and TOK_MODE() end in `break`, which must leave that enclosing loop;
// that is why they cannot be wrapped in the usual do { } while (0).

// Set the token kind to k and leave the matching loop.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Set the token kind to k, switch the lexer to mode m, and leave the matching
// loop.
#define TOK_MODE(k, m)     \
  tok->id = (k);           \
  lexer->line_mode = (m);  \
  break;

// Record the submatch [s, e) as offsets from the start of the line.
// Arguments are parenthesized so that any pointer expression can be passed,
// and the pointer difference is narrowed to int explicitly.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                         \
  tok->submatch_start = static_cast<int>((s) - lexer->line_);  \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
113
114// Regex definitions shared between languages
115
/*!re2c
  // Configuration shared by every matcher below: no buffer refilling, input
  // is read as plain char, and the cursor variable is the local named `p`.
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  // The NUL byte marks the end of the line buffer.
  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  // Note the '*': this definition can also match the empty string.
  whitespace = [ \t\r\n]*;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // Inside of a quoted string: anything but NUL, the closing quote, or a
  // backslash -- or a backslash followed by any non-NUL char.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
146
147enum class text_mode_e {
148 Outer, // default
149};
150
151// Returns whether EOL was hit
152template <>
153bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
154 const char* p = lexer->p_current; // mutated by re2c
155
156 while (true) {
157 /*!re2c
158 nul { return true; }
159
160 // whitespace at start of line
161 whitespace { TOK(Id::WS); }
162
163 // This rule consumes trailing whitespace, but
164 // it's OK. We're counting significant lines, not
165 // highlighting.
166 [^\x00]+ { TOK(Id::Other); }
167
168 * { TOK(Id::Other); }
169
170 */
171 }
172
173 tok->end_col = p - lexer->line_;
174 lexer->p_current = p;
175 return false;
176}
177
178enum class asdl_mode_e {
179 Outer,
180};
181
182// Returns whether EOL was hit
183template <>
184bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
185 const char* p = lexer->p_current; // mutated by re2c
186
187 switch (lexer->line_mode) {
188 case asdl_mode_e::Outer:
189 while (true) {
190 /*!re2c
191 nul { return true; }
192
193 whitespace { TOK(Id::WS); }
194
195 identifier { TOK(Id::Name); }
196
197 pound_comment { TOK(Id::Comm); }
198
199 // Not the start of a comment, identifier
200 [^\x00#_a-zA-Z]+ { TOK(Id::Other); }
201
202 // e.g. unclosed quote like "foo
203 * { TOK(Id::Unknown); }
204
205 */
206 }
207 break;
208 }
209
210 tok->end_col = p - lexer->line_;
211 lexer->p_current = p;
212 return false;
213}
214
215enum class py_mode_e {
216 Outer, // default
217 MultiSQ, // inside '''
218 MultiDQ, // inside """
219};
220
221// Returns whether EOL was hit
222template <>
223bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
224 const char* p = lexer->p_current; // mutated by re2c
225 const char* YYMARKER = p;
226
227 switch (lexer->line_mode) {
228 case py_mode_e::Outer:
229 while (true) {
230 /*!re2c
231 nul { return true; }
232
233 whitespace { TOK(Id::WS); }
234
235 identifier { TOK(Id::Name); }
236
237 [r]? sq_string { TOK(Id::Str); }
238 [r]? dq_string { TOK(Id::Str); }
239
240 // optional raw prefix
241 [r]? triple_sq { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
242 [r]? triple_dq { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }
243
244 pound_comment { TOK(Id::Comm); }
245
246 // Not the start of a string, comment, identifier
247 [^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
248
249 // e.g. unclosed quote like "foo
250 * { TOK(Id::Unknown); }
251
252 */
253 }
254 break;
255
256 case py_mode_e::MultiSQ:
257 while (true) {
258 /*!re2c
259 nul { return true; }
260
261 triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }
262
263 [^\x00']* { TOK(Id::Str); }
264
265 * { TOK(Id::Str); }
266
267 */
268 }
269 break;
270
271 case py_mode_e::MultiDQ:
272 while (true) {
273 /*!re2c
274 nul { return true; }
275
276 triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }
277
278 [^\x00"]* { TOK(Id::Str); }
279
280 * { TOK(Id::Str); }
281
282 */
283 }
284 break;
285 }
286
287 tok->end_col = p - lexer->line_;
288 lexer->p_current = p;
289 return false;
290}
291
292enum class cpp_mode_e {
293 Outer, // default
294 Comm, // inside /* */ comment
295 DelimStr, // R"zz(string literal)zz"
296 Re2c, // /* !re2c
297};
298
299// Returns whether EOL was hit
300template <>
301bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
302 const char* p = lexer->p_current; // mutated by re2c
303 const char* YYMARKER = p;
304 const char *s, *e; // submatch extraction
305
306 // Autogenerated tag variables used by the lexer to track tag values.
307 /*!stags:re2c format = 'const char *@@;\n'; */
308
309 switch (lexer->line_mode) {
310 case cpp_mode_e::Outer:
311
312 while (true) {
313 /*!re2c
314 nul { return true; }
315
316 whitespace { TOK(Id::WS); }
317
318 "{" { TOK(Id::LBrace); }
319 "}" { TOK(Id::RBrace); }
320
321 identifier { TOK(Id::Name); }
322
323 // approximation for C++ char literals
324 sq_string { TOK(Id::Str); }
325 dq_string { TOK(Id::Str); }
326
327 // Not the start of a string, comment, identifier
328 [^\x00"'/_a-zA-Z{}]+ { TOK(Id::Other); }
329
330 "//" not_nul* { TOK(Id::Comm); }
331
332 // Treat re2c as preprocessor block
333 "/" "*!re2c" { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }
334
335 "/" "*" { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }
336
337 // Not sure what the rules are for R"zz(hello)zz". Make it similar to
338 // here docs.
339 cpp_delim_str = [_a-zA-Z]*;
340
341 "R" ["] @s cpp_delim_str @e "(" {
342 SUBMATCH(s, e);
343 TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
344 }
345
346 // e.g. unclosed quote like "foo
347 * { TOK(Id::Unknown); }
348
349 */
350 }
351 break;
352
353 case cpp_mode_e::Comm:
354 // Search until next */
355 while (true) {
356 /*!re2c
357 nul { return true; }
358
359 "*" "/" { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }
360
361 [^\x00*]* { TOK(Id::Comm); }
362
363 * { TOK(Id::Comm); }
364
365 */
366 }
367 break;
368
369 case cpp_mode_e::Re2c:
370 // Search until next */
371 while (true) {
372 /*!re2c
373 nul { return true; }
374
375 "*" "/" { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }
376
377 [^\x00*]* { TOK(Id::Re2c); }
378
379 * { TOK(Id::Re2c); }
380
381 */
382 }
383 break;
384
385 case cpp_mode_e::DelimStr:
386 // Search until next */
387 while (true) {
388 /*!re2c
389 nul { return true; }
390
391 ")" @s cpp_delim_str @e ["] {
392 SUBMATCH(s, e);
393 TOK(Id::DelimStrEnd);
394
395 // Caller is responsible for checking the extracted delimiter, and
396 // setting mode back to Cpp::Outer!
397 }
398
399 [^\x00)]* { TOK(Id::Str); }
400
401 * { TOK(Id::Str); }
402
403 */
404 }
405 break;
406 }
407
408 tok->end_col = p - lexer->line_;
409 lexer->p_current = p;
410 return false;
411}
412
413class Hook {
414 public:
415 // Return true if this is a preprocessor line, and fill in tokens
416 // Caller should check last token for whether there is a continuation line.
417 virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
418 ;
419 }
420 virtual ~Hook() {
421 }
422};
423
424enum class pp_mode_e {
425 Outer,
426};
427
428// Returns whether EOL was hit
429template <>
430bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
431 const char* p = lexer->p_current; // mutated by re2c
432 const char* YYMARKER = p;
433
434 switch (lexer->line_mode) {
435 case pp_mode_e::Outer:
436 while (true) {
437 /*!re2c
438 nul { return true; }
439
440 // Resolved in fix-up pass
441 // #include #define etc. only valid at the
442 // beginning
443 [ \t]* "#" [a-z]+ { TOK(Id::MaybePreproc); }
444
445 // C-style comments can end these lines
446 "//" not_nul* { TOK(Id::Comm); }
447
448 [\\] [\n] { TOK(Id::LineCont); }
449
450 // A line could be all whitespace, then \ at the
451 // end. And it's not significant
452 whitespace { TOK(Id::WS); }
453
454 // Not the start of a command, comment, or line
455 // continuation
456 [^\x00#/\\]+ { TOK(Id::PreprocOther); }
457
458 * { TOK(Id::PreprocOther); }
459
460 */
461 }
462 break;
463 }
464
465 tok->end_col = p - lexer->line_;
466 lexer->p_current = p;
467 return false;
468}
469
470class CppHook : public Hook {
471 public:
472 virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
473};
474
475enum class R_mode_e {
476 Outer, // default
477
478 SQ, // inside multi-line ''
479 DQ, // inside multi-line ""
480};
481
482// Returns whether EOL was hit
483template <>
484bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
485 const char* p = lexer->p_current; // mutated by re2c
486 const char* YYMARKER = p;
487
488 switch (lexer->line_mode) {
489 case R_mode_e::Outer:
490 while (true) {
491 /*!re2c
492 nul { return true; }
493
494 whitespace { TOK(Id::WS); }
495
496 pound_comment { TOK(Id::Comm); }
497
498 identifier { TOK(Id::Name); }
499
500 // Not the start of a string, escaped, comment, identifier
501 [^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
502
503 ['] { TOK_MODE(Id::Str, R_mode_e::SQ); }
504 ["] { TOK_MODE(Id::Str, R_mode_e::DQ); }
505
506 * { TOK(Id::Unknown); }
507
508 */
509 }
510 break;
511
512 case R_mode_e::SQ:
513 while (true) {
514 /*!re2c
515 nul { return true; }
516
517 ['] { TOK_MODE(Id::Str, R_mode_e::Outer); }
518
519 sq_middle { TOK(Id::Str); }
520
521 * { TOK(Id::Str); }
522
523 */
524 }
525 break;
526
527 case R_mode_e::DQ:
528 while (true) {
529 /*!re2c
530 nul { return true; }
531
532 ["] { TOK_MODE(Id::Str, R_mode_e::Outer); }
533
534 dq_middle { TOK(Id::Str); }
535
536 * { TOK(Id::Str); }
537
538 */
539 }
540 break;
541 }
542
543 tok->end_col = p - lexer->line_;
544 lexer->p_current = p;
545 return false;
546}
547
548// Problem with shell: nested double quotes!!!
549// We probably discourage this in YSH
550
551enum class sh_mode_e {
552 Outer, // default
553
554 SQ, // inside multi-line ''
555 DollarSQ, // inside multi-line $''
556 DQ, // inside multi-line ""
557
558 // We could have a separate thing for this
559 YshSQ, // inside '''
560 YshDQ, // inside """
561 YshJ, // inside j"""
562};
563
564// Returns whether EOL was hit
565
566// Submatch docs:
567// https://re2c.org/manual/manual_c.html#submatch-extraction
568
569template <>
570bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
571 const char* p = lexer->p_current; // mutated by re2c
572 const char* YYMARKER = p;
573 const char *s, *e; // submatch extraction
574
575 // Autogenerated tag variables used by the lexer to track tag values.
576 /*!stags:re2c format = 'const char *@@;\n'; */
577
578 switch (lexer->line_mode) {
579 case sh_mode_e::Outer:
580 while (true) {
581 /*!re2c
582 nul { return true; }
583
584 whitespace { TOK(Id::WS); }
585
586 // Resolved in fix-up pass
587 pound_comment { TOK(Id::MaybeComment); }
588
589 // not that relevant for shell
590 identifier { TOK(Id::Name); }
591
592 // Not the start of a string, escaped, comment, identifier, here doc
593 [^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }
594
595 // echo is like a string
596 "\\" . { TOK(Id::Str); }
597
598 ['] { TOK_MODE(Id::Str, sh_mode_e::SQ); }
599 ["] { TOK_MODE(Id::Str, sh_mode_e::DQ); }
600 "$'" { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
601
602 // <<- is another syntax
603 here_op = "<<" [-]? [ \t]*;
604 h_delim = [_a-zA-Z][_a-zA-Z0-9]*;
605
606 // unquoted or quoted
607 here_op @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
608 here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
609 here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
610 here_op "\\" @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
611
612 // NOT Unknown, as in Python
613 * { TOK(Id::Other); }
614
615 */
616 }
617 break;
618
619 case sh_mode_e::SQ:
620 // Search until next ' unconditionally
621 while (true) {
622 /*!re2c
623 nul { return true; }
624
625 ['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
626
627 [^\x00']* { TOK(Id::Str); }
628
629 * { TOK(Id::Str); }
630
631 */
632 }
633 break;
634
635 case sh_mode_e::DQ:
636 // Search until next " that's not preceded by "
637 while (true) {
638 /*!re2c
639 nul { return true; }
640
641 ["] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
642
643 dq_middle { TOK(Id::Str); }
644
645 * { TOK(Id::Str); }
646
647 */
648 }
649 break;
650
651 case sh_mode_e::DollarSQ:
652 // Search until next ' that's not preceded by "
653 while (true) {
654 /*!re2c
655 nul { return true; }
656
657 ['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
658
659 sq_middle { TOK(Id::Str); }
660
661 * { TOK(Id::Str); }
662
663 */
664 }
665 break;
666 case sh_mode_e::YshSQ:
667 case sh_mode_e::YshDQ:
668 case sh_mode_e::YshJ:
669 assert(0);
670 }
671
672 tok->end_col = p - lexer->line_;
673 lexer->p_current = p;
674 return false;
675}
676
677enum class html_mode_e {
678 Outer,
679};
680
681// Returns whether EOL was hit
682template <>
683bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
684 const char* p = lexer->p_current; // mutated by re2c
685 const char* YYMARKER = p;
686
687 switch (lexer->line_mode) {
688 case html_mode_e::Outer:
689 while (true) {
690 /*!re2c
691 nul { return true; }
692
693 // Like _NAME in HTM8
694 name = [a-zA-Z][a-zA-Z0-9:_-]* ;
695
696 '</' name '>' { TOK(Id::EndTag); }
697 '<' name [^>\x00]* '/>' { TOK(Id::StartEndTag); }
698 '<' name [^>\x00]* '>' { TOK(Id::StartTag); }
699
700 // TODO: Fill in the rest of the HTM8 lexer.
701
702 * { TOK(Id::Other); }
703
704 */
705 }
706 break;
707 }
708
709 tok->end_col = p - lexer->line_;
710 lexer->p_current = p;
711 return false;
712}
713
714
715// TODO:
716// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
717// - same as C++ raw string, I think
718// - similar to here docs, but less complex
719//
720// Inherent problems with "micro segmentation":
721//
722// - Nested double quotes in shell. echo "hi ${name:-"default"}"
723// - This means that lexing is **dependent on** parsing: does the second
724// double quote **close** the first one, or does it start a nested string?
725// - lexing is non-recursive, parsing is recursive
726
// Shell comments depend on operator chars: the same '#' starts a comment in
// the first line below, but is part of a number in the second.
//   echo one # comment
//   echo $(( 16#ff ))
730
731#endif // MICRO_SYNTAX_H