OILS / doctools / micro_syntax.re2c.h View on Github | oils.pub

907 lines, 708 significant
1#ifndef MICRO_SYNTAX_H
2#define MICRO_SYNTAX_H
3
4#include <assert.h>
5#include <string.h> // strlen()
6
7#include <vector>
8
// Token kinds assigned by the Matcher<T>::Match() specializations in this
// file.  Kinds whose names start with "Maybe" are provisional: per the notes
// on them, they are resolved in a later fix-up pass.
enum class Id {
  // Common to nearly all languages
  Comm,
  MaybeComment,  // for shell, resolved in a fix-up pass

  WS,

  Name,  // Keyword or Identifier
  Str,   // "" and Python r""
         // '' and Python r''
         // ''' """
         // body of here docs

  Other,    // any other text
  Unknown,  // what a default-constructed Token holds; also assigned by the
            // catch-all rule of several modes

  // C++
  DelimStrBegin,  // for C++ R"zzz(hello)zzz"
  DelimStrEnd,
  Re2c,  // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // Shell
  HereBegin,
  HereEnd,

  // Html
  TagNameLeft,   // start <a> or <br id=foo />
  SelfClose,     // />
  TagNameRight,  // >
  EndTag,        // </a>
  CharEscape,    // &amp;
  AttrName,      // foo=
  BadAmpersand,
  BadLessThan,
  BadGreaterThan,
  // Reused: Str Other

  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  // StartLine,

  // These are special zero-width tokens for Python
  // Indent,
  // Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
65
66struct Token {
67 Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
68 }
69 Token(Id id, int end_col)
70 : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
71 }
72
73 Id id;
74 int end_col; // offset from char* line
75 int submatch_start; // ditto
76 int submatch_end; // ditto
77};
78
79// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
80
// Cursor state for lexing one line at a time.  T is a language's mode enum
// (py_mode_e, cpp_mode_e, ...) and must have an enumerator named Outer.
template <typename T>
class Lexer {
 public:
  // Starts at the beginning of `line`, in mode T::Outer.
  //
  // The parameter is const char* because the line is only stored in
  // pointer-to-const members, so read-only buffers are accepted too.  Callers
  // that pass a plain char* keep working unchanged.
  Lexer(const char* line)
      : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Points the lexer at the beginning of a new line.  line_mode is not
  // touched, so a mode entered on an earlier line stays in effect.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
96
// Tokenizer for the language whose mode enum is T.  Match() is only declared
// here; this file provides an explicit specialization per mode enum.
template <typename T>
class Matcher {
 public:
  // Scans one token starting at lexer->p_current.
  //
  // Returns whether EOL was hit.  In the specializations in this file, "EOL"
  // is a NUL byte at the cursor; in that case neither `tok` nor `lexer` is
  // modified.  Otherwise it mutates lexer state (p_current, and possibly
  // line_mode), fills in the `tok` out param, and returns false.
  bool Match(Lexer<T>* lexer, Token* tok);
};
104
// Macros for semantic actions
//
// They expand inside the Match() specializations and rely on the names `tok`
// and `lexer` being in scope there.  TOK and TOK_MODE end with `break`, which
// leaves the enclosing `while (true)` loop so that Match() can record where
// the token ended.

// Set the token kind to k, then leave the loop.
#define TOK(k)  \
  tok->id = k;  \
  break;

// Like TOK(k), but also switch the lexer to mode m for the next token.
#define TOK_MODE(k, m)   \
  tok->id = k;           \
  lexer->line_mode = m;  \
  break;

// Record the range s..e (pointers into the line) as offsets from the start
// of the line.  It does not `break` by itself.
// Must call TOK*() after this
#define SUBMATCH(s, e)                    \
  tok->submatch_start = s - lexer->line_; \
  tok->submatch_end = e - lexer->line_;
120
// Regex definitions shared between languages
//
// This block only configures re2c and names sub-patterns; it contains no
// rules.  YYCURSOR is bound to a variable called `p`, so every Match()
// specialization declares a local `p`.

/*!re2c
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  // Note: `whitespace` also matches the empty string; `space_required` needs
  // at least one character.
  whitespace = [ \t\r\n]*;
  space_required = [ \t\r\n]+;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // Body of a quoted string: anything but NUL, the closing quote, or a
  // backslash; or a backslash followed by any non-NUL character.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
154
// Lexer modes for plain text.  There is only one.
enum class text_mode_e {
  Outer,  // default
};
158
// Plain text: a run of whitespace is WS, everything else is Other.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, and returns false.
template <>
bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  // Not a real loop: every action either returns or (via TOK) breaks.  It
  // exists so that `break` has something to leave.
  while (true) {
    /*!re2c
      nul                    { return true; }

      // whitespace at start of line
      whitespace             { TOK(Id::WS); }

      // This rule consumes trailing whitespace, but
      // it's OK.  We're counting significant lines, not
      // highlighting.
      [^\x00]+               { TOK(Id::Other); }

      *                      { TOK(Id::Other); }

    */
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
185
// Lexer modes for ASDL.  There is only one.
enum class asdl_mode_e {
  Outer,
};
189
// ASDL: whitespace, identifiers, # comments, and everything else.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, and returns false.
template <>
bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  switch (lexer->line_mode) {
  case asdl_mode_e::Outer:
    // Every action either returns or (via TOK) breaks out of this loop.
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        identifier             { TOK(Id::Name); }

        pound_comment          { TOK(Id::Comm); }

        // Not the start of a comment, identifier
        [^\x00#_a-zA-Z]+       { TOK(Id::Other); }

        // Catch-all.  NOTE(review): the rules above appear to cover every
        // non-NUL character already (quotes fall under Other here), so this
        // may never fire - confirm.
        *                      { TOK(Id::Unknown); }

      */
    }
    break;
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
222
// Lexer modes for Python.  The Multi* modes are entered on an opening triple
// quote and left on the matching closing one.
enum class py_mode_e {
  Outer,    // default
  MultiSQ,  // inside '''
  MultiDQ,  // inside """
};
228
// Python: names, single-line strings, triple-quoted strings (which switch
// the lexer into MultiSQ / MultiDQ), and # comments.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, possibly changes
// lexer->line_mode, and returns false.
template <>
bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case py_mode_e::Outer:
    // Every action either returns or (via TOK*) breaks out of this loop.
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        identifier             { TOK(Id::Name); }

        [r]? sq_string         { TOK(Id::Str); }
        [r]? dq_string         { TOK(Id::Str); }

        // optional raw prefix
        [r]? triple_sq         { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
        [r]? triple_dq         { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }

        pound_comment          { TOK(Id::Comm); }

        // Not the start of a string, comment, identifier
        [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case py_mode_e::MultiSQ:
    // Everything is Str until the closing '''
    while (true) {
      /*!re2c
        nul       { return true; }

        triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00']* { TOK(Id::Str); }

        // a quote that is not part of a closing '''
        *         { TOK(Id::Str); }

      */
    }
    break;

  case py_mode_e::MultiDQ:
    // Everything is Str until the closing """
    while (true) {
      /*!re2c
        nul       { return true; }

        triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00"]* { TOK(Id::Str); }

        // a quote that is not part of a closing """
        *         { TOK(Id::Str); }

      */
    }
    break;
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
299
// Lexer modes for C++.
enum class cpp_mode_e {
  Outer,     // default
  Comm,      // inside /* */ comment
  DelimStr,  // R"zz(string literal)zz"
  Re2c,      // /* !re2c
};
306
// C++: braces, names, string/char literals, // and block comments, re2c
// blocks, and raw string literals with a delimiter.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, possibly changes
// lexer->line_mode, and returns false.  For DelimStrBegin / DelimStrEnd it
// also stores the delimiter's range in tok->submatch_start / submatch_end.
template <>
bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c
  const char *s, *e;                 // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  switch (lexer->line_mode) {
  case cpp_mode_e::Outer:

    // Every action either returns or (via TOK*) breaks out of this loop.
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        "{"                    { TOK(Id::LBrace); }
        "}"                    { TOK(Id::RBrace); }

        identifier             { TOK(Id::Name); }

        // approximation for C++ char literals
        sq_string              { TOK(Id::Str); }
        dq_string              { TOK(Id::Str); }

        // Not the start of a string, comment, identifier
        [^\x00"'/_a-zA-Z{}]+   { TOK(Id::Other); }

        "//" not_nul*          { TOK(Id::Comm); }

        // Treat re2c as preprocessor block
        "/" "*!re2c"           { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }

        "/" "*"                { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }

        // Not sure what the rules are for R"zz(hello)zz".  Make it similar to
        // here docs.
        cpp_delim_str = [_a-zA-Z]*;

        // s..e brackets the delimiter between the quote and the paren
        "R" ["] @s cpp_delim_str @e "(" {
          SUBMATCH(s, e);
          TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
        }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case cpp_mode_e::Comm:
    // Search until next */
    while (true) {
      /*!re2c
        nul       { return true; }

        "*" "/"   { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }

        [^\x00*]* { TOK(Id::Comm); }

        // a star that does not close the comment
        *         { TOK(Id::Comm); }

      */
    }
    break;

  case cpp_mode_e::Re2c:
    // Search until next */
    while (true) {
      /*!re2c
        nul       { return true; }

        "*" "/"   { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }

        [^\x00*]* { TOK(Id::Re2c); }

        // a star that does not close the block
        *         { TOK(Id::Re2c); }

      */
    }
    break;

  case cpp_mode_e::DelimStr:
    // Search until the next candidate closing delimiter, i.e. a close paren
    // followed by delimiter characters and a double quote.
    while (true) {
      /*!re2c
        nul       { return true; }

        ")" @s cpp_delim_str @e ["] {
          SUBMATCH(s, e);
          TOK(Id::DelimStrEnd);

          // Caller is responsible for checking the extracted delimiter, and
          // setting mode back to Cpp::Outer!
        }

        [^\x00)]* { TOK(Id::Str); }

        // a close paren that does not start a closing delimiter
        *         { TOK(Id::Str); }

      */
    }
    break;
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
420
421class Hook {
422 public:
423 // Return true if this is a preprocessor line, and fill in tokens
424 // Caller should check last token for whether there is a continuation line.
425 virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
426 ;
427 }
428 virtual ~Hook() {
429 }
430};
431
// Lexer modes for C preprocessor lines.  There is only one.
enum class pp_mode_e {
  Outer,
};
435
// C preprocessor lines: a leading #command, // comments, backslash-newline
// continuations, whitespace, and other text.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, and returns false.
template <>
bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case pp_mode_e::Outer:
    // Every action either returns or (via TOK) breaks out of this loop.
    while (true) {
      /*!re2c
        nul                    { return true; }

        // Resolved in fix-up pass
        // #include #define etc. only valid at the
        // beginning
        [ \t]* "#" [a-z]+      { TOK(Id::MaybePreproc); }

        // C-style comments can end these lines
        "//" not_nul*          { TOK(Id::Comm); }

        [\\] [\n]              { TOK(Id::LineCont); }

        // A line could be all whitespace, then \ at the
        // end.  And it's not significant
        whitespace             { TOK(Id::WS); }

        // Not the start of a command, comment, or line
        // continuation
        [^\x00#/\\]+           { TOK(Id::PreprocOther); }

        // a lone # / or backslash that matched none of the rules above
        *                      { TOK(Id::PreprocOther); }

      */
    }
    break;
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
477
478class CppHook : public Hook {
479 public:
480 virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
481};
482
// Lexer modes for R.  Strings may span lines, so the quote modes persist
// until the closing quote is seen.
enum class R_mode_e {
  Outer,  // default

  SQ,  // inside multi-line ''
  DQ,  // inside multi-line ""
};
489
// R: # comments, names, and quoted strings.  An opening quote switches to
// SQ / DQ mode, and the closing quote switches back to Outer.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, possibly changes
// lexer->line_mode, and returns false.
template <>
bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case R_mode_e::Outer:
    // Every action either returns or (via TOK*) breaks out of this loop.
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        pound_comment          { TOK(Id::Comm); }

        identifier             { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier
        [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }

        // opening quotes
        [']                    { TOK_MODE(Id::Str, R_mode_e::SQ); }
        ["]                    { TOK_MODE(Id::Str, R_mode_e::DQ); }

        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case R_mode_e::SQ:
    // Search until the next ' that is not escaped with a backslash
    while (true) {
      /*!re2c
        nul       { return true; }

        [']       { TOK_MODE(Id::Str, R_mode_e::Outer); }

        sq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;

  case R_mode_e::DQ:
    // Search until the next " that is not escaped with a backslash
    while (true) {
      /*!re2c
        nul       { return true; }

        ["]       { TOK_MODE(Id::Str, R_mode_e::Outer); }

        dq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
555
556// Problem with shell: nested double quotes!!!
557// We probably discourage this in YSH
558
// Lexer modes for shell.  The Ysh* modes are declared, but the sh matcher
// only asserts when it sees them.
enum class sh_mode_e {
  Outer,  // default

  SQ,        // inside multi-line ''
  DollarSQ,  // inside multi-line $''
  DQ,        // inside multi-line ""

  // We could have a separate thing for this
  YshSQ,  // inside '''
  YshDQ,  // inside """
  YshJ,   // inside j"""
};
571
// Shell: possible comments, names, backslash escapes, three kinds of quoted
// strings, and here doc openers (whose delimiter is stored as a submatch).
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, possibly changes
// lexer->line_mode, and returns false.  For HereBegin it also stores the
// delimiter's range in tok->submatch_start / submatch_end.

// Submatch docs:
// https://re2c.org/manual/manual_c.html#submatch-extraction

template <>
bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c
  const char *s, *e;                 // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  switch (lexer->line_mode) {
  case sh_mode_e::Outer:
    // Every action either returns or (via TOK*) breaks out of this loop.
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        // Resolved in fix-up pass
        pound_comment          { TOK(Id::MaybeComment); }

        // not that relevant for shell
        identifier             { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier, here doc
        [^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }

        // echo is like a string
        "\\" .                 { TOK(Id::Str); }

        // opening quotes
        [']                    { TOK_MODE(Id::Str, sh_mode_e::SQ); }
        ["]                    { TOK_MODE(Id::Str, sh_mode_e::DQ); }
        "$'"                   { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }

        // <<- is another syntax
        here_op = "<<" [-]? [ \t]*;
        h_delim = [_a-zA-Z][_a-zA-Z0-9]*;

        // unquoted or quoted
        // s..e brackets the delimiter itself, without quotes or backslash
        here_op     @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op "\\" @s h_delim @e    { SUBMATCH(s, e); TOK(Id::HereBegin); }

        // NOT Unknown, as in Python
        *                      { TOK(Id::Other); }

      */
    }
    break;

  case sh_mode_e::SQ:
    // Search until next ' unconditionally
    while (true) {
      /*!re2c
        nul       { return true; }

        [']       { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        [^\x00']* { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;

  case sh_mode_e::DQ:
    // Search until next " that's not preceded by a backslash
    while (true) {
      /*!re2c
        nul       { return true; }

        ["]       { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        dq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;

  case sh_mode_e::DollarSQ:
    // Search until next ' that's not preceded by a backslash
    while (true) {
      /*!re2c
        nul       { return true; }

        [']       { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        sq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;
  // There are no rules for the YSH multi-line string modes; reaching one of
  // them trips the assert.
  case sh_mode_e::YshSQ:
  case sh_mode_e::YshDQ:
  case sh_mode_e::YshJ:
    assert(0);
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
684
// Lexer modes for HTML.  HtmlCData is declared, but the html matcher has no
// case for it.
enum class html_mode_e {
  Outer,          // <NAME enters the TAG state
  AttrName,       // NAME=" NAME=' NAME= NAME
  AttrValue,      // NAME=" NAME=' NAME=
  SQ,             // respects Chars, can contain "
  DQ,             // respects Chars, can contain '
  Comm,           // <!-- -->
  Preprocessing,  // <? ?>
  CData,          // <![CDATA[ x ]]>
  HtmlCData,      // <script> <style>
};
696
697// LeftStartTag -> RightStartTag <a href=/ >
698// LeftStartTag -> SelfClose <br id=foo />
699
// HTML: tags, attribute names and values, character escapes, comments,
// processing instructions, and CDATA sections.
//
// Returns whether EOL was hit.  On a NUL byte it returns true without
// touching `tok` or `lexer`.  Otherwise it sets tok->id and tok->end_col,
// advances lexer->p_current past the token, possibly changes
// lexer->line_mode, and returns false.
template <>
bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  /*!re2c
    // Common definitions

    // Like _NAME_RE in HTM8
    name      = [a-zA-Z][a-zA-Z0-9:_-]* ;

    // TODO: check this pattern
    char_name = "&"   [a-zA-Z][a-zA-Z0-9]* ";" ;
    char_dec  = "&#"  [0-9]+               ";" ;
    char_hex  = "&#x" [0-9a-fA-F]+         ";" ;
  */

  switch (lexer->line_mode) {
  case html_mode_e::Outer:
    // Every action either returns or (via TOK*) breaks out of this loop.
    while (true) {
      /*!re2c
        // accepted EOF
        nul                    { return true; }

        char_name              { TOK(Id::CharEscape); }
        char_dec               { TOK(Id::CharEscape); }
        char_hex               { TOK(Id::CharEscape); }

        "&"                    { TOK(Id::BadAmpersand); }
        ">"                    { TOK(Id::BadGreaterThan); }
        "<"                    { TOK(Id::BadLessThan); }

        "</" name ">"          { TOK(Id::EndTag); }

        "<"  name              {
          TOK_MODE(Id::TagNameLeft, html_mode_e::AttrName);
          // TODO: <script> <style> - special logic for strstr()
        }

        // Problem: these can span more than one line ... it needs to be
        // another mode?  The end tag might be technically the same.
        "<!" [^\x00>]* ">"     { TOK(Id::Comm); }

        "<!--"                 { TOK_MODE(Id::Comm, html_mode_e::Comm); }
        "<?"                   { TOK_MODE(Id::Comm, html_mode_e::Preprocessing); }
        "<![CDATA["            { TOK_MODE(Id::Str, html_mode_e::CData); }


        // Like RawData
        *                      { TOK(Id::Other); }

      */
    }
    break;
  case html_mode_e::AttrName:
    // Inside a start tag, before an attribute name or the end of the tag
    while (true) {
      /*!re2c
        nul                    { return true; }  // TODO: error

        // TODO: If the tag was <script> or <STYLE>, then we want to enter
        // HtmlCData mode, until we hit </script> or </STYLE>.
        // This is live throughout AttrName, AttrValue, SQ, DQ states?
        ">"                    { TOK_MODE(Id::TagNameRight, html_mode_e::Outer); }
        "/>"                   { TOK_MODE(Id::SelfClose, html_mode_e::Outer); }

        space_required name    {
          // <a missing> - stay in the AttrName mode
          TOK(Id::AttrName);
        }

        space_required name whitespace '=' whitespace {
          // NAME= NAME=' NAME=" - expecting a value
          TOK_MODE(Id::AttrName, html_mode_e::AttrValue);
        }

        *                      { TOK(Id::Unknown); }
      */
    }
    break;
  case html_mode_e::AttrValue:
    // Just after NAME= : a quote opens a quoted value, anything else is an
    // unquoted value
    while (true) {
      /*!re2c
        nul                    { return true; }  // TODO: error

        ["]                    { TOK_MODE(Id::Str, html_mode_e::DQ); }
        [']                    { TOK_MODE(Id::Str, html_mode_e::SQ); }

        // Unquoted value - a single token
        unquoted_value = [^\x00 \r\n\t<>&"']+ ;

        unquoted_value         { TOK_MODE(Id::Str, html_mode_e::AttrName); }

        *                      { TOK(Id::Unknown); }
      */
    }
    break;

  case html_mode_e::DQ:
    // Inside a double-quoted attribute value; the closing quote returns to
    // AttrName
    while (true) {
      /*!re2c
        nul                    { return true; }  // TODO: error
        char_name              { TOK(Id::CharEscape); }
        char_dec               { TOK(Id::CharEscape); }
        char_hex               { TOK(Id::CharEscape); }

        // we would only need these for translation to XML, not
        // highlighting?
        "&"                    { TOK(Id::BadAmpersand); }
        ">"                    { TOK(Id::BadGreaterThan); }
        "<"                    { TOK(Id::BadLessThan); }

        ["]                    { TOK_MODE(Id::Str, html_mode_e::AttrName); }
        *                      { TOK(Id::Str); }
      */
    }
    break;
  case html_mode_e::SQ:
    // Inside a single-quoted attribute value; the closing quote returns to
    // AttrName
    while (true) {
      /*!re2c
        nul                    { return true; }  // TODO: error
        char_name              { TOK(Id::CharEscape); }
        char_dec               { TOK(Id::CharEscape); }
        char_hex               { TOK(Id::CharEscape); }

        // we would only need these for translation to XML, not
        // highlighting?
        "&"                    { TOK(Id::BadAmpersand); }
        ">"                    { TOK(Id::BadGreaterThan); }
        "<"                    { TOK(Id::BadLessThan); }
        [']                    { TOK_MODE(Id::Str, html_mode_e::AttrName); }

        *                      { TOK(Id::Str); }
      */
    }
    break;
  case html_mode_e::Comm:
    // Search until next -->
    while (true) {
      /*!re2c
        nul       { return true; }

        "-->"     { TOK_MODE(Id::Comm, html_mode_e::Outer); }

        [^\x00-]* { TOK(Id::Comm); }

        *         { TOK(Id::Comm); }

      */
    }
    break;
  case html_mode_e::Preprocessing:
    // Search until next ?>
    while (true) {
      /*!re2c
        nul       { return true; }

        "?>"      { TOK_MODE(Id::Comm, html_mode_e::Outer); }

        [^\x00?]* { TOK(Id::Comm); }

        *         { TOK(Id::Comm); }

      */
    }
    break;
  case html_mode_e::CData:
    // Search until next ]]>
    while (true) {
      /*!re2c
        nul        { return true; }

        "]]>"      { TOK_MODE(Id::Str, html_mode_e::Outer); }

        [^\x00\]]* { TOK(Id::Str); }

        *          { TOK(Id::Str); }

      */
    }
    break;

  // HtmlCData is the only mode without a case above, so it lands here.
  default:
    assert(0);
  }

  // p now points just past the matched text.
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
890
891// TODO:
892// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
893// - same as C++ raw string, I think
894// - similar to here docs, but less complex
895//
896// Inherent problems with "micro segmentation":
897//
898// - Nested double quotes in shell. echo "hi ${name:-"default"}"
899// - This means that lexing is **dependent on** parsing: does the second
900// double quote **close** the first one, or does it start a nested string?
901// - lexing is non-recursive, parsing is recursive
902
903// Shell Comments depend on operator chars
904// echo one # comment
//   echo $(( 16#ff ))
906
907#endif // MICRO_SYNTAX_H