1 | #ifndef MICRO_SYNTAX_H
|
2 | #define MICRO_SYNTAX_H
|
3 |
|
4 | #include <assert.h>
|
5 | #include <string.h> // strlen()
|
6 |
|
7 | #include <vector>
|
8 |
|
// Token kinds produced by the Matcher<T>::Match() specializations.
enum class Id {
  // ---- Shared by nearly all languages ----
  Comm,          // comment
  MaybeComment,  // shell only; a fix-up pass decides what it really is

  WS,  // whitespace

  Name,  // keyword or identifier
  Str,   // string-like text:
         //   "" and Python r""
         //   '' and Python r''
         //   ''' """
         //   body of here docs

  Other,    // any other text
  Unknown,  // text matched only by a lexer's default rule, e.g. an unclosed
            // quote

  // ---- C++ ----
  DelimStrBegin,  // opening of a raw string such as R"zzz(hello)zzz"
  DelimStrEnd,    // closing of a raw string
  Re2c,           // re2c code block

  MaybePreproc,    // becomes PreprocCommand/PreprocOther in the fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // ---- Shell ----
  HereBegin,
  HereEnd,

  // ---- Html ----
  StartTag,
  EndTag,
  StartEndTag,

  // Ideas that are not enabled:
  //
  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  //   StartLine,
  //
  // These are special zero-width tokens for Python
  //   Indent,
  //   Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
|
58 |
|
59 | struct Token {
|
60 | Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
|
61 | }
|
62 | Token(Id id, int end_col)
|
63 | : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
|
64 | }
|
65 |
|
66 | Id id;
|
67 | int end_col; // offset from char* line
|
68 | int submatch_start; // ditto
|
69 | int submatch_end; // ditto
|
70 | };
|
71 |
|
72 | // Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
|
73 |
|
// Lexer state for one input, parameterized on a language mode enum T
// (py_mode_e, cpp_mode_e, ...).  T must have an enumerator named Outer.
//
// The Matcher<T>::Match() specializations read and update these fields
// directly.
template <typename T>
class Lexer {
 public:
  // Starts lexing at the beginning of `line`, in mode T::Outer.
  //
  // The lexer only reads the buffer, so a pointer to const is accepted;
  // callers holding a plain char* keep working through the implicit
  // conversion.  The matchers stop at a NUL byte, so `line` is expected to be
  // NUL-terminated.
  Lexer(const char* line)
      : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Moves on to a new line.  line_mode is deliberately left untouched, so a
  // mode entered on one line (e.g. inside a multi-line string) is still in
  // effect on the next.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
|
89 |
|
90 | template <typename T>
|
91 | class Matcher {
|
92 | public:
|
93 | // Returns whether EOL was hit. Mutates lexer state, and fills in tok out
|
94 | // param.
|
95 | bool Match(Lexer<T>* lexer, Token* tok);
|
96 | };
|
97 |
|
// Macros for semantic actions
//
// They expand inside the `while (true)` loop that wraps each re2c block in
// the Match() specializations, and rely on the local names `tok` and `lexer`
// there.  The trailing `break` is what leaves that loop, so these macros
// cannot be wrapped in do { } while (0).  Arguments are parenthesized so that
// any expression can be passed safely.

// Sets the token kind and finishes the current match.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Like TOK(), and also switches the lexer to mode `m`.
#define TOK_MODE(k, m)    \
  tok->id = (k);          \
  lexer->line_mode = (m); \
  break;

// Records the submatch [s, e) as offsets from the start of the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                        \
  tok->submatch_start = static_cast<int>((s) - lexer->line_); \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
|
113 |
|
// Regex definitions shared between languages
//
// The names defined here are used by the rules in the Match()
// specializations further down.

/*!re2c
  // No buffer refilling: the matchers stop at a NUL byte instead (see the
  // `nul` rule in each mode), so every line must be NUL-terminated.
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  // The cursor is the local variable `p` declared in each Match().
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  // Because of the trailing star, this also matches the empty string.
  whitespace = [ \t\r\n]*;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // Body of a quoted string: any mix of ordinary chars (not NUL, not the
  // quote, not a backslash) and backslash escapes of a single non-NUL char.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  // A complete quoted string, both quotes included
  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  // Runs from the # up to the NUL terminator
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
|
146 |
|
// Lexer modes for plain text.  There is only the one.
enum class text_mode_e {
  Outer,  // default
};
|
150 |
|
// Matcher for plain text.  It produces only WS and Other tokens.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched.  Otherwise fills in
// tok->id and tok->end_col, advances lexer->p_current past the token, and
// returns false.
template <>
bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  // Runs at most once: every rule either returns or ends in the `break`
  // supplied by TOK().
  while (true) {
    /*!re2c
      nul { return true; }

      // whitespace at start of line
      // re2c prefers the longest match, so this rule only wins when
      // everything up to the NUL is whitespace; otherwise the rule below
      // takes the whole line, leading whitespace included.
      whitespace { TOK(Id::WS); }

      // This rule consumes trailing whitespace, but
      // it's OK. We're counting significant lines, not
      // highlighting.
      [^\x00]+ { TOK(Id::Other); }

      * { TOK(Id::Other); }

    */
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
177 |
|
// Lexer modes for ASDL.  There is only the one.
enum class asdl_mode_e {
  Outer,  // default
};
|
181 |
|
// Matcher for ASDL: whitespace, identifiers, # comments, and other text.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched.  Otherwise fills in
// tok->id and tok->end_col, advances lexer->p_current past the token, and
// returns false.
template <>
bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  switch (lexer->line_mode) {
  case asdl_mode_e::Outer:
    // Runs at most once: every rule either returns or ends in the `break`
    // supplied by TOK().
    while (true) {
      /*!re2c
        nul { return true; }

        whitespace { TOK(Id::WS); }

        identifier { TOK(Id::Name); }

        // Runs to the end of the line
        pound_comment { TOK(Id::Comm); }

        // Not the start of a comment, identifier
        [^\x00#_a-zA-Z]+ { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        * { TOK(Id::Unknown); }

      */
    }
    break;
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
214 |
|
// Lexer modes for Python.  The two Multi* modes persist across lines until
// the matching closing triple quote is seen.
enum class py_mode_e {
  Outer,    // default
  MultiSQ,  // inside '''
  MultiDQ,  // inside """
};
|
220 |
|
// Matcher for Python.
//
// Single-line strings are matched whole in Outer mode.  A triple quote
// switches lexer->line_mode to MultiSQ / MultiDQ, and everything is then
// reported as Str until the matching triple quote switches back to Outer.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched (the mode stays as is).
// Otherwise fills in tok->id and tok->end_col, advances lexer->p_current past
// the token, and returns false.
template <>
bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;  // backtracking position for the re2c-generated code

  // In every mode the loop runs at most once: each rule either returns or
  // ends in the `break` supplied by TOK() / TOK_MODE().
  switch (lexer->line_mode) {
  case py_mode_e::Outer:
    while (true) {
      /*!re2c
        nul { return true; }

        whitespace { TOK(Id::WS); }

        identifier { TOK(Id::Name); }

        // Strings that open and close on the same line, optional raw prefix
        [r]? sq_string { TOK(Id::Str); }
        [r]? dq_string { TOK(Id::Str); }

        // optional raw prefix
        [r]? triple_sq { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
        [r]? triple_dq { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }

        pound_comment { TOK(Id::Comm); }

        // Not the start of a string, comment, identifier
        [^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        * { TOK(Id::Unknown); }

      */
    }
    break;

  case py_mode_e::MultiSQ:
    while (true) {
      /*!re2c
        nul { return true; }

        // Closing delimiter: back to normal code
        triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00']* { TOK(Id::Str); }

        // A quote that is not part of a closing delimiter
        * { TOK(Id::Str); }

      */
    }
    break;

  case py_mode_e::MultiDQ:
    while (true) {
      /*!re2c
        nul { return true; }

        // Closing delimiter: back to normal code
        triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00"]* { TOK(Id::Str); }

        // A quote that is not part of a closing delimiter
        * { TOK(Id::Str); }

      */
    }
    break;
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
291 |
|
// Lexer modes for C++.  The non-default modes persist across lines until
// their closing delimiter is seen.
enum class cpp_mode_e {
  Outer,     // default
  Comm,      // inside /* */ comment
  DelimStr,  // R"zz(string literal)zz"
  Re2c,      // inside a re2c block, i.e. a comment opened with /* !re2c
};
|
298 |
|
// Matcher for C++.
//
// Outer mode recognizes braces, identifiers, quoted literals, // comments and
// the openers of block comments, re2c blocks and raw strings.  The openers
// switch lexer->line_mode; the Comm and Re2c modes switch back to Outer on
// their own, while DelimStr leaves that to the caller (see below).
//
// For DelimStrBegin / DelimStrEnd tokens, tok->submatch_start and
// tok->submatch_end delimit the raw-string delimiter text.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched (the mode stays as is).
// Otherwise fills in tok->id and tok->end_col, advances lexer->p_current past
// the token, and returns false.
template <>
bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;  // backtracking position for the re2c-generated code
  const char *s, *e;  // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  // In every mode the loop runs at most once: each rule either returns or
  // ends in the `break` supplied by TOK() / TOK_MODE().
  switch (lexer->line_mode) {
  case cpp_mode_e::Outer:

    while (true) {
      /*!re2c
        nul { return true; }

        whitespace { TOK(Id::WS); }

        "{" { TOK(Id::LBrace); }
        "}" { TOK(Id::RBrace); }

        identifier { TOK(Id::Name); }

        // approximation for C++ char literals
        sq_string { TOK(Id::Str); }
        dq_string { TOK(Id::Str); }

        // Not the start of a string, comment, identifier
        [^\x00"'/_a-zA-Z{}]+ { TOK(Id::Other); }

        "//" not_nul* { TOK(Id::Comm); }

        // Treat re2c as preprocessor block
        // (the opener is spelled as two literals so that this file's own
        // enclosing comment is not disturbed)
        "/" "*!re2c" { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }

        "/" "*" { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }

        // Not sure what the rules are for R"zz(hello)zz". Make it similar to
        // here docs.
        cpp_delim_str = [_a-zA-Z]*;

        // The tags s and e bracket the delimiter text
        "R" ["] @s cpp_delim_str @e "(" {
          SUBMATCH(s, e);
          TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
        }

        // e.g. unclosed quote like "foo
        // A slash that does not begin a comment also ends up here, since it
        // is excluded from the Other class above.
        * { TOK(Id::Unknown); }

      */
    }
    break;

  case cpp_mode_e::Comm:
    // Search until next */
    while (true) {
      /*!re2c
        nul { return true; }

        "*" "/" { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }

        [^\x00*]* { TOK(Id::Comm); }

        // A star that is not part of the closing delimiter
        * { TOK(Id::Comm); }

      */
    }
    break;

  case cpp_mode_e::Re2c:
    // Search until next */
    while (true) {
      /*!re2c
        nul { return true; }

        "*" "/" { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }

        [^\x00*]* { TOK(Id::Re2c); }

        // A star that is not part of the closing delimiter
        * { TOK(Id::Re2c); }

      */
    }
    break;

  case cpp_mode_e::DelimStr:
    // Search until the next candidate raw string terminator: a closing paren,
    // then delimiter chars, then a double quote
    while (true) {
      /*!re2c
        nul { return true; }

        ")" @s cpp_delim_str @e ["] {
          SUBMATCH(s, e);
          TOK(Id::DelimStrEnd);

          // Caller is responsible for checking the extracted delimiter, and
          // setting mode back to Cpp::Outer!
        }

        [^\x00)]* { TOK(Id::Str); }

        // A closing paren that does not start a terminator
        * { TOK(Id::Str); }

      */
    }
    break;
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
412 |
|
413 | class Hook {
|
414 | public:
|
415 | // Return true if this is a preprocessor line, and fill in tokens
|
416 | // Caller should check last token for whether there is a continuation line.
|
417 | virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
|
418 | ;
|
419 | }
|
420 | virtual ~Hook() {
|
421 | }
|
422 | };
|
423 |
|
// Lexer modes for C preprocessor lines.  There is only the one.
enum class pp_mode_e {
  Outer,  // default
};
|
427 |
|
// Matcher for C preprocessor lines.
//
// Produces MaybePreproc for a directive such as #define (to be resolved in a
// fix-up pass), Comm for // comments, LineCont for a backslash-newline, WS,
// and PreprocOther for everything else.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched.  Otherwise fills in
// tok->id and tok->end_col, advances lexer->p_current past the token, and
// returns false.
template <>
bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;  // backtracking position for the re2c-generated code

  switch (lexer->line_mode) {
  case pp_mode_e::Outer:
    // Runs at most once: every rule either returns or ends in the `break`
    // supplied by TOK().
    while (true) {
      /*!re2c
        nul { return true; }

        // Resolved in fix-up pass
        // #include #define etc. only valid at the
        // beginning
        // (the rule itself is not anchored to the start of the line)
        [ \t]* "#" [a-z]+ { TOK(Id::MaybePreproc); }

        // C-style comments can end these lines
        "//" not_nul* { TOK(Id::Comm); }

        [\\] [\n] { TOK(Id::LineCont); }

        // A line could be all whitespace, then \ at the
        // end. And it's not significant
        whitespace { TOK(Id::WS); }

        // Not the start of a command, comment, or line
        // continuation
        [^\x00#/\\]+ { TOK(Id::PreprocOther); }

        // A single #, slash or backslash that matched none of the rules above
        * { TOK(Id::PreprocOther); }

      */
    }
    break;
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
469 |
|
470 | class CppHook : public Hook {
|
471 | public:
|
472 | virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
|
473 | };
|
474 |
|
// Lexer modes for R.  The quote modes persist across lines until the closing
// quote is seen.
enum class R_mode_e {
  Outer,  // default

  SQ,  // inside multi-line ''
  DQ,  // inside multi-line ""
};
|
481 |
|
// Matcher for R.
//
// Unlike the Python matcher, a quote always switches lexer->line_mode (to SQ
// or DQ), so a string may span lines; the matching unescaped quote switches
// back to Outer.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched (the mode stays as is).
// Otherwise fills in tok->id and tok->end_col, advances lexer->p_current past
// the token, and returns false.
template <>
bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;  // backtracking position for the re2c-generated code

  // In every mode the loop runs at most once: each rule either returns or
  // ends in the `break` supplied by TOK() / TOK_MODE().
  switch (lexer->line_mode) {
  case R_mode_e::Outer:
    while (true) {
      /*!re2c
        nul { return true; }

        whitespace { TOK(Id::WS); }

        pound_comment { TOK(Id::Comm); }

        identifier { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier
        [^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }

        // Opening quote: the quote itself is already part of the string
        ['] { TOK_MODE(Id::Str, R_mode_e::SQ); }
        ["] { TOK_MODE(Id::Str, R_mode_e::DQ); }

        * { TOK(Id::Unknown); }

      */
    }
    break;

  case R_mode_e::SQ:
    while (true) {
      /*!re2c
        nul { return true; }

        // Closing quote
        ['] { TOK_MODE(Id::Str, R_mode_e::Outer); }

        // String body, backslash escapes included
        sq_middle { TOK(Id::Str); }

        * { TOK(Id::Str); }

      */
    }
    break;

  case R_mode_e::DQ:
    while (true) {
      /*!re2c
        nul { return true; }

        // Closing quote
        ["] { TOK_MODE(Id::Str, R_mode_e::Outer); }

        // String body, backslash escapes included
        dq_middle { TOK(Id::Str); }

        * { TOK(Id::Str); }

      */
    }
    break;
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
547 |
|
548 | // Problem with shell: nested double quotes!!!
|
549 | // We probably discourage this in YSH
|
550 |
|
// Lexer modes for shell.  The quote modes persist across lines until the
// closing quote is seen.
enum class sh_mode_e {
  Outer,  // default

  SQ,        // inside multi-line ''
  DollarSQ,  // inside multi-line $''
  DQ,        // inside multi-line ""

  // We could have a separate thing for this
  // (Matcher<sh_mode_e>::Match() asserts when handed one of these modes)
  YshSQ,  // inside '''
  YshDQ,  // inside """
  YshJ,   // inside j"""
};
|
563 |
|
564 | // Returns whether EOL was hit
|
565 |
|
566 | // Submatch docs:
|
567 | // https://re2c.org/manual/manual_c.html#submatch-extraction
|
568 |
|
569 | template <>
|
570 | bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
|
571 | const char* p = lexer->p_current; // mutated by re2c
|
572 | const char* YYMARKER = p;
|
573 | const char *s, *e; // submatch extraction
|
574 |
|
575 | // Autogenerated tag variables used by the lexer to track tag values.
|
576 | /*!stags:re2c format = 'const char *@@;\n'; */
|
577 |
|
578 | switch (lexer->line_mode) {
|
579 | case sh_mode_e::Outer:
|
580 | while (true) {
|
581 | /*!re2c
|
582 | nul { return true; }
|
583 |
|
584 | whitespace { TOK(Id::WS); }
|
585 |
|
586 | // Resolved in fix-up pass
|
587 | pound_comment { TOK(Id::MaybeComment); }
|
588 |
|
589 | // not that relevant for shell
|
590 | identifier { TOK(Id::Name); }
|
591 |
|
592 | // Not the start of a string, escaped, comment, identifier, here doc
|
593 | [^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }
|
594 |
|
595 | // echo is like a string
|
596 | "\\" . { TOK(Id::Str); }
|
597 |
|
598 | ['] { TOK_MODE(Id::Str, sh_mode_e::SQ); }
|
599 | ["] { TOK_MODE(Id::Str, sh_mode_e::DQ); }
|
600 | "$'" { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
|
601 |
|
602 | // <<- is another syntax
|
603 | here_op = "<<" [-]? [ \t]*;
|
604 | h_delim = [_a-zA-Z][_a-zA-Z0-9]*;
|
605 |
|
606 | // unquoted or quoted
|
607 | here_op @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
608 | here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
609 | here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
610 | here_op "\\" @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
611 |
|
612 | // NOT Unknown, as in Python
|
613 | * { TOK(Id::Other); }
|
614 |
|
615 | */
|
616 | }
|
617 | break;
|
618 |
|
619 | case sh_mode_e::SQ:
|
620 | // Search until next ' unconditionally
|
621 | while (true) {
|
622 | /*!re2c
|
623 | nul { return true; }
|
624 |
|
625 | ['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
|
626 |
|
627 | [^\x00']* { TOK(Id::Str); }
|
628 |
|
629 | * { TOK(Id::Str); }
|
630 |
|
631 | */
|
632 | }
|
633 | break;
|
634 |
|
635 | case sh_mode_e::DQ:
|
636 | // Search until next " that's not preceded by "
|
637 | while (true) {
|
638 | /*!re2c
|
639 | nul { return true; }
|
640 |
|
641 | ["] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
|
642 |
|
643 | dq_middle { TOK(Id::Str); }
|
644 |
|
645 | * { TOK(Id::Str); }
|
646 |
|
647 | */
|
648 | }
|
649 | break;
|
650 |
|
651 | case sh_mode_e::DollarSQ:
|
652 | // Search until next ' that's not preceded by "
|
653 | while (true) {
|
654 | /*!re2c
|
655 | nul { return true; }
|
656 |
|
657 | ['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
|
658 |
|
659 | sq_middle { TOK(Id::Str); }
|
660 |
|
661 | * { TOK(Id::Str); }
|
662 |
|
663 | */
|
664 | }
|
665 | break;
|
666 | case sh_mode_e::YshSQ:
|
667 | case sh_mode_e::YshDQ:
|
668 | case sh_mode_e::YshJ:
|
669 | assert(0);
|
670 | }
|
671 |
|
672 | tok->end_col = p - lexer->line_;
|
673 | lexer->p_current = p;
|
674 | return false;
|
675 | }
|
676 |
|
// Lexer modes for HTML.  There is only the one.
enum class html_mode_e {
  Outer,  // default
};
|
680 |
|
// Matcher for HTML.  Incomplete: it only recognizes tags, and reports every
// other character as its own Other token.
//
// Returns whether EOL was hit: true when the NUL terminator is reached, in
// which case tok and the lexer are left untouched.  Otherwise fills in
// tok->id and tok->end_col, advances lexer->p_current past the token, and
// returns false.
template <>
bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;  // backtracking position for the re2c-generated code

  switch (lexer->line_mode) {
  case html_mode_e::Outer:
    // Runs at most once: every rule either returns or ends in the `break`
    // supplied by TOK().
    while (true) {
      /*!re2c
        nul { return true; }

        // Like _NAME in HTM8
        name = [a-zA-Z][a-zA-Z0-9:_-]* ;

        // The self-closing rule must come before the start tag rule: both
        // can match the same text, and re2c then picks the earlier rule.
        '</' name '>' { TOK(Id::EndTag); }
        '<' name [^>\x00]* '/>' { TOK(Id::StartEndTag); }
        '<' name [^>\x00]* '>' { TOK(Id::StartTag); }

        // TODO: Fill in the rest of the HTM8 lexer.

        // Any single character that does not start a tag
        * { TOK(Id::Other); }

      */
    }
    break;
  }

  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
713 |
|
714 |
|
715 | // TODO:
|
716 | // - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
|
717 | // - same as C++ raw string, I think
|
718 | // - similar to here docs, but less complex
|
719 | //
|
720 | // Inherent problems with "micro segmentation":
|
721 | //
|
722 | // - Nested double quotes in shell. echo "hi ${name:-"default"}"
|
723 | // - This means that lexing is **dependent on** parsing: does the second
|
724 | // double quote **close** the first one, or does it start a nested string?
|
725 | // - lexing is non-recursive, parsing is recursive
|
726 |
|
// Shell comments depend on operator chars: a '#' does not always start a
// comment.
//   echo one # comment
//   echo $(( 16#ff ))
|
730 |
|
731 | #endif // MICRO_SYNTAX_H
|