1 | #ifndef MICRO_SYNTAX_H
|
2 | #define MICRO_SYNTAX_H
|
3 |
|
4 | #include <assert.h>
|
5 | #include <string.h> // strlen()
|
6 |
|
7 | #include <vector>
|
8 |
|
// Kinds of token that the Matcher<T>::Match() specializations in this file
// can store in Token::id.  The enumerators are listed in their original
// order, so their numeric values are unchanged.
enum class Id {
  // ---- Common to nearly all languages ----

  Comm,          // comment text
  MaybeComment,  // shell only; resolved in a fix-up pass

  WS,  // run of whitespace

  Name,  // keyword or identifier
  Str,   // string text, which covers:
         //   ""  and Python r""
         //   ''  and Python r''
         //   ''' and """
         //   the body of here docs

  Other,    // any other text
  Unknown,  // nothing else matched, e.g. an unclosed quote; also the id of a
            // default-constructed Token

  // ---- C++ ----

  DelimStrBegin,  // opening of a raw string: R"zzz(hello)zzz"
  DelimStrEnd,    // candidate closing of a raw string
  Re2c,           // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text on a preprocessor line
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // ---- Shell ----

  HereBegin,  // here doc operator plus its delimiter
  HereEnd,

  // ---- Html ----

  TagNameLeft,   // start of <a> or <br id=foo />
  SelfClose,     // />
  TagNameRight,  // >
  EndTag,        // </a>
  CharEscape,    // character reference, e.g. &amp; &#38; &#x26;
  AttrName,      // foo=
  BadAmpersand,
  BadLessThan,
  BadGreaterThan,
  // Reused: Str Other

  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  // StartLine,

  // These are special zero-width tokens for Python
  // Indent,
  // Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
|
65 |
|
66 | struct Token {
|
67 | Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
|
68 | }
|
69 | Token(Id id, int end_col)
|
70 | : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
|
71 | }
|
72 |
|
73 | Id id;
|
74 | int end_col; // offset from char* line
|
75 | int submatch_start; // ditto
|
76 | int submatch_end; // ditto
|
77 | };
|
78 |
|
79 | // Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
|
80 |
|
// Cursor state for lexing one line at a time.
//
// T is a per-language mode enum; it must have an enumerator named Outer,
// which is the initial mode.
template <typename T>
class Lexer {
 public:
  // Starts at the beginning of `line`, in mode T::Outer.  The pointer is only
  // stored and read, never written through, so a const buffer is accepted.
  Lexer(const char* line)
      : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Points the lexer at the start of a new line.  line_mode is deliberately
  // left as it is, so a mode entered on one line is still in effect on the
  // next.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
|
96 |
|
// Tokenizer interface, parameterized on a per-language mode enum T.  Only the
// declaration of Match() appears here; the definitions are explicit
// specializations (template <> bool Matcher<...>::Match) further down.
template <typename T>
class Matcher {
 public:
  // Returns whether EOL was hit.  Mutates lexer state, and fills in tok out
  // param.
  bool Match(Lexer<T>* lexer, Token* tok);
};
|
104 |
|
// Macros for semantic actions
//
// They expand inside the re2c rule actions of the Match() functions and
// expect two names in scope: `tok` (the token being filled in) and `lexer`.
// TOK() and TOK_MODE() end with `break`, which leaves the enclosing
// `while (true)` loop, so they must not be wrapped in do { } while (0).
// Arguments are parenthesized so that expression arguments expand safely.

// Sets the token kind, then leaves the matching loop.
#define TOK(k)     \
  tok->id = (k);   \
  break;

// Sets the token kind, switches the lexer to mode m, then leaves the loop.
#define TOK_MODE(k, m)       \
  tok->id = (k);             \
  lexer->line_mode = (m);    \
  break;

// Records the submatch [s, e) as offsets from the start of the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                          \
  tok->submatch_start = static_cast<int>((s) - lexer->line_);   \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
|
120 |
|
121 | // Regex definitions shared between languages
|
122 |
|
/*!re2c
  // re2c configuration shared by every lexer block in this file.
  // - yyfill is disabled: no buffer refill hook is generated.  Each rule set
  //   has a `nul` rule that ends the scan when a NUL byte is reached.
  // - input code units are `char`
  // - the cursor is the local variable `p` that each Match() declares
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  // Note: `whitespace` may match the empty string; `space_required` may not.
  whitespace = [ \t\r\n]*;
  space_required = [ \t\r\n]+;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // String interiors: any char other than NUL, the closing quote, or a
  // backslash; or a backslash followed by any non-NUL char.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  // Complete single-line quoted strings
  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  // Runs from the # up to the NUL byte
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
|
154 |
|
// Lexer modes for plain text.  There is only one.
enum class text_mode_e {
  // default
  Outer,
};
|
158 |
|
// Plain-text matcher: splits a line into whitespace and "everything else".
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set, tok->end_col becomes the offset of the end of the match from
// lexer->line_, lexer->p_current is advanced past the match, and false is
// returned.
template <>
bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  // The loop exists so that the `break` inside TOK() jumps to the epilogue
  // below; each call produces at most one token.
  while (true) {
    /*!re2c
      nul                    { return true; }

      // whitespace at start of line
      whitespace             { TOK(Id::WS); }

      // This rule consumes trailing whitespace, but
      // it's OK.  We're counting significant lines, not
      // highlighting.
      [^\x00]+               { TOK(Id::Other); }

      *                      { TOK(Id::Other); }

    */
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
185 |
|
// Lexer modes for ASDL.  There is only one.
enum class asdl_mode_e {
  // default (and only) mode
  Outer,
};
|
189 |
|
// ASDL matcher: whitespace, identifiers, # comments, and other text.
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set, tok->end_col becomes the offset of the end of the match from
// lexer->line_, lexer->p_current is advanced past the match, and false is
// returned.
template <>
bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c

  switch (lexer->line_mode) {
  case asdl_mode_e::Outer:
    // TOK() ends in `break`, which exits this loop after one token
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        identifier             { TOK(Id::Name); }

        pound_comment          { TOK(Id::Comm); }

        // Not the start of a comment, identifier
        [^\x00#_a-zA-Z]+       { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
222 |
|
// Lexer modes for Python.  The two Multi* modes are entered at an opening
// triple quote and left at the matching closing one.
enum class py_mode_e {
  // default
  Outer,
  // inside '''
  MultiSQ,
  // inside """
  MultiDQ,
};
|
228 |
|
// Python matcher.  lexer->line_mode selects the rule set: Outer for ordinary
// code, MultiSQ / MultiDQ for the inside of a ''' or """ string.
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set (TOK_MODE also changes lexer->line_mode), tok->end_col becomes the
// offset of the end of the match from lexer->line_, lexer->p_current is
// advanced past the match, and false is returned.
template <>
bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case py_mode_e::Outer:
    // TOK() / TOK_MODE() end in `break`, which exits this loop after one
    // token
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        identifier             { TOK(Id::Name); }

        // single-line strings, with optional raw prefix
        [r]? sq_string         { TOK(Id::Str); }
        [r]? dq_string         { TOK(Id::Str); }

        // optional raw prefix
        // opening triple quote: switch to the matching multi-line mode
        [r]? triple_sq         { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
        [r]? triple_dq         { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }

        pound_comment          { TOK(Id::Comm); }

        // Not the start of a string, comment, identifier
        [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case py_mode_e::MultiSQ:
    // Everything is Str until the closing '''
    while (true) {
      /*!re2c
        nul       { return true; }

        triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00']* { TOK(Id::Str); }

        // a lone ' that does not start a closing triple quote
        *         { TOK(Id::Str); }

      */
    }
    break;

  case py_mode_e::MultiDQ:
    // Everything is Str until the closing """
    while (true) {
      /*!re2c
        nul       { return true; }

        triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }

        [^\x00"]* { TOK(Id::Str); }

        // a lone " that does not start a closing triple quote
        *         { TOK(Id::Str); }

      */
    }
    break;
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
299 |
|
// Lexer modes for C++.
enum class cpp_mode_e {
  // default
  Outer,
  // inside /* */ comment
  Comm,
  // inside a raw string: R"zz(string literal)zz"
  DelimStr,
  // inside a block opened by /* !re2c
  Re2c,
};
|
306 |
|
// C++ matcher.  lexer->line_mode selects the rule set: Outer for ordinary
// code, Comm inside a block comment, Re2c inside a re2c block, DelimStr
// inside a raw string literal.
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set (TOK_MODE also changes lexer->line_mode, SUBMATCH also fills
// tok->submatch_start / submatch_end), tok->end_col becomes the offset of
// the end of the match from lexer->line_, lexer->p_current is advanced past
// the match, and false is returned.
template <>
bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c
  const char *s, *e;                 // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  switch (lexer->line_mode) {
  case cpp_mode_e::Outer:

    // TOK() / TOK_MODE() end in `break`, which exits this loop after one
    // token
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        "{"                    { TOK(Id::LBrace); }
        "}"                    { TOK(Id::RBrace); }

        identifier             { TOK(Id::Name); }

        // approximation for C++ char literals
        sq_string              { TOK(Id::Str); }
        dq_string              { TOK(Id::Str); }

        // Not the start of a string, comment, identifier
        [^\x00"'/_a-zA-Z{}]+   { TOK(Id::Other); }

        "//" not_nul*          { TOK(Id::Comm); }

        // Treat re2c as preprocessor block
        "/" "*!re2c"           { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }

        "/" "*"                { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }

        // Not sure what the rules are for R"zz(hello)zz".  Make it similar to
        // here docs.
        cpp_delim_str = [_a-zA-Z]*;

        // The tags s and e bracket the delimiter between the quote and the
        // opening paren.
        "R" ["] @s cpp_delim_str @e "(" {
          SUBMATCH(s, e);
          TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
        }

        // e.g. unclosed quote like "foo
        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case cpp_mode_e::Comm:
    // Search until next */
    while (true) {
      /*!re2c
        nul       { return true; }

        "*" "/"   { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }

        [^\x00*]* { TOK(Id::Comm); }

        // a lone star that does not close the comment
        *         { TOK(Id::Comm); }

      */
    }
    break;

  case cpp_mode_e::Re2c:
    // Search until next */
    while (true) {
      /*!re2c
        nul       { return true; }

        "*" "/"   { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }

        [^\x00*]* { TOK(Id::Re2c); }

        // a lone star that does not close the block
        *         { TOK(Id::Re2c); }

      */
    }
    break;

  case cpp_mode_e::DelimStr:
    // Search until the next candidate terminator: a closing paren, the
    // delimiter, then a double quote
    while (true) {
      /*!re2c
        nul       { return true; }

        ")" @s cpp_delim_str @e ["] {
          SUBMATCH(s, e);
          TOK(Id::DelimStrEnd);

          // Caller is responsible for checking the extracted delimiter, and
          // setting mode back to Cpp::Outer!
        }

        [^\x00)]* { TOK(Id::Str); }

        // a closing paren that does not start a terminator
        *         { TOK(Id::Str); }

      */
    }
    break;
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
420 |
|
421 | class Hook {
|
422 | public:
|
423 | // Return true if this is a preprocessor line, and fill in tokens
|
424 | // Caller should check last token for whether there is a continuation line.
|
425 | virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
|
426 | ;
|
427 | }
|
428 | virtual ~Hook() {
|
429 | }
|
430 | };
|
431 |
|
// Lexer modes for C preprocessor lines.  There is only one.
enum class pp_mode_e {
  // default (and only) mode
  Outer,
};
|
435 |
|
// Preprocessor-line matcher: candidate directives, // comments, line
// continuations, whitespace, and other text.
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set, tok->end_col becomes the offset of the end of the match from
// lexer->line_, lexer->p_current is advanced past the match, and false is
// returned.
template <>
bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case pp_mode_e::Outer:
    // TOK() ends in `break`, which exits this loop after one token
    while (true) {
      /*!re2c
        nul                    { return true; }

                               // Resolved in fix-up pass
                               // #include #define etc. only valid at the
                               // beginning
        [ \t]* "#" [a-z]+      { TOK(Id::MaybePreproc); }

                               // C-style comments can end these lines
        "//" not_nul*          { TOK(Id::Comm); }

                               // backslash directly followed by newline
        [\\] [\n]              { TOK(Id::LineCont); }

                               // A line could be all whitespace, then \ at the
                               // end.  And it's not significant
        whitespace             { TOK(Id::WS); }

                               // Not the start of a command, comment, or line
                               // continuation
        [^\x00#/\\]+           { TOK(Id::PreprocOther); }

        *                      { TOK(Id::PreprocOther); }

      */
    }
    break;
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
477 |
|
478 | class CppHook : public Hook {
|
479 | public:
|
480 | virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
|
481 | };
|
482 |
|
// Lexer modes for R.
enum class R_mode_e {
  // default
  Outer,

  // inside multi-line ''
  SQ,
  // inside multi-line ""
  DQ,
};
|
489 |
|
// R matcher.  lexer->line_mode selects the rule set: Outer for ordinary
// code, SQ / DQ for the inside of a quoted string, which may continue on the
// next line because the mode is kept in the lexer.
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set (TOK_MODE also changes lexer->line_mode), tok->end_col becomes the
// offset of the end of the match from lexer->line_, lexer->p_current is
// advanced past the match, and false is returned.
template <>
bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  switch (lexer->line_mode) {
  case R_mode_e::Outer:
    // TOK() / TOK_MODE() end in `break`, which exits this loop after one
    // token
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        pound_comment          { TOK(Id::Comm); }

        identifier             { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier
        [^\x00"'#_a-zA-Z]+     { TOK(Id::Other); }

        // opening quote: switch to the matching string mode
        [']                    { TOK_MODE(Id::Str, R_mode_e::SQ); }
        ["]                    { TOK_MODE(Id::Str, R_mode_e::DQ); }

        *                      { TOK(Id::Unknown); }

      */
    }
    break;

  case R_mode_e::SQ:
    // Everything is Str until a ' that is not backslash-escaped
    while (true) {
      /*!re2c
        nul       { return true; }

        [']       { TOK_MODE(Id::Str, R_mode_e::Outer); }

        sq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;

  case R_mode_e::DQ:
    // Everything is Str until a " that is not backslash-escaped
    while (true) {
      /*!re2c
        nul       { return true; }

        ["]       { TOK_MODE(Id::Str, R_mode_e::Outer); }

        dq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
555 |
|
556 | // Problem with shell: nested double quotes!!!
|
557 | // We probably discourage this in YSH
|
558 |
|
// Lexer modes for shell.
enum class sh_mode_e {
  // default
  Outer,

  // inside multi-line ''
  SQ,
  // inside multi-line $''
  DollarSQ,
  // inside multi-line ""
  DQ,

  // We could have a separate thing for this
  // inside '''
  YshSQ,
  // inside """
  YshDQ,
  // inside j"""
  YshJ,
};
|
571 |
|
// Shell matcher.  lexer->line_mode selects the rule set: Outer for ordinary
// code, SQ / DQ / DollarSQ for the inside of a quoted string.  The Ysh*
// modes have no rules here and hit assert(0).
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set (TOK_MODE also changes lexer->line_mode, SUBMATCH also fills
// tok->submatch_start / submatch_end), tok->end_col becomes the offset of
// the end of the match from lexer->line_, lexer->p_current is advanced past
// the match, and false is returned.

// Submatch docs:
//   https://re2c.org/manual/manual_c.html#submatch-extraction

template <>
bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c
  const char *s, *e;                 // submatch extraction

  // Autogenerated tag variables used by the lexer to track tag values.
  /*!stags:re2c format = 'const char *@@;\n'; */

  switch (lexer->line_mode) {
  case sh_mode_e::Outer:
    // TOK() / TOK_MODE() end in `break`, which exits this loop after one
    // token
    while (true) {
      /*!re2c
        nul                    { return true; }

        whitespace             { TOK(Id::WS); }

        // Resolved in fix-up pass
        pound_comment          { TOK(Id::MaybeComment); }

        // not that relevant for shell
        identifier             { TOK(Id::Name); }

        // Not the start of a string, escaped, comment, identifier, here doc
        [^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }

        // echo is like a string
        // (a backslash followed by one character)
        "\\" .                 { TOK(Id::Str); }

        // opening quote: switch to the matching string mode
        [']                    { TOK_MODE(Id::Str, sh_mode_e::SQ); }
        ["]                    { TOK_MODE(Id::Str, sh_mode_e::DQ); }
        "$'"                   { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }

        // <<- is another syntax
        here_op = "<<" [-]? [ \t]*;
        h_delim = [_a-zA-Z][_a-zA-Z0-9]*;

        // unquoted or quoted
        // The tags s and e bracket the delimiter name only, without quotes
        // or the leading backslash.
        here_op      @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op [']  @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op ["]  @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
        here_op "\\" @s h_delim @e     { SUBMATCH(s, e); TOK(Id::HereBegin); }

        // NOT Unknown, as in Python
        *                      { TOK(Id::Other); }

      */
    }
    break;

  case sh_mode_e::SQ:
    // Search until next ' unconditionally
    while (true) {
      /*!re2c
        nul       { return true; }

        [']       { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        [^\x00']* { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;

  case sh_mode_e::DQ:
    // Search until the next " that is not escaped by a backslash
    while (true) {
      /*!re2c
        nul       { return true; }

        ["]       { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        dq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;

  case sh_mode_e::DollarSQ:
    // Search until the next ' that is not escaped by a backslash
    while (true) {
      /*!re2c
        nul       { return true; }

        [']       { TOK_MODE(Id::Str, sh_mode_e::Outer); }

        sq_middle { TOK(Id::Str); }

        *         { TOK(Id::Str); }

      */
    }
    break;
  case sh_mode_e::YshSQ:
  case sh_mode_e::YshDQ:
  case sh_mode_e::YshJ:
    // No rules for these modes.
    // NOTE(review): when assert() is compiled out, control falls through to
    // the epilogue without consuming input or setting tok->id.
    assert(0);
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
684 |
|
// Lexer modes for HTML.
enum class html_mode_e {
  // <NAME enters the TAG state
  Outer,
  // inside a tag, expecting: NAME="  NAME='  NAME=  NAME
  AttrName,
  // after NAME=, expecting a quoted or unquoted value
  AttrValue,
  // inside a single-quoted value: respects Chars, can contain "
  SQ,
  // inside a double-quoted value: respects Chars, can contain '
  DQ,
};
|
692 |
|
// LeftStartTag -> RightStartTag  <a href=/ >
// LeftStartTag -> SelfClose      <br id=foo />

// HTML matcher.  lexer->line_mode selects the rule set:
//   Outer     - text between tags
//   AttrName  - inside a start tag, between attributes
//   AttrValue - right after NAME=
//   DQ / SQ   - inside a quoted attribute value
//
// Returns whether EOL was hit: true when the cursor is at the NUL byte, in
// which case neither `tok` nor `lexer` is modified.  Otherwise tok->id is
// set (TOK_MODE also changes lexer->line_mode), tok->end_col becomes the
// offset of the end of the match from lexer->line_, lexer->p_current is
// advanced past the match, and false is returned.
template <>
bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
  const char* p = lexer->p_current;  // mutated by re2c
  const char* YYMARKER = p;          // backtracking position for re2c

  /*!re2c
    // Common definitions
    // NOTE(review): per the re2c manual, single-quoted literals match
    // case-insensitively - confirm this is intended for the rules below.

    // Like _NAME_RE in HTM8
    name      = [a-zA-Z][a-zA-Z0-9:_-]* ;

    // TODO: check this pattern
    // named, decimal and hexadecimal character references
    char_name = '&' [a-zA-Z][a-zA-Z0-9]* ';' ;
    char_dec  = '&#' [0-9]+ ';' ;
    char_hex  = '&#x' [0-9a-fA-F]+ ';'  ;
  */

  switch (lexer->line_mode) {
  case html_mode_e::Outer:
    // TOK() / TOK_MODE() end in `break`, which exits this loop after one
    // token
    while (true) {
      /*!re2c
        // accepted EOF
        nul       { return true; }

        char_name { TOK(Id::CharEscape); }
        char_dec  { TOK(Id::CharEscape); }
        char_hex  { TOK(Id::CharEscape); }

        // a bare special character that starts nothing longer
        '&'       { TOK(Id::BadAmpersand); }
        '>'       { TOK(Id::BadGreaterThan); }
        '<'       { TOK(Id::BadLessThan); }

        '</' name '>' { TOK(Id::EndTag); }

        '<' name {
          TOK_MODE(Id::TagNameLeft, html_mode_e::AttrName);
          // TODO: <script> <style> - special logic for strstr()
        }

        '<!' [^\x00>]* '>' { TOK(Id::Str); }

        // TODO: use strstr() to end these?
        // Problem: they all need their own mode, just like cpp_mode_e::Comm
        //   html_mode_e::{Comm,Processing,CData,Script,Style}
        '<!--'      { TOK(Id::Str); }
        '<?'        { TOK(Id::Str); }
        '<![CDATA[' { TOK(Id::Str); }


        // Like RawData
        *         { TOK(Id::Other); }

      */
    }
    break;
  case html_mode_e::AttrName:
    while (true) {
      /*!re2c
        nul       { return true; }  // TODO: error

        // end of the start tag: back to Outer
        '>'       { TOK_MODE(Id::TagNameRight, html_mode_e::Outer); }
        '/>'      { TOK_MODE(Id::SelfClose, html_mode_e::Outer); }

        space_required name {
          // <a missing> - stay in the AttrName mode
          TOK(Id::AttrName);
        }

        space_required name whitespace '=' whitespace {
          // NAME= NAME=' NAME=" - expecting a value
          TOK_MODE(Id::AttrName, html_mode_e::AttrValue);
        }

        *         { TOK(Id::Unknown); }
      */
    }
    break;
  case html_mode_e::AttrValue:
    while (true) {
      /*!re2c
        nul       { return true; }  // TODO: error

        // opening quote: switch to the matching quoted-value mode
        '"'       { TOK_MODE(Id::Str, html_mode_e::DQ); }
        "'"       { TOK_MODE(Id::Str, html_mode_e::SQ); }

        // Unquoted value - a single token
        unquoted_value = [^\x00 \r\n\t<>&"']+ ;

        unquoted_value { TOK_MODE(Id::Str, html_mode_e::AttrName); }

        *         { TOK(Id::Unknown); }
      */
    }
    break;

  case html_mode_e::DQ:
    while (true) {
      /*!re2c
        nul       { return true; }  // TODO: error
        char_name { TOK(Id::CharEscape); }
        char_dec  { TOK(Id::CharEscape); }
        char_hex  { TOK(Id::CharEscape); }

        // we would only need these for translation to XML, not
        // highlighting?
        '&'       { TOK(Id::BadAmpersand); }
        '>'       { TOK(Id::BadGreaterThan); }
        '<'       { TOK(Id::BadLessThan); }

        // closing quote: back to AttrName
        '"'       { TOK_MODE(Id::Str, html_mode_e::AttrName); }
        *         { TOK(Id::Str); }
      */
    }
    break;
  case html_mode_e::SQ:
    while (true) {
      /*!re2c
        nul       { return true; }  // TODO: error
        char_name { TOK(Id::CharEscape); }
        char_dec  { TOK(Id::CharEscape); }
        char_hex  { TOK(Id::CharEscape); }

        // we would only need these for translation to XML, not
        // highlighting?
        '&'       { TOK(Id::BadAmpersand); }
        '>'       { TOK(Id::BadGreaterThan); }
        '<'       { TOK(Id::BadLessThan); }
        // closing quote: back to AttrName
        "'"       { TOK_MODE(Id::Str, html_mode_e::AttrName); }

        *         { TOK(Id::Str); }
      */
    }
    break;
  default:
    assert(0);
  }

  // Epilogue: publish the end of the match and advance the lexer
  tok->end_col = p - lexer->line_;
  lexer->p_current = p;
  return false;
}
|
838 |
|
839 | // TODO:
|
840 | // - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
|
841 | // - same as C++ raw string, I think
|
842 | // - similar to here docs, but less complex
|
843 | //
|
844 | // Inherent problems with "micro segmentation":
|
845 | //
|
846 | // - Nested double quotes in shell. echo "hi ${name:-"default"}"
|
847 | // - This means that lexing is **dependent on** parsing: does the second
|
848 | // double quote **close** the first one, or does it start a nested string?
|
849 | // - lexing is non-recursive, parsing is recursive
|
850 |
|
// Shell comments depend on operator chars: the same # starts a comment in
// the first line below, but not in the second.
//   echo one # comment
//   echo $(( 16#ff ))
|
854 |
|
855 | #endif // MICRO_SYNTAX_H
|