doctools/micro_syntax.re2c.h

OILS / doctools / micro_syntax.re2c.h View on Github | oils.pub

731 lines, 548 significant

1	#ifndef MICRO_SYNTAX_H
2	#define MICRO_SYNTAX_H
3
4	#include <assert.h>
5	#include <string.h> // strlen()
6
7	#include <vector>
8
// Token kinds stored in Token::id by the Matcher<T>::Match() specializations
// in this file. Kinds named Maybe* are provisional; per the notes on them they
// are resolved by a later fix-up pass that is not part of this header.
9	enum class Id {
10	// Common to nearly all languages
11	Comm,
12	MaybeComment, // for shell, resolved in a fix-up pass
13
14	WS,
15
16	Name, // Keyword or Identifier
17	Str, // "" and Python r""
18	// '' and Python r''
19	// ''' """
20	// body of here docs
21
22	Other, // any other text
23	Unknown,
24
25	// C++
26	DelimStrBegin, // for C++ R"zzz(hello)zzz"
27	DelimStrEnd,
28	Re2c, // re2c code block
29
30	MaybePreproc, // resolved to PreprocCommand/PreprocOther in fix-up pass
31	PreprocCommand, // resolved #define
32	PreprocOther, // any other text
33	LineCont, // backslash at end of line, for #define continuation
34
35	// Braces for C++ block structure. Could be done in second pass after
36	// removing comments/strings?
37	LBrace,
38	RBrace,
39
40	// Shell
41	HereBegin,
42	HereEnd,
43
44	// Html
45	StartTag,
46	EndTag,
47	StartEndTag,
48
49	// Zero-width token to detect #ifdef and Python INDENT/DEDENT
50	// StartLine,
51
52	// These are special zero-width tokens for Python
53	// Indent,
54	// Dedent,
55	// Maintain our own stack!
56	// https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
57	};
58
59	struct Token {
60	Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
61	}
62	Token(Id id, int end_col)
63	: id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
64	}
65
66	Id id;
67	int end_col; // offset from char* line
68	int submatch_start; // ditto
69	int submatch_end; // ditto
70	};
71
72	// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
73
// Cursor state for lexing one line at a time.
//
// T is the language's mode enum (py_mode_e, cpp_mode_e, ...). It must provide
// an `Outer` enumerator, which is the mode every lexer starts in. The matchers
// in this file stop when they reach a NUL byte, so `line` is expected to be
// NUL-terminated.
template <typename T>
class Lexer {
 public:
  // Starts at the beginning of `line` in mode T::Outer. The buffer is only
  // ever read through const pointers, so a const buffer is accepted.
  Lexer(const char* line) : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Points the lexer at a new line and rewinds the cursor to its start.
  // line_mode is not reset, so a mode entered on one line (for example
  // "inside a multi-line string") is still in effect on the next line.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
89
// Per-language matching entry point. Only the declaration lives in the primary
// template; each language in this file provides an explicit specialization of
// Match() for its own mode enum.
90	template <typename T>
91	class Matcher {
92	public:
93	// Returns whether EOL was hit. Mutates lexer state, and fills in tok out
94	// param.
95	bool Match(Lexer<T>* lexer, Token* tok);
96	};
97
// Macros for semantic actions
//
// They are expanded inside the `while (true)` loop that wraps each re2c block
// and rely on two names being in scope at the expansion site: `tok` (the
// output token) and `lexer` (the lexer state). TOK and TOK_MODE end in
// `break`, which leaves that loop; for the same reason they cannot be wrapped
// in do { } while (0). Arguments are parenthesized so that any expression can
// be passed.

// Set the token kind to k and leave the matching loop.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Set the token kind to k, switch the lexer to mode m, and leave the loop.
#define TOK_MODE(k, m)     \
  tok->id = (k);           \
  lexer->line_mode = (m);  \
  break;

// Record the submatch [s, e) as offsets from the start of the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                          \
  tok->submatch_start = static_cast<int>((s) - lexer->line_);   \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
113
// Regex definitions shared between languages

/*!re2c
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  whitespace = [ \t\r\n]*;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // Body of a quoted string: any run of characters that are not NUL, the
  // closing quote, or a backslash, or a backslash followed by any non-NUL
  // character.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
146
// Plain text has a single mode; its lexer only separates whitespace from
// everything else.
147	enum class text_mode_e {
148	Outer, // default
149	};
150
// Lexes the next token of a plain-text line, starting at lexer->p_current.
//
// When the cursor is on the NUL terminator, returns true without touching *tok
// or the lexer. Otherwise sets tok->id (Id::WS or Id::Other), stores the offset
// just past the match in tok->end_col, advances lexer->p_current to that
// position, and returns false. The TOK() actions `break` out of the
// `while (true)` loop to reach that shared epilogue.
151	// Returns whether EOL was hit
152	template <>
153	bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
154	const char* p = lexer->p_current; // mutated by re2c
155
156	while (true) {
157	/*!re2c
158	nul { return true; }
159
160	// whitespace at start of line
161	whitespace { TOK(Id::WS); }
162
163	// This rule consumes trailing whitespace, but
164	// it's OK. We're counting significant lines, not
165	// highlighting.
166	[^\x00]+ { TOK(Id::Other); }
167
168	* { TOK(Id::Other); }
169
170	*/
171	}
172
173	tok->end_col = p - lexer->line_;
174	lexer->p_current = p;
175	return false;
176	}
177
// ASDL has a single mode.
178	enum class asdl_mode_e {
179	Outer,
180	};
181
// Lexes the next token of an ASDL line, starting at lexer->p_current.
//
// When the cursor is on the NUL terminator, returns true without touching *tok
// or the lexer. Otherwise sets tok->id to one of Id::WS, Id::Name, Id::Comm
// (a # comment running to the end of the buffer), Id::Other or Id::Unknown,
// stores the offset just past the match in tok->end_col, advances
// lexer->p_current, and returns false. TOK() breaks out of the inner loop; the
// `break` after the loop then leaves the switch.
182	// Returns whether EOL was hit
183	template <>
184	bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
185	const char* p = lexer->p_current; // mutated by re2c
186
187	switch (lexer->line_mode) {
188	case asdl_mode_e::Outer:
189	while (true) {
190	/*!re2c
191	nul { return true; }
192
193	whitespace { TOK(Id::WS); }
194
195	identifier { TOK(Id::Name); }
196
197	pound_comment { TOK(Id::Comm); }
198
199	// Not the start of a comment, identifier
200	[^\x00#_a-zA-Z]+ { TOK(Id::Other); }
201
202	// e.g. unclosed quote like "foo
203	* { TOK(Id::Unknown); }
204
205	*/
206	}
207	break;
208	}
209
210	tok->end_col = p - lexer->line_;
211	lexer->p_current = p;
212	return false;
213	}
214
// Python lexer modes. The two Multi* modes are entered by an opening triple
// quote and left by the matching closing triple quote, so they can span lines.
215	enum class py_mode_e {
216	Outer, // default
217	MultiSQ, // inside '''
218	MultiDQ, // inside """
219	};
220
// Lexes the next token of a Python line, starting at lexer->p_current, using
// the rule set selected by lexer->line_mode.
//
// When the cursor is on the NUL terminator, returns true without touching *tok
// or the lexer. Otherwise sets tok->id, may switch lexer->line_mode (via
// TOK_MODE), stores the offset just past the match in tok->end_col, advances
// lexer->p_current, and returns false.
//
// - Outer: WS, Name, single-line strings with an optional r prefix (Str),
//   # comments (Comm), Other, and Unknown for anything else such as an
//   unclosed quote. An opening ''' or """ yields Str and enters MultiSQ or
//   MultiDQ.
// - MultiSQ / MultiDQ: everything is Str; the closing triple quote returns the
//   lexer to Outer.
221	// Returns whether EOL was hit
222	template <>
223	bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
224	const char* p = lexer->p_current; // mutated by re2c
225	const char* YYMARKER = p;
226
227	switch (lexer->line_mode) {
228	case py_mode_e::Outer:
229	while (true) {
230	/*!re2c
231	nul { return true; }
232
233	whitespace { TOK(Id::WS); }
234
235	identifier { TOK(Id::Name); }
236
237	[r]? sq_string { TOK(Id::Str); }
238	[r]? dq_string { TOK(Id::Str); }
239
240	// optional raw prefix
241	[r]? triple_sq { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
242	[r]? triple_dq { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }
243
244	pound_comment { TOK(Id::Comm); }
245
246	// Not the start of a string, comment, identifier
247	[^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
248
249	// e.g. unclosed quote like "foo
250	* { TOK(Id::Unknown); }
251
252	*/
253	}
254	break;
255
256	case py_mode_e::MultiSQ:
257	while (true) {
258	/*!re2c
259	nul { return true; }
260
261	triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }
262
263	[^\x00']* { TOK(Id::Str); }
264
265	* { TOK(Id::Str); }
266
267	*/
268	}
269	break;
270
271	case py_mode_e::MultiDQ:
272	while (true) {
273	/*!re2c
274	nul { return true; }
275
276	triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }
277
278	[^\x00"]* { TOK(Id::Str); }
279
280	* { TOK(Id::Str); }
281
282	*/
283	}
284	break;
285	}
286
287	tok->end_col = p - lexer->line_;
288	lexer->p_current = p;
289	return false;
290	}
291
292	enum class cpp_mode_e {
293	Outer, // default
294	Comm, // inside /* */ comment
295	DelimStr, // R"zz(string literal)zz"
296	Re2c, // /* !re2c
297	};
298
299	// Returns whether EOL was hit
300	template <>
301	bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
302	const char* p = lexer->p_current; // mutated by re2c
303	const char* YYMARKER = p;
304	const char s, e; // submatch extraction
305
306	// Autogenerated tag variables used by the lexer to track tag values.
307	/!stags:re2c format = 'const char @@;\n'; */
308
309	switch (lexer->line_mode) {
310	case cpp_mode_e::Outer:
311
312	while (true) {
313	/*!re2c
314	nul { return true; }
315
316	whitespace { TOK(Id::WS); }
317
318	"{" { TOK(Id::LBrace); }
319	"}" { TOK(Id::RBrace); }
320
321	identifier { TOK(Id::Name); }
322
323	// approximation for C++ char literals
324	sq_string { TOK(Id::Str); }
325	dq_string { TOK(Id::Str); }
326
327	// Not the start of a string, comment, identifier
328	[^\x00"'/_a-zA-Z{}]+ { TOK(Id::Other); }
329
330	"//" not_nul* { TOK(Id::Comm); }
331
332	// Treat re2c as preprocessor block
333	"/" "*!re2c" { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }
334
335	"/" "*" { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }
336
337	// Not sure what the rules are for R"zz(hello)zz". Make it similar to
338	// here docs.
339	cpp_delim_str = [_a-zA-Z]*;
340
341	"R" ["] @s cpp_delim_str @e "(" {
342	SUBMATCH(s, e);
343	TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
344	}
345
346	// e.g. unclosed quote like "foo
347	* { TOK(Id::Unknown); }
348
349	*/
350	}
351	break;
352
353	case cpp_mode_e::Comm:
354	// Search until next */
355	while (true) {
356	/*!re2c
357	nul { return true; }
358
359	"*" "/" { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }
360
361	[^\x00] { TOK(Id::Comm); }
362
363	* { TOK(Id::Comm); }
364
365	*/
366	}
367	break;
368
369	case cpp_mode_e::Re2c:
370	// Search until next */
371	while (true) {
372	/*!re2c
373	nul { return true; }
374
375	"*" "/" { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }
376
377	[^\x00] { TOK(Id::Re2c); }
378
379	* { TOK(Id::Re2c); }
380
381	*/
382	}
383	break;
384
385	case cpp_mode_e::DelimStr:
386	// Search until next */
387	while (true) {
388	/*!re2c
389	nul { return true; }
390
391	")" @s cpp_delim_str @e ["] {
392	SUBMATCH(s, e);
393	TOK(Id::DelimStrEnd);
394
395	// Caller is responsible for checking the extracted delimiter, and
396	// setting mode back to Cpp::Outer!
397	}
398
399	[^\x00)]* { TOK(Id::Str); }
400
401	* { TOK(Id::Str); }
402
403	*/
404	}
405	break;
406	}
407
408	tok->end_col = p - lexer->line_;
409	lexer->p_current = p;
410	return false;
411	}
412
413	class Hook {
414	public:
415	// Return true if this is a preprocessor line, and fill in tokens
416	// Caller should check last token for whether there is a continuation line.
417	virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
418	;
419	}
420	virtual ~Hook() {
421	}
422	};
423
// The preprocessor-line lexer has a single mode.
424	enum class pp_mode_e {
425	Outer,
426	};
427
// Lexes the next token of a preprocessor line, starting at lexer->p_current.
//
// When the cursor is on the NUL terminator, returns true without touching *tok
// or the lexer. Otherwise sets tok->id to one of Id::MaybePreproc (optional
// blanks, '#', lowercase letters), Id::Comm (a // comment), Id::LineCont (a
// backslash followed by a newline), Id::WS or Id::PreprocOther, stores the
// offset just past the match in tok->end_col, advances lexer->p_current, and
// returns false.
428	// Returns whether EOL was hit
429	template <>
430	bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
431	const char* p = lexer->p_current; // mutated by re2c
432	const char* YYMARKER = p;
433
434	switch (lexer->line_mode) {
435	case pp_mode_e::Outer:
436	while (true) {
437	/*!re2c
438	nul { return true; }
439
440	// Resolved in fix-up pass
441	// #include #define etc. only valid at the
442	// beginning
443	[ \t]* "#" [a-z]+ { TOK(Id::MaybePreproc); }
444
445	// C-style comments can end these lines
446	"//" not_nul* { TOK(Id::Comm); }
447
448	[\\] [\n] { TOK(Id::LineCont); }
449
450	// A line could be all whitespace, then \ at the
451	// end. And it's not significant
452	whitespace { TOK(Id::WS); }
453
454	// Not the start of a command, comment, or line
455	// continuation
456	[^\x00#/\\]+ { TOK(Id::PreprocOther); }
457
458	* { TOK(Id::PreprocOther); }
459
460	*/
461	}
462	break;
463	}
464
465	tok->end_col = p - lexer->line_;
466	lexer->p_current = p;
467	return false;
468	}
469
470	class CppHook : public Hook {
471	public:
472	virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
473	};
474
// R lexer modes. SQ and DQ are entered by an opening quote and left by the
// matching closing quote, so a string may span several lines.
475	enum class R_mode_e {
476	Outer, // default
477
478	SQ, // inside multi-line ''
479	DQ, // inside multi-line ""
480	};
481
// Lexes the next token of an R line, starting at lexer->p_current, using the
// rule set selected by lexer->line_mode.
//
// When the cursor is on the NUL terminator, returns true without touching *tok
// or the lexer. Otherwise sets tok->id, may switch lexer->line_mode (via
// TOK_MODE), stores the offset just past the match in tok->end_col, advances
// lexer->p_current, and returns false.
//
// - Outer: WS, Comm (# comment), Name, Other, Unknown. A quote character
//   yields Str and enters SQ or DQ.
// - SQ / DQ: everything is Str; the closing quote returns the lexer to Outer.
482	// Returns whether EOL was hit
483	template <>
484	bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
485	const char* p = lexer->p_current; // mutated by re2c
486	const char* YYMARKER = p;
487
488	switch (lexer->line_mode) {
489	case R_mode_e::Outer:
490	while (true) {
491	/*!re2c
492	nul { return true; }
493
494	whitespace { TOK(Id::WS); }
495
496	pound_comment { TOK(Id::Comm); }
497
498	identifier { TOK(Id::Name); }
499
500	// Not the start of a string, escaped, comment, identifier
501	[^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
502
503	['] { TOK_MODE(Id::Str, R_mode_e::SQ); }
504	["] { TOK_MODE(Id::Str, R_mode_e::DQ); }
505
506	* { TOK(Id::Unknown); }
507
508	*/
509	}
510	break;
511
512	case R_mode_e::SQ:
513	while (true) {
514	/*!re2c
515	nul { return true; }
516
517	['] { TOK_MODE(Id::Str, R_mode_e::Outer); }
518
519	sq_middle { TOK(Id::Str); }
520
521	* { TOK(Id::Str); }
522
523	*/
524	}
525	break;
526
527	case R_mode_e::DQ:
528	while (true) {
529	/*!re2c
530	nul { return true; }
531
532	["] { TOK_MODE(Id::Str, R_mode_e::Outer); }
533
534	dq_middle { TOK(Id::Str); }
535
536	* { TOK(Id::Str); }
537
538	*/
539	}
540	break;
541	}
542
543	tok->end_col = p - lexer->line_;
544	lexer->p_current = p;
545	return false;
546	}
547
548	// Problem with shell: nested double quotes!!!
549	// We probably discourage this in YSH
550
551	enum class sh_mode_e {
552	Outer, // default
553
554	SQ, // inside multi-line ''
555	DollarSQ, // inside multi-line $''
556	DQ, // inside multi-line ""
557
558	// We could have a separate thing for this
559	YshSQ, // inside '''
560	YshDQ, // inside """
561	YshJ, // inside j"""
562	};
563
564	// Returns whether EOL was hit
565
566	// Submatch docs:
567	// https://re2c.org/manual/manual_c.html#submatch-extraction
568
569	template <>
570	bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
571	const char* p = lexer->p_current; // mutated by re2c
572	const char* YYMARKER = p;
573	const char s, e; // submatch extraction
574
575	// Autogenerated tag variables used by the lexer to track tag values.
576	/!stags:re2c format = 'const char @@;\n'; */
577
578	switch (lexer->line_mode) {
579	case sh_mode_e::Outer:
580	while (true) {
581	/*!re2c
582	nul { return true; }
583
584	whitespace { TOK(Id::WS); }
585
586	// Resolved in fix-up pass
587	pound_comment { TOK(Id::MaybeComment); }
588
589	// not that relevant for shell
590	identifier { TOK(Id::Name); }
591
592	// Not the start of a string, escaped, comment, identifier, here doc
593	[^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }
594
595	// echo is like a string
596	"\\" . { TOK(Id::Str); }
597
598	['] { TOK_MODE(Id::Str, sh_mode_e::SQ); }
599	["] { TOK_MODE(Id::Str, sh_mode_e::DQ); }
600	"$'" { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
601
602	// <<- is another syntax
603	here_op = "<<" [-]? [ \t]*;
604	h_delim = [_a-zA-Z][_a-zA-Z0-9]*;
605
606	// unquoted or quoted
607	here_op @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
608	here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
609	here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
610	here_op "\\" @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
611
612	// NOT Unknown, as in Python
613	* { TOK(Id::Other); }
614
615	*/
616	}
617	break;
618
619	case sh_mode_e::SQ:
620	// Search until next ' unconditionally
621	while (true) {
622	/*!re2c
623	nul { return true; }
624
625	['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
626
627	[^\x00']* { TOK(Id::Str); }
628
629	* { TOK(Id::Str); }
630
631	*/
632	}
633	break;
634
635	case sh_mode_e::DQ:
636	// Search until next " that's not preceded by "
637	while (true) {
638	/*!re2c
639	nul { return true; }
640
641	["] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
642
643	dq_middle { TOK(Id::Str); }
644
645	* { TOK(Id::Str); }
646
647	*/
648	}
649	break;
650
651	case sh_mode_e::DollarSQ:
652	// Search until next ' that's not preceded by "
653	while (true) {
654	/*!re2c
655	nul { return true; }
656
657	['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
658
659	sq_middle { TOK(Id::Str); }
660
661	* { TOK(Id::Str); }
662
663	*/
664	}
665	break;
666	case sh_mode_e::YshSQ:
667	case sh_mode_e::YshDQ:
668	case sh_mode_e::YshJ:
669	assert(0);
670	}
671
672	tok->end_col = p - lexer->line_;
673	lexer->p_current = p;
674	return false;
675	}
676
// The HTML lexer has a single mode.
677	enum class html_mode_e {
678	Outer,
679	};
680
// Lexes the next token of an HTML line, starting at lexer->p_current.
//
// When the cursor is on the NUL terminator, returns true without touching *tok
// or the lexer. Otherwise sets tok->id to Id::EndTag, Id::StartEndTag
// (self-closing, ends in "/>"), Id::StartTag, or Id::Other for any other
// character, stores the offset just past the match in tok->end_col, advances
// lexer->p_current, and returns false. Per the TODO below, this lexer is
// incomplete.
681	// Returns whether EOL was hit
682	template <>
683	bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
684	const char* p = lexer->p_current; // mutated by re2c
685	const char* YYMARKER = p;
686
687	switch (lexer->line_mode) {
688	case html_mode_e::Outer:
689	while (true) {
690	/*!re2c
691	nul { return true; }
692
693	// Like _NAME in HTM8
694	name = [a-zA-Z][a-zA-Z0-9:_-]* ;
695
696	'</' name '>' { TOK(Id::EndTag); }
697	'<' name [^>\x00]* '/>' { TOK(Id::StartEndTag); }
698	'<' name [^>\x00]* '>' { TOK(Id::StartTag); }
699
700	// TODO: Fill in the rest of the HTM8 lexer.
701
702	* { TOK(Id::Other); }
703
704	*/
705	}
706	break;
707	}
708
709	tok->end_col = p - lexer->line_;
710	lexer->p_current = p;
711	return false;
712	}
713
714
715	// TODO:
716	// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
717	// - same as C++ raw string, I think
718	// - similar to here docs, but less complex
719	//
720	// Inherent problems with "micro segmentation":
721	//
722	// - Nested double quotes in shell. echo "hi ${name:-"default"}"
723	// - This means that lexing is dependent on parsing: does the second
724	// double quote close the first one, or does it start a nested string?
725	// - lexing is non-recursive, parsing is recursive
726
727	// Shell Comments depend on operator chars
728	// echo one # comment
729	// echo $(( 16#ff ))'
730
731	#endif // MICRO_SYNTAX_H