doctools/micro_syntax.cc

OILS / doctools / micro_syntax.cc View on Github | oils.pub

1157 lines, 733 significant

1	// Micro Syntax
2	//
3	// See doctools/micro-syntax.md
4
5	#include "micro_syntax.h" // requires -I $BASE_DIR
6
7	#include <assert.h>
8	#include <errno.h>
9	#include <getopt.h>
10	#include <stdarg.h> // va_list, etc.
11	#include <stdbool.h>
12	#include <stdio.h>
13	#include <stdlib.h> // free
14	#include <string.h>
15
16	#include <string>
17	#include <vector>
18
// ANSI terminal escape sequences (SGR codes).  AnsiPrinter and AnsiOutput
// write these to stdout around the text they color.

// Text attributes
const char* RESET = "\x1b[0;0m";  // emitted after each colored span
const char* BOLD = "\x1b[1m";
const char* UNDERLINE = "\x1b[4m";
const char* REVERSE = "\x1b[7m";  // reverse video

// Foreground colors, SGR 30-37
const char* BLACK = "\x1b[30m";
const char* RED = "\x1b[31m";
const char* GREEN = "\x1b[32m";
const char* YELLOW = "\x1b[33m";
const char* BLUE = "\x1b[34m";
const char* PURPLE = "\x1b[35m";
const char* CYAN = "\x1b[36m";
const char* WHITE = "\x1b[37m";

// Foreground colors, SGR 90-97 (presumably the high-intensity variants of the
// colors above -- depends on the terminal)
const char* BLACK2 = "\x1b[90m";
const char* RED2 = "\x1b[91m";
const char* BLUE2 = "\x1b[94m";
36
// Writes a printf-style formatted message to stderr, then a newline.
void Log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);

  fputc('\n', stderr);
}
44
// Languages the lexer can be run in.  main() maps the -l argument to one of
// these and ScanFiles() dispatches on it.
//
// Note: Ysh is declared here, but main() has no -l value for it and
// ScanFiles() has no case for it.
enum class lang_e {
  PlainText,

  Cpp,  // including C
  Py,
  Shell,
  Ysh,  // ''' etc.
  Html,
  Asdl,
  R,  // uses # comments

  // JS,  // uses // comments
};
58
// Reads a FILE* one line at a time with getline().
//
// Protocol: call NextLine().  If it returns false there was an I/O error and
// err_num_ holds the errno value.  If it returns true, Current() is either the
// line that was just read, or nullptr at end-of-file.
//
// The Reader does not own or close the FILE*.
class Reader {
  // We don't care about internal NUL, so this interface doesn't allow it

 public:
  // f: stream to read from
  // filename: used for error messages only; nullptr means stdin
  Reader(FILE* f, const char* filename)
      : f_(f),
        filename_(filename),
        line_(nullptr),
        allocated_size_(0),
        err_num_(0) {
  }

  const char* Filename() {  // for error messages only, nullptr for stdin
    return filename_;
  }

  bool NextLine() {
    // Returns false if there was an error, and sets err_num_.
    // Returns true if not error, and Current() can be checked.

    // getline() returns -1 for both EOF and errors, and the only way to tell
    // them apart here is errno.  Clear it first, because an earlier library
    // call may have left a stale value behind, which would otherwise turn a
    // plain EOF into a bogus I/O error.
    errno = 0;

    // Note: getline() frees the previous line, so we don't have to
    ssize_t len = getline(&line_, &allocated_size_, f_);

    if (len < 0) {  // EOF is -1
      int saved_errno = errno;  // read it before calling anything else

      // man page says the buffer should be freed if getline() fails
      free(line_);

      line_ = nullptr;  // tell the caller not to continue
      allocated_size_ = 0;  // keep the (pointer, size) pair consistent

      if (saved_errno != 0) {  // I/O error
        err_num_ = saved_errno;
        return false;
      }
    }
    return true;
  }

  char* Current() {
    // Returns nullptr on EOF.
    return line_;
  }

  FILE* f_;
  const char* filename_;

  char* line_;  // valid for one NextLine() call, nullptr on EOF or error
  size_t allocated_size_;  // only used by getline()
  int err_num_;  // set on error, 0 otherwise
};
104
// Interface for rendering the tokens of a file.
//
// The OutputStream implementations in this file call, for every line:
// PrintLineNumber(), then PrintToken() once per token, then PrintLineEnd().
class Printer {
 public:
  // Called before the first token of each line.
  virtual void PrintLineNumber(int line_num) = 0;
  // Called after the last token of each line.  Does nothing by default.
  virtual void PrintLineEnd() {
  }
  // Prints one token.  Implementations in this file treat the token's text as
  // the bytes of `line` from start_col up to token.end_col.
  virtual void PrintToken(const char* line, int line_num, int start_col,
                          Token token) = 0;
  // Hands over buffered output by swapping it into *s.  The default asserts;
  // it is called by NetStringOutput::PathEnd(), so printers used with that
  // stream must override it.
  virtual void Swap(std::string* s) {
    assert(0);
  }
  virtual ~Printer() {
  }
};
118
119	class HtmlPrinter : public Printer {
120	public:
121	HtmlPrinter() : Printer(), out_() {
122	}
123
124	virtual void Swap(std::string* s) {
125	// assert(s != nullptr);
126	out_.swap(*s);
127	}
128
129	virtual void PrintLineNumber(int line_num) {
130	char buf[16];
131	snprintf(buf, 16, "%d", line_num);
132
133	out_.append("<tr><td class=num>"); // <tr> closed by PrintLineEnd()
134	out_.append(buf);
135	out_.append("</td><td id=L"); // jump to line with foo.html#L32
136	out_.append(buf);
137	out_.append(" class=line>"); // <td> closed by PrintLineEnd()
138	}
139
140	virtual void PrintLineEnd() {
141	out_.append("</td></tr>");
142	}
143
144	virtual void PrintToken(const char* line, int line_num, int start_col,
145	Token tok) {
146	const char* p_start = line + start_col;
147	int num_bytes = tok.end_col - start_col;
148
149	switch (tok.id) {
150	case Id::Comm:
151	PrintSpan("comm", p_start, num_bytes);
152	break;
153
154	case Id::Name:
155	PrintEscaped(p_start, num_bytes);
156	break;
157
158	case Id::PreprocCommand:
159	case Id::LineCont:
160	PrintSpan("preproc", p_start, num_bytes);
161	break;
162
163	case Id::Re2c:
164	PrintSpan("re2c", p_start, num_bytes);
165	break;
166
167	case Id::Other:
168	// PrintSpan("other", p_start, num_bytes);
169	PrintEscaped(p_start, num_bytes);
170	break;
171
172	// for now these are strings
173	case Id::HereBegin:
174	case Id::HereEnd:
175	case Id::Str:
176	PrintSpan("str", p_start, num_bytes);
177	break;
178
179	case Id::LBrace:
180	case Id::RBrace:
181	PrintSpan("brace", p_start, num_bytes);
182	break;
183
184	case Id::Unknown:
185	PrintSpan("x", p_start, num_bytes);
186	break;
187
188	default:
189	PrintEscaped(p_start, num_bytes);
190	break;
191	}
192	}
193
194	private:
195	void PrintEscaped(const char* s, int len) {
196	// HTML escape the code string
197	for (int i = 0; i < len; ++i) {
198	char c = s[i];
199
200	switch (c) {
201	case '<':
202	out_.append("<");
203	break;
204	case '>':
205	out_.append(">");
206	break;
207	case '&':
208	out_.append("&");
209	break;
210	default:
211	// Is this inefficient? Fill 1 char
212	out_.append(1, s[i]);
213	break;
214	}
215	}
216	}
217
218	void PrintSpan(const char* css_class, const char* s, int len) {
219	out_.append("<span class=");
220	out_.append(css_class);
221	out_.append(">");
222
223	PrintEscaped(s, len);
224
225	out_.append("</span>");
226	}
227
228	std::string out_;
229	};
230
// Command line options, filled in by main().
struct Flags {
  lang_e lang;  // -l LANG
  bool tsv;  // -t
  bool web;  // -w
  bool more_color;  // -m
  bool comments_only;  // -o

  // Arguments remaining after the flags: main() sets these to argv + optind
  // and argc - optind.
  int argc;
  char** argv;
};
241
242	class AnsiPrinter : public Printer {
243	public:
244	AnsiPrinter(const Flags& flag) : Printer(), flag_(flag) {
245	}
246
247	virtual void PrintLineNumber(int line_num) {
248	if (flag_.comments_only) {
249	return;
250	}
251	printf("%s%5d%s ", BLACK2, line_num, RESET);
252	}
253
254	virtual void PrintToken(const char* line, int line_num, int start_col,
255	Token tok) {
256	const char* p_start = line + start_col;
257	int num_bytes = tok.end_col - start_col;
258	switch (tok.id) {
259	case Id::Comm:
260	if (flag_.comments_only) {
261	PrintAlways(p_start, num_bytes);
262	} else {
263	PrintColor(BLUE, p_start, num_bytes);
264	}
265	break;
266
267	case Id::Name:
268	PrintText(p_start, num_bytes);
269	break;
270
271	case Id::PreprocCommand:
272	case Id::LineCont:
273	PrintColor(PURPLE, p_start, num_bytes);
274	break;
275
276	case Id::Re2c:
277	PrintColor(PURPLE, p_start, num_bytes);
278	break;
279
280	case Id::Other:
281	if (flag_.more_color) {
282	PrintColor(PURPLE, p_start, num_bytes);
283	} else {
284	PrintText(p_start, num_bytes);
285	}
286	break;
287
288	case Id::WS:
289	if (flag_.more_color) {
290	fputs(REVERSE, stdout);
291	PrintColor(WHITE, p_start, num_bytes);
292	} else {
293	PrintText(p_start, num_bytes);
294	}
295	break;
296
297	case Id::Str:
298	PrintColor(RED, p_start, num_bytes);
299	break;
300
301	case Id::HereBegin:
302	case Id::HereEnd: {
303	PrintColor(RED2, p_start, num_bytes);
304
305	// Debug submatch extraction
306	#if 0
307	fputs(RED, stdout);
308	int n = tok.submatch_len;
309	fwrite(tok.submatch_start, 1, n, stdout);
310	fputs(RESET, stdout);
311	#endif
312	} break;
313
314	case Id::DelimStrBegin:
315	case Id::DelimStrEnd: {
316	PrintColor(RED2, p_start, num_bytes);
317
318	// Debug submatch extraction
319	#if 0
320	fputs(RED, stdout);
321	int n = tok.submatch_len;
322	fwrite(tok.submatch_start, 1, n, stdout);
323	fputs(RESET, stdout);
324	#endif
325	} break;
326
327	case Id::LBrace:
328	case Id::RBrace:
329	PrintColor(GREEN, p_start, num_bytes);
330	break;
331
332	case Id::TagNameLeft:
333	case Id::TagNameRight:
334	PrintColor(PURPLE, p_start, num_bytes);
335	break;
336
337	case Id::SelfClose:
338	case Id::EndTag:
339	PrintColor(RED2, p_start, num_bytes);
340	break;
341
342	case Id::CharEscape:
343	PrintColor(BLUE, p_start, num_bytes);
344	break;
345
346	case Id::Unknown:
347	case Id::BadAmpersand:
348	case Id::BadGreaterThan:
349	case Id::BadLessThan:
350	// Make errors red
351	fputs(REVERSE, stdout);
352	PrintColor(RED, p_start, num_bytes);
353	break;
354
355	default:
356	PrintText(p_start, num_bytes);
357	break;
358	}
359	}
360
361	private:
362	void PrintColor(const char* color, const char* s, int n) {
363	fputs(color, stdout);
364	PrintText(s, n);
365	fputs(RESET, stdout);
366	}
367
368	void PrintText(const char* s, int n) {
369	if (flag_.comments_only) {
370	for (int i = 0; i < n; ++i) {
371	// Replace everything but newline with space
372	// TODO: I think we always want a newline token, including in comments.
373	// That will simplify this.
374	char c = (s[i] == '\n') ? '\n' : ' ';
375	fwrite(&c, 1, 1, stdout);
376	}
377	} else {
378	fwrite(s, 1, n, stdout);
379	}
380	}
381
382	void PrintAlways(const char* s, int n) {
383	fwrite(s, 1, n, stdout);
384	}
385
386	const Flags& flag_;
387	};
388
389	const char* Id_str(Id id) {
390	switch (id) {
391	case Id::Comm:
392	return "Comm";
393	case Id::MaybeComment: // fix-up doesn't guarantee this is gone
394	return "MaybeComment";
395	case Id::WS:
396	return "WS";
397	case Id::Re2c:
398	return "Re2c";
399
400	case Id::MaybePreproc: // fix-up doesn't guarantee this is gone
401	return "MaybePreproc";
402	case Id::PreprocCommand:
403	return "PreprocCommand";
404	case Id::PreprocOther:
405	return "PreprocOther";
406	case Id::LineCont:
407	return "LineCont";
408
409	case Id::Name:
410	return "Name";
411	case Id::Other:
412	return "Other";
413
414	case Id::Str:
415	return "Str";
416
417	case Id::HereBegin:
418	return "HereBegin";
419	case Id::HereEnd:
420	return "HereEnd";
421	case Id::DelimStrBegin:
422	return "DelimStrBegin";
423	case Id::DelimStrEnd:
424	return "DelimStrEnd";
425
426	case Id::LBrace:
427	return "LBrace";
428	case Id::RBrace:
429	return "RBrace";
430
431	case Id::Unknown:
432	return "Unknown";
433	default:
434	assert(0);
435	}
436	}
437
438	class TsvPrinter : public Printer {
439	public:
440	virtual void PrintLineNumber(int line_num) {
441	;
442	}
443
444	virtual void Swap(std::string* s) {
445	// out_.swap(*s);
446	}
447
448	virtual void PrintToken(const char* line, int line_num, int start_col,
449	Token tok) {
450	printf("%d\t%s\t%d\t%d\n", line_num, Id_str(tok.id), start_col,
451	tok.end_col);
452	// printf(" -> mode %d\n", lexer.line_mode);
453	}
454	virtual ~TsvPrinter() {
455	}
456	};
457
458	bool TokenIsSignificant(Id id) {
459	switch (id) {
460	case Id::Name:
461	case Id::Other:
462	case Id::PreprocCommand:
463	case Id::PreprocOther:
464	case Id::Re2c:
465	return true;
466
467	// Comments, whitespace, and string literals aren't significant
468	// TODO: can abort on Id::Unknown?
469	default:
470	break;
471	}
472	return false;
473	}
474
// Interface for the output of the whole run, which may cover several files.
//
// ScanFiles() calls PathBegin() for each file; ScanOne() then calls Line() for
// each line and PathEnd() once the file has been read without error.
class OutputStream {
  // stdout contains either
  // - netstrings of HTML, or TSV Token structs
  // - ANSI text

 public:
  // pr is stored but not owned: main() deletes the Printer and the
  // OutputStream separately.
  OutputStream(Printer* pr) : pr_(pr) {
  }
  // path is nullptr for stdin
  virtual void PathBegin(const char* path) = 0;
  // line_num is the line number handed on to the Printer
  virtual void Line(int line_num, const char* line,
                    const std::vector<Token>& tokens) = 0;
  virtual void PathEnd(int num_lines, int num_sig_lines) = 0;
  virtual ~OutputStream() {
  }

 protected:
  Printer* pr_;  // how to print each file
};
493
494	class NetStringOutput : public OutputStream {
495	public:
496	NetStringOutput(Printer* pr) : OutputStream(pr) {
497	}
498
499	virtual void PathBegin(const char* path) {
500	if (path == nullptr) {
501	path = "<stdin>";
502	}
503	PrintNetString(path, strlen(path));
504	}
505
506	virtual void Line(int line_num, const char* line,
507	const std::vector<Token>& tokens) {
508	pr_->PrintLineNumber(line_num);
509
510	int start_col = 0;
511	for (auto tok : tokens) {
512	pr_->PrintToken(line, line_num, start_col, tok);
513	start_col = tok.end_col;
514	}
515
516	pr_->PrintLineEnd();
517	}
518
519	virtual void PathEnd(int num_lines, int num_sig_lines) {
520	std::string string_for_file;
521	pr_->Swap(&string_for_file);
522
523	PrintNetString(string_for_file.c_str(), string_for_file.size());
524
525	// Output summary in JSON
526	// TODO: change this to a 4th column
527	char buf[64];
528	int n = snprintf(buf, 64, "{\"num_lines\": %d, \"num_sig_lines\": %d}",
529	num_lines, num_sig_lines);
530	PrintNetString(buf, n);
531	}
532
533	private:
534	void PrintNetString(const char* s, int len) {
535	fprintf(stdout, "%d:%*s,", len, len, s);
536	}
537	};
538
539	class AnsiOutput : public OutputStream {
540	public:
541	AnsiOutput(Printer* pr) : OutputStream(pr) {
542	}
543
544	// TODO: Can respect --comments-only
545	virtual void PathBegin(const char* path) {
546	if (path == nullptr) {
547	path = "<stdin>";
548	}
549	// diff uses +++ ---
550	printf("\n");
551	printf("=== %s%s%s%s ===\n", BOLD, PURPLE, path, RESET);
552	printf("\n");
553	}
554
555	virtual void Line(int line_num, const char* line,
556	const std::vector<Token>& tokens) {
557	pr_->PrintLineNumber(line_num);
558
559	int start_col = 0;
560	for (auto tok : tokens) {
561	pr_->PrintToken(line, line_num, start_col, tok);
562	start_col = tok.end_col;
563	}
564
565	pr_->PrintLineEnd();
566	};
567
568	// TODO: Can respect --comments-only
569	virtual void PathEnd(int num_lines, int num_sig_lines) {
570	fprintf(stdout, "%s%d lines, %d significant%s\n", GREEN, num_lines,
571	num_sig_lines, RESET);
572	};
573	};
574
575	void PrintTokens(std::vector<Token>& toks) {
576	int start_col = 0;
577	int i = 0;
578	Log("===");
579	for (auto tok : toks) {
580	Log("%2d %10s %2d %2d", i, Id_str(tok.id), start_col, tok.end_col);
581	start_col = tok.end_col;
582	++i;
583	}
584	Log("===");
585	}
586
587	// BUGGY, needs unit tests
588
589	// Fiddly function, reduces the size of the output a bit
590	// "hi" becomes 1 Id::DQ token instead of 3 separate Id::DQ tokens
591	void Optimize(std::vector<Token>* tokens) {
592	std::vector<Token>& toks = *tokens; // alias
593
594	// PrintTokens(toks);
595
596	int n = toks.size();
597	if (n < 1) { // nothing to de-duplicate
598	return;
599	}
600
601	int left = 0;
602	int right = 1;
603	while (right < n) {
604	Log("right ID = %s, end %d", Id_str(toks[right].id), toks[right].end_col);
605
606	if (toks[left].id == toks[right].id) {
607	// Join the tokens together
608	toks[left].end_col = toks[right].end_col;
609	} else {
610	toks[left] = toks[right];
611	left++;
612	Log(" not eq, left = %d", left);
613	}
614	right++;
615	}
616	Log("left = %d, right = %d", left, right);
617
618	// Fiddly condition: one more iteration. Need some unit tests for this.
619	toks[left] = toks[right - 1];
620	left++;
621	assert(left <= n);
622
623	// Erase the remaining ones
624	toks.resize(left);
625
626	// PrintTokens(toks);
627	}
628
629	// Version of the above that's not in-place, led to a bug fix
630	void Optimize2(std::vector<Token>* tokens) {
631	std::vector<Token> optimized;
632
633	int n = tokens->size();
634	if (n < 1) {
635	return;
636	}
637
638	optimized.reserve(n);
639
640	int left = 0;
641	int right = 1;
642	while (right < n) {
643	optimized.push_back((*tokens)[left]);
644	left++;
645	right++;
646	}
647	optimized.push_back((*tokens)[left]);
648	left++;
649
650	tokens->swap(optimized);
651	}
652
// Returns true if `line` closes a here doc that was opened with `here_delim`:
// the delimiter, followed by at least one byte, where everything after the
// delimiter is whitespace.  So EOF matches EOF\n and EOF\t\n, but not EOF2\n,
// and not a bare EOF with nothing after it.
bool LineEqualsHereDelim(const char* line, std::string& here_delim) {
  // Hack: skip leading tab unconditionally, even though that's only allowed in
  // <<-  Really we should capture the operator and the delim?
  if (line[0] == '\t') {
    ++line;
  }

  const size_t line_len = strlen(line);
  const size_t delim_len = here_delim.size();

  // Line should be at least one longer, EOF\n
  if (line_len <= delim_len) {
    return false;
  }

  // The line must start with the delimiter, byte for byte
  if (memcmp(line, here_delim.data(), delim_len) != 0) {
    return false;
  }

  // Everything after the delimiter must be whitespace, i.e. the run of
  // whitespace has to reach the terminating NUL
  const char* rest = line + delim_len;
  return rest[strspn(rest, " \t\r\n")] == '\0';
}
697
698	void CppHook::TryPreprocess(char* line, std::vector<Token>* tokens) {
699	// Fills tokens, which can be checked for beginning and end tokens
700
701	Lexer<pp_mode_e> lexer(line);
702	Matcher<pp_mode_e> matcher;
703
704	while (true) { // tokens on each line
705	Token tok;
706	// Log("Match %d", lexer.p_current - lexer.line_);
707	bool eol = matcher.Match(&lexer, &tok);
708	// Log("EOL %d", eol);
709	if (eol) {
710	break;
711	}
712	// Log("TOK %s %d", Id_str(tok.id), tok.end_col);
713	tokens->push_back(tok); // make a copy
714	}
715	}
716
717	void FixShellComments(std::vector<Token>& tokens) {
718	int n = tokens.size();
719	for (int i = 0; i < n; ++i) {
720	// # comment at start of line
721	if (tokens[i].id == Id::MaybeComment) {
722	if (i == 0) {
723	tokens[i].id = Id::Comm;
724	}
725	if (i != 0 and tokens[i - 1].id == Id::WS) {
726	tokens[i].id = Id::Comm;
727	}
728	}
729	}
730	}
731
732	// This templated method causes some code expansion, but not too much. The
733	// binary went from 38 KB to 42 KB, after being stripped.
734	// We get a little type safety with py_mode_e vs cpp_mode_e.
735
736	template <typename T>
737	int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
738	Lexer<T> lexer(nullptr);
739	Matcher<T> matcher;
740
741	int line_num = 1;
742	int num_sig = 0;
743
744	std::vector<std::string> here_list; // delimiters to pop
745	std::vector<int> here_start_num;
746
747	// For multi-line strings. This has 0 or 1 entries, and the 1 entry can be
748	// the empty string.
749	std::vector<std::string> delim_begin;
750
751	while (true) { // read each line, handling errors
752	if (!reader->NextLine()) {
753	const char* name = reader->Filename() ?: "<stdin>";
754	Log("micro-syntax: getline() error on %s: %s", name,
755	strerror(reader->err_num_));
756	return 1;
757	}
758	char* line = reader->Current();
759	if (line == nullptr) {
760	break; // EOF
761	}
762
763	std::vector<Token> pre_tokens;
764
765	hook->TryPreprocess(line, &pre_tokens);
766
767	// e.g #define at beginning of line
768	if (pre_tokens.size() && pre_tokens[0].id == Id::MaybePreproc) {
769	pre_tokens[0].id = Id::PreprocCommand;
770
771	out->Line(line_num, line, pre_tokens);
772
773	line_num += 1;
774	num_sig += 1;
775
776	Token last = pre_tokens.back();
777	while (last.id == Id::LineCont) {
778	const char* blame = reader->Filename() ?: "<stdin>";
779	if (!reader->NextLine()) {
780	Log("micro-syntax: getline() error on %s: %s", blame,
781	strerror(reader->err_num_));
782	return 1;
783	}
784	char* line = reader->Current();
785	if (line == nullptr) {
786	Log("Unexpected end-of-file in preprocessor in %s", blame);
787	return 1;
788	}
789
790	pre_tokens.clear();
791	hook->TryPreprocess(line, &pre_tokens);
792
793	out->Line(line_num, line, pre_tokens);
794
795	line_num += 1;
796	num_sig += 1;
797
798	last = pre_tokens.back();
799	}
800	continue; // Skip the rest of the loop
801	}
802
803	//
804	// Main Loop for "normal" lines (not preprocessor or here doc)
805	//
806
807	std::vector<Token> tokens;
808	lexer.SetLine(line);
809
810	bool line_is_sig = false;
811	while (true) { // tokens on each line
812	Token tok;
813	bool eol = matcher.Match(&lexer, &tok);
814	if (eol) {
815	break;
816	}
817
818	switch (tok.id) {
819	case Id::HereBegin: {
820	// Put a copy on the stack
821	int n = tok.submatch_end - tok.submatch_start;
822	here_list.emplace_back(line + tok.submatch_start, n);
823	here_start_num.push_back(line_num);
824	} break;
825
826	case Id::DelimStrBegin: {
827	if (delim_begin.empty()) {
828	int n = tok.submatch_end - tok.submatch_start;
829	delim_begin.emplace_back(line + tok.submatch_start, n);
830	} else {
831	// We have entered cpp_mode_e::DelimStr, which means we should never
832	// return another DelimStrBegin
833	assert(0);
834	}
835	} break;
836
837	case Id::DelimStrEnd: {
838	if (delim_begin.empty()) {
839	// We should never get this unless we got a DelimStrBegin first
840	assert(0);
841	} else {
842	size_t n = tok.submatch_end - tok.submatch_start;
843	std::string end_delim(line + tok.submatch_start, n);
844
845	if (end_delim == delim_begin.back()) {
846	lexer.line_mode = T::Outer; // the string is ended
847	delim_begin.pop_back();
848	} else {
849	tok.id = Id::Str; // mismatched delimiter is just a string
850	}
851	}
852	} break;
853
854	// TODO: I think we need a mode to escape into strstr(), for
855	// C++ - ending */
856	// HTML - ending --> ?> ]]> </SCRipt>
857	//
858	// So instead of returning 'eol', we can return a string to search for?
859	// Then we keep looking for more lines.
860	//
861	// This is similar to the problems of here doc and C++ multi-line
862	// strings. The main difference is that we're not using a submatch.
863	default:
864	break;
865	}
866
867	tokens.push_back(tok); // make a copy
868
869	if (TokenIsSignificant(tok.id)) {
870	line_is_sig = true;
871	}
872	}
873
874	#if 0
875	PrintTokens(tokens);
876	Log("%d tokens before", tokens.size());
877	Optimize(&tokens);
878	Log("%d tokens after", tokens.size());
879	PrintTokens(tokens);
880	#endif
881
882	FixShellComments(tokens);
883
884	out->Line(line_num, line, tokens);
885	tokens.clear();
886
887	// Potentially multiple here docs for this line
888	int here_index = 0;
889	for (auto here_delim : here_list) {
890	// Log("HERE %s", here_delim.c_str());
891
892	while (true) {
893	const char* blame = reader->Filename() ?: "<stdin>";
894	if (!reader->NextLine()) {
895	Log("micro-syntax: getline() error on %s: %s", blame,
896	strerror(reader->err_num_));
897	return 1;
898	}
899	char* line = reader->Current();
900	if (line == nullptr) {
901	int start_line = here_start_num[here_index];
902	Log("Unexpected end-of-file in here doc in %s, start line %d", blame,
903	start_line);
904	return 1;
905	}
906
907	line_num++;
908
909	if (LineEqualsHereDelim(line, here_delim)) {
910	int n = strlen(line);
911	Token whole_line(Id::HereEnd, n);
912	tokens.push_back(whole_line);
913	out->Line(line_num, line, tokens);
914	tokens.clear();
915	break;
916
917	} else {
918	int n = strlen(line);
919	Token whole_line(Id::Str, n);
920	tokens.push_back(whole_line);
921	out->Line(line_num, line, tokens);
922	tokens.clear();
923
924	// Log(" not equal: %s", line);
925	}
926	}
927	here_index++;
928	}
929	here_list.clear();
930	here_start_num.clear();
931
932	line_num++;
933	num_sig += line_is_sig;
934	}
935
936	out->PathEnd(line_num - 1, num_sig);
937	return 0;
938	}
939
940	int ScanFiles(const Flags& flag, std::vector<char> files, OutputStream out,
941	Hook* hook) {
942	Reader* reader = nullptr;
943
944	int status = 0;
945	for (auto path : files) {
946	FILE* f;
947	if (path == nullptr) {
948	f = stdin;
949	} else {
950	f = fopen(path, "r");
951	if (f == nullptr) {
952	Log("Error opening %s: %s", path, strerror(errno));
953	return 1;
954	}
955	}
956	out->PathBegin(path);
957
958	reader = new Reader(f, path);
959
960	switch (flag.lang) {
961	case lang_e::PlainText:
962	status = ScanOne<text_mode_e>(reader, out, hook);
963	break;
964
965	case lang_e::Py:
966	status = ScanOne<py_mode_e>(reader, out, hook);
967	break;
968
969	case lang_e::Cpp:
970	status = ScanOne<cpp_mode_e>(reader, out, hook);
971	break;
972
973	case lang_e::Shell:
974	status = ScanOne<sh_mode_e>(reader, out, hook);
975	break;
976
977	case lang_e::Asdl:
978	status = ScanOne<asdl_mode_e>(reader, out, hook);
979	break;
980
981	case lang_e::R:
982	status = ScanOne<R_mode_e>(reader, out, hook);
983	break;
984
985	case lang_e::Html:
986	status = ScanOne<html_mode_e>(reader, out, hook);
987	break;
988
989	default:
990	assert(0);
991	}
992
993	delete reader;
994
995	if (path == nullptr) {
996	;
997	} else {
998	fclose(f);
999	}
1000
1001	if (status != 0) {
1002	break;
1003	}
1004	}
1005
1006	return status;
1007	}
1008
// Prints the usage message to stdout.
//
// The text is a raw string literal, so it is printed exactly as written: the
// language alternatives are separated by a plain | because a backslash would
// show up in the output.
//
// NOTE(review): the text lists long options and the flags -n, -e and --color,
// but main() only parses the short flags in "+hl:motw" -- confirm whether the
// rest are planned or should be removed from the help.
void PrintHelp() {
  puts(R"(Usage: micro-syntax FLAGS* FILE*

Recognizes the syntax of each file, and prints it to stdout.

If there are no files, reads stdin.

Flags:
-h --help This help

-l --lang Language: py|cpp|shell|...
-t Print tokens as TSV, instead of ANSI color
-w Print HTML for the web

-m More color, useful for debugging tokens

-n --no-comments Omit comments
-o --comments-only Only print comments
-e --empty-strs Substitute string literals for empty strings
--color on off always more

)");
}
1032
1033	int main(int argc, char** argv) {
1034	Flags flag = {lang_e::PlainText};
1035
1036	// http://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
1037	// + means to be strict about flag parsing.
1038	int c;
1039	while ((c = getopt(argc, argv, "+hl:motw")) != -1) {
1040	switch (c) {
1041	case 'h':
1042	PrintHelp();
1043	return 0;
1044
1045	case 'l':
1046	if (strcmp(optarg, "cpp") == 0) {
1047	flag.lang = lang_e::Cpp;
1048
1049	} else if (strcmp(optarg, "py") == 0) {
1050	flag.lang = lang_e::Py;
1051
1052	} else if (strcmp(optarg, "shell") == 0) {
1053	flag.lang = lang_e::Shell;
1054
1055	} else if (strcmp(optarg, "asdl") == 0) {
1056	flag.lang = lang_e::Asdl;
1057
1058	} else if (strcmp(optarg, "R") == 0) {
1059	flag.lang = lang_e::R;
1060
1061	// TODO: implement all of these
1062	} else if (strcmp(optarg, "js") == 0) {
1063	flag.lang = lang_e::PlainText;
1064
1065	} else if (strcmp(optarg, "css") == 0) {
1066	flag.lang = lang_e::PlainText;
1067
1068	} else if (strcmp(optarg, "md") == 0) {
1069	flag.lang = lang_e::PlainText;
1070
1071	} else if (strcmp(optarg, "yaml") == 0) {
1072	flag.lang = lang_e::PlainText;
1073
1074	} else if (strcmp(optarg, "html") == 0) {
1075	flag.lang = lang_e::Html;
1076
1077	} else if (strcmp(optarg, "txt") == 0) {
1078	flag.lang = lang_e::PlainText;
1079
1080	} else if (strcmp(optarg, "other") == 0) {
1081	flag.lang = lang_e::PlainText;
1082
1083	} else {
1084	Log("Expected -l LANG to be "
1085	"cpp\|py\|shell\|asdl\|R\|js\|css\|md\|yaml\|html\|txt, "
1086	"got %s",
1087	optarg);
1088	return 2;
1089	}
1090	break;
1091
1092	case 'm':
1093	flag.more_color = true;
1094	break;
1095
1096	case 'o':
1097	flag.comments_only = true;
1098	break;
1099
1100	case 't':
1101	flag.tsv = true;
1102	break;
1103
1104	case 'w':
1105	flag.web = true;
1106	break;
1107
1108	case '?': // getopt library will print error
1109	return 2;
1110
1111	default:
1112	abort(); // should never happen
1113	}
1114	}
1115
1116	int a = optind; // index into argv
1117	flag.argv = argv + a;
1118	flag.argc = argc - a;
1119
1120	std::vector<char*> files; // filename, or nullptr for stdin
1121	if (flag.argc != 0) {
1122	for (int i = 0; i < flag.argc; ++i) {
1123	files.push_back(flag.argv[i]);
1124	}
1125	} else {
1126	files.push_back(nullptr); // stands for stdin
1127	}
1128
1129	Printer* pr; // for each file
1130	OutputStream* out; // the entire stream
1131
1132	if (flag.tsv) {
1133	pr = new TsvPrinter();
1134	out = new NetStringOutput(pr);
1135	} else if (flag.web) {
1136	pr = new HtmlPrinter();
1137	out = new NetStringOutput(pr);
1138	} else {
1139	pr = new AnsiPrinter(flag);
1140	out = new AnsiOutput(pr);
1141	}
1142
1143	Hook* hook = nullptr;
1144	if (flag.lang == lang_e::Cpp) {
1145	hook = new CppHook();
1146	} else {
1147	hook = new Hook(); // default hook
1148	}
1149
1150	int status = ScanFiles(flag, files, out, hook);
1151
1152	delete hook;
1153	delete pr;
1154	delete out;
1155
1156	return status;
1157	}