OILS / doctools / micro_syntax.cc View on Github | oils.pub

1139 lines, 726 significant
1// Micro Syntax
2//
3// See doctools/micro-syntax.md
4
5#include "micro_syntax.h" // requires -I $BASE_DIR
6
7#include <assert.h>
8#include <errno.h>
9#include <getopt.h>
10#include <stdarg.h> // va_list, etc.
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h> // free
14#include <string.h>
15
16#include <string>
17#include <vector>
18
// ANSI SGR escape sequences written to the terminal by the color printer.

// Text attributes
const char* RESET = "\x1b[0;0m";
const char* BOLD = "\x1b[1m";
const char* UNDERLINE = "\x1b[4m";
const char* REVERSE = "\x1b[7m";  // swaps foreground and background

// Foreground colors, codes 30-37
const char* BLACK = "\x1b[30m";
const char* RED = "\x1b[31m";
const char* GREEN = "\x1b[32m";
const char* YELLOW = "\x1b[33m";
const char* BLUE = "\x1b[34m";
const char* PURPLE = "\x1b[35m";
const char* CYAN = "\x1b[36m";
const char* WHITE = "\x1b[37m";

// A few foreground colors from the 90-97 range
const char* BLACK2 = "\x1b[90m";
const char* RED2 = "\x1b[91m";
const char* BLUE2 = "\x1b[94m";
// Writes a printf-style message to stderr and terminates it with a newline.
void Log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);

  fputc('\n', stderr);
}
44
// Which lexer to run over the input.  Selected with the -l flag.
enum class lang_e {
  PlainText,  // no language-specific syntax

  Cpp,    // also covers C
  Py,
  Shell,
  Ysh,    // ''' etc.
  Html,
  Asdl,
  R,      // uses # comments

  // Not implemented yet:
  // JS,  // uses // comments
};
58
// Reads a FILE* one line at a time with getline().
//
// Lines are handed out as NUL-terminated C strings, so we don't care about
// internal NUL bytes: this interface doesn't expose them.
//
// The Reader owns the line buffer but NOT the FILE*; it never closes f.
class Reader {
 public:
  // f:        stream to read from
  // filename: used for error messages only, nullptr for stdin
  Reader(FILE* f, const char* filename)
      : f_(f),
        filename_(filename),
        line_(nullptr),
        allocated_size_(0),
        err_num_(0) {
  }

  // Releases the line buffer if the reader is destroyed before EOF.
  ~Reader() {
    free(line_);
  }

  // The buffer has a single owner; a copy would free it twice.
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;

  const char* Filename() {  // for error messages only, nullptr for stdin
    return filename_;
  }

  // Advances to the next line.
  //
  // Returns false if there was an I/O error, and sets err_num_.
  // Returns true otherwise; Current() is then the line, or nullptr on EOF.
  bool NextLine() {
    // getline() returns -1 for both EOF and errors, and only sets errno for
    // the latter.  Clear errno first so a stale value left behind by an
    // unrelated call isn't mistaken for an I/O error at EOF.
    errno = 0;

    // Note: getline() reuses / reallocates the previous line, so we don't
    // have to free it between calls
    ssize_t len = getline(&line_, &allocated_size_, f_);

    if (len < 0) {  // EOF is -1
      int saved_errno = errno;  // read it before free() can change it

      // man page says the buffer should be freed if getline() fails
      free(line_);
      line_ = nullptr;  // tell the caller not to continue
      allocated_size_ = 0;

      if (saved_errno != 0) {  // I/O error
        err_num_ = saved_errno;
        return false;
      }
    }
    return true;
  }

  // Returns the line read by the last NextLine(), or nullptr on EOF / error.
  char* Current() {
    return line_;
  }

  FILE* f_;
  const char* filename_;

  char* line_;  // valid for one NextLine() call, nullptr on EOF or error
  size_t allocated_size_;  // only used by getline() to track the buffer size
  int err_num_;            // errno of the last failed NextLine(), else 0
};
104
105class Printer {
106 public:
107 virtual void PrintLineNumber(int line_num) = 0;
108 virtual void PrintLineEnd() {
109 }
110 virtual void PrintToken(const char* line, int line_num, int start_col,
111 Token token) = 0;
112 virtual void Swap(std::string* s) {
113 assert(0);
114 }
115 virtual ~Printer() {
116 }
117};
118
119class HtmlPrinter : public Printer {
120 public:
121 HtmlPrinter() : Printer(), out_() {
122 }
123
124 virtual void Swap(std::string* s) {
125 // assert(s != nullptr);
126 out_.swap(*s);
127 }
128
129 virtual void PrintLineNumber(int line_num) {
130 char buf[16];
131 snprintf(buf, 16, "%d", line_num);
132
133 out_.append("<tr><td class=num>"); // <tr> closed by PrintLineEnd()
134 out_.append(buf);
135 out_.append("</td><td id=L"); // jump to line with foo.html#L32
136 out_.append(buf);
137 out_.append(" class=line>"); // <td> closed by PrintLineEnd()
138 }
139
140 virtual void PrintLineEnd() {
141 out_.append("</td></tr>");
142 }
143
144 virtual void PrintToken(const char* line, int line_num, int start_col,
145 Token tok) {
146 const char* p_start = line + start_col;
147 int num_bytes = tok.end_col - start_col;
148
149 switch (tok.id) {
150 case Id::Comm:
151 PrintSpan("comm", p_start, num_bytes);
152 break;
153
154 case Id::Name:
155 PrintEscaped(p_start, num_bytes);
156 break;
157
158 case Id::PreprocCommand:
159 case Id::LineCont:
160 PrintSpan("preproc", p_start, num_bytes);
161 break;
162
163 case Id::Re2c:
164 PrintSpan("re2c", p_start, num_bytes);
165 break;
166
167 case Id::Other:
168 // PrintSpan("other", p_start, num_bytes);
169 PrintEscaped(p_start, num_bytes);
170 break;
171
172 // for now these are strings
173 case Id::HereBegin:
174 case Id::HereEnd:
175 case Id::Str:
176 PrintSpan("str", p_start, num_bytes);
177 break;
178
179 case Id::LBrace:
180 case Id::RBrace:
181 PrintSpan("brace", p_start, num_bytes);
182 break;
183
184 case Id::Unknown:
185 PrintSpan("x", p_start, num_bytes);
186 break;
187
188 default:
189 PrintEscaped(p_start, num_bytes);
190 break;
191 }
192 }
193
194 private:
195 void PrintEscaped(const char* s, int len) {
196 // HTML escape the code string
197 for (int i = 0; i < len; ++i) {
198 char c = s[i];
199
200 switch (c) {
201 case '<':
202 out_.append("&lt;");
203 break;
204 case '>':
205 out_.append("&gt;");
206 break;
207 case '&':
208 out_.append("&amp;");
209 break;
210 default:
211 // Is this inefficient? Fill 1 char
212 out_.append(1, s[i]);
213 break;
214 }
215 }
216 }
217
218 void PrintSpan(const char* css_class, const char* s, int len) {
219 out_.append("<span class=");
220 out_.append(css_class);
221 out_.append(">");
222
223 PrintEscaped(s, len);
224
225 out_.append("</span>");
226 }
227
228 std::string out_;
229};
230
231struct Flags {
232 lang_e lang;
233 bool tsv;
234 bool web;
235 bool more_color;
236 bool comments_only;
237
238 int argc;
239 char** argv;
240};
241
242class AnsiPrinter : public Printer {
243 public:
244 AnsiPrinter(const Flags& flag) : Printer(), flag_(flag) {
245 }
246
247 virtual void PrintLineNumber(int line_num) {
248 if (flag_.comments_only) {
249 return;
250 }
251 printf("%s%5d%s ", BLACK2, line_num, RESET);
252 }
253
254 virtual void PrintToken(const char* line, int line_num, int start_col,
255 Token tok) {
256 const char* p_start = line + start_col;
257 int num_bytes = tok.end_col - start_col;
258 switch (tok.id) {
259 case Id::Comm:
260 if (flag_.comments_only) {
261 PrintAlways(p_start, num_bytes);
262 } else {
263 PrintColor(BLUE, p_start, num_bytes);
264 }
265 break;
266
267 case Id::Name:
268 PrintText(p_start, num_bytes);
269 break;
270
271 case Id::PreprocCommand:
272 case Id::LineCont:
273 PrintColor(PURPLE, p_start, num_bytes);
274 break;
275
276 case Id::Re2c:
277 PrintColor(PURPLE, p_start, num_bytes);
278 break;
279
280 case Id::Other:
281 if (flag_.more_color) {
282 PrintColor(PURPLE, p_start, num_bytes);
283 } else {
284 PrintText(p_start, num_bytes);
285 }
286 break;
287
288 case Id::WS:
289 if (flag_.more_color) {
290 fputs(REVERSE, stdout);
291 PrintColor(WHITE, p_start, num_bytes);
292 } else {
293 PrintText(p_start, num_bytes);
294 }
295 break;
296
297 case Id::Str:
298 PrintColor(RED, p_start, num_bytes);
299 break;
300
301 case Id::HereBegin:
302 case Id::HereEnd: {
303 PrintColor(RED2, p_start, num_bytes);
304
305 // Debug submatch extraction
306#if 0
307 fputs(RED, stdout);
308 int n = tok.submatch_len;
309 fwrite(tok.submatch_start, 1, n, stdout);
310 fputs(RESET, stdout);
311#endif
312 } break;
313
314 case Id::DelimStrBegin:
315 case Id::DelimStrEnd: {
316 PrintColor(RED2, p_start, num_bytes);
317
318 // Debug submatch extraction
319#if 0
320 fputs(RED, stdout);
321 int n = tok.submatch_len;
322 fwrite(tok.submatch_start, 1, n, stdout);
323 fputs(RESET, stdout);
324#endif
325 } break;
326
327 case Id::LBrace:
328 case Id::RBrace:
329 PrintColor(GREEN, p_start, num_bytes);
330 break;
331
332 case Id::StartTag:
333 case Id::EndTag:
334 PrintColor(PURPLE, p_start, num_bytes);
335 break;
336
337 case Id::StartEndTag:
338 PrintColor(RED2, p_start, num_bytes);
339 break;
340
341 case Id::Unknown:
342 // Make errors red
343 fputs(REVERSE, stdout);
344 PrintColor(RED, p_start, num_bytes);
345 break;
346
347 default:
348 PrintText(p_start, num_bytes);
349 break;
350 }
351 }
352
353 private:
354 void PrintColor(const char* color, const char* s, int n) {
355 fputs(color, stdout);
356 PrintText(s, n);
357 fputs(RESET, stdout);
358 }
359
360 void PrintText(const char* s, int n) {
361 if (flag_.comments_only) {
362 for (int i = 0; i < n; ++i) {
363 // Replace everything but newline with space
364 // TODO: I think we always want a newline token, including in comments.
365 // That will simplify this.
366 char c = (s[i] == '\n') ? '\n' : ' ';
367 fwrite(&c, 1, 1, stdout);
368 }
369 } else {
370 fwrite(s, 1, n, stdout);
371 }
372 }
373
374 void PrintAlways(const char* s, int n) {
375 fwrite(s, 1, n, stdout);
376 }
377
378 const Flags& flag_;
379};
380
381const char* Id_str(Id id) {
382 switch (id) {
383 case Id::Comm:
384 return "Comm";
385 case Id::MaybeComment: // fix-up doesn't guarantee this is gone
386 return "MaybeComment";
387 case Id::WS:
388 return "WS";
389 case Id::Re2c:
390 return "Re2c";
391
392 case Id::MaybePreproc: // fix-up doesn't guarantee this is gone
393 return "MaybePreproc";
394 case Id::PreprocCommand:
395 return "PreprocCommand";
396 case Id::PreprocOther:
397 return "PreprocOther";
398 case Id::LineCont:
399 return "LineCont";
400
401 case Id::Name:
402 return "Name";
403 case Id::Other:
404 return "Other";
405
406 case Id::Str:
407 return "Str";
408
409 case Id::HereBegin:
410 return "HereBegin";
411 case Id::HereEnd:
412 return "HereEnd";
413 case Id::DelimStrBegin:
414 return "DelimStrBegin";
415 case Id::DelimStrEnd:
416 return "DelimStrEnd";
417
418 case Id::LBrace:
419 return "LBrace";
420 case Id::RBrace:
421 return "RBrace";
422
423 case Id::Unknown:
424 return "Unknown";
425 default:
426 assert(0);
427 }
428}
429
430class TsvPrinter : public Printer {
431 public:
432 virtual void PrintLineNumber(int line_num) {
433 ;
434 }
435
436 virtual void Swap(std::string* s) {
437 // out_.swap(*s);
438 }
439
440 virtual void PrintToken(const char* line, int line_num, int start_col,
441 Token tok) {
442 printf("%d\t%s\t%d\t%d\n", line_num, Id_str(tok.id), start_col,
443 tok.end_col);
444 // printf(" -> mode %d\n", lexer.line_mode);
445 }
446 virtual ~TsvPrinter() {
447 }
448};
449
450bool TokenIsSignificant(Id id) {
451 switch (id) {
452 case Id::Name:
453 case Id::Other:
454 case Id::PreprocCommand:
455 case Id::PreprocOther:
456 case Id::Re2c:
457 return true;
458
459 // Comments, whitespace, and string literals aren't significant
460 // TODO: can abort on Id::Unknown?
461 default:
462 break;
463 }
464 return false;
465}
466
467class OutputStream {
468 // stdout contains either
469 // - netstrings of HTML, or TSV Token structs
470 // - ANSI text
471
472 public:
473 OutputStream(Printer* pr) : pr_(pr) {
474 }
475 virtual void PathBegin(const char* path) = 0;
476 virtual void Line(int line_num, const char* line,
477 const std::vector<Token>& tokens) = 0;
478 virtual void PathEnd(int num_lines, int num_sig_lines) = 0;
479 virtual ~OutputStream() {
480 }
481
482 protected:
483 Printer* pr_; // how to print each file
484};
485
486class NetStringOutput : public OutputStream {
487 public:
488 NetStringOutput(Printer* pr) : OutputStream(pr) {
489 }
490
491 virtual void PathBegin(const char* path) {
492 if (path == nullptr) {
493 path = "<stdin>";
494 }
495 PrintNetString(path, strlen(path));
496 }
497
498 virtual void Line(int line_num, const char* line,
499 const std::vector<Token>& tokens) {
500 pr_->PrintLineNumber(line_num);
501
502 int start_col = 0;
503 for (auto tok : tokens) {
504 pr_->PrintToken(line, line_num, start_col, tok);
505 start_col = tok.end_col;
506 }
507
508 pr_->PrintLineEnd();
509 }
510
511 virtual void PathEnd(int num_lines, int num_sig_lines) {
512 std::string string_for_file;
513 pr_->Swap(&string_for_file);
514
515 PrintNetString(string_for_file.c_str(), string_for_file.size());
516
517 // Output summary in JSON
518 // TODO: change this to a 4th column
519 char buf[64];
520 int n = snprintf(buf, 64, "{\"num_lines\": %d, \"num_sig_lines\": %d}",
521 num_lines, num_sig_lines);
522 PrintNetString(buf, n);
523 }
524
525 private:
526 void PrintNetString(const char* s, int len) {
527 fprintf(stdout, "%d:%*s,", len, len, s);
528 }
529};
530
531class AnsiOutput : public OutputStream {
532 public:
533 AnsiOutput(Printer* pr) : OutputStream(pr) {
534 }
535
536 // TODO: Can respect --comments-only
537 virtual void PathBegin(const char* path) {
538 if (path == nullptr) {
539 path = "<stdin>";
540 }
541 // diff uses +++ ---
542 printf("\n");
543 printf("=== %s%s%s%s ===\n", BOLD, PURPLE, path, RESET);
544 printf("\n");
545 }
546
547 virtual void Line(int line_num, const char* line,
548 const std::vector<Token>& tokens) {
549 pr_->PrintLineNumber(line_num);
550
551 int start_col = 0;
552 for (auto tok : tokens) {
553 pr_->PrintToken(line, line_num, start_col, tok);
554 start_col = tok.end_col;
555 }
556
557 pr_->PrintLineEnd();
558 };
559
560 // TODO: Can respect --comments-only
561 virtual void PathEnd(int num_lines, int num_sig_lines) {
562 fprintf(stdout, "%s%d lines, %d significant%s\n", GREEN, num_lines,
563 num_sig_lines, RESET);
564 };
565};
566
567void PrintTokens(std::vector<Token>& toks) {
568 int start_col = 0;
569 int i = 0;
570 Log("===");
571 for (auto tok : toks) {
572 Log("%2d %10s %2d %2d", i, Id_str(tok.id), start_col, tok.end_col);
573 start_col = tok.end_col;
574 ++i;
575 }
576 Log("===");
577}
578
579// BUGGY, needs unit tests
580
581// Fiddly function, reduces the size of the output a bit
582// "hi" becomes 1 Id::DQ token instead of 3 separate Id::DQ tokens
583void Optimize(std::vector<Token>* tokens) {
584 std::vector<Token>& toks = *tokens; // alias
585
586 // PrintTokens(toks);
587
588 int n = toks.size();
589 if (n < 1) { // nothing to de-duplicate
590 return;
591 }
592
593 int left = 0;
594 int right = 1;
595 while (right < n) {
596 Log("right ID = %s, end %d", Id_str(toks[right].id), toks[right].end_col);
597
598 if (toks[left].id == toks[right].id) {
599 // Join the tokens together
600 toks[left].end_col = toks[right].end_col;
601 } else {
602 toks[left] = toks[right];
603 left++;
604 Log(" not eq, left = %d", left);
605 }
606 right++;
607 }
608 Log("left = %d, right = %d", left, right);
609
610 // Fiddly condition: one more iteration. Need some unit tests for this.
611 toks[left] = toks[right - 1];
612 left++;
613 assert(left <= n);
614
615 // Erase the remaining ones
616 toks.resize(left);
617
618 // PrintTokens(toks);
619}
620
621// Version of the above that's not in-place, led to a bug fix
622void Optimize2(std::vector<Token>* tokens) {
623 std::vector<Token> optimized;
624
625 int n = tokens->size();
626 if (n < 1) {
627 return;
628 }
629
630 optimized.reserve(n);
631
632 int left = 0;
633 int right = 1;
634 while (right < n) {
635 optimized.push_back((*tokens)[left]);
636 left++;
637 right++;
638 }
639 optimized.push_back((*tokens)[left]);
640 left++;
641
642 tokens->swap(optimized);
643}
644
// Returns whether `line` terminates a here doc whose delimiter is here_delim.
//
// The line matches if it consists of
//   - optional leading tabs,
//   - the delimiter itself,
//   - optional trailing whitespace (space, tab, \r, \n), and nothing else.
//
// So with delimiter EOF, all of "EOF\n", "\tEOF\n", "EOF \r\n" and "EOF" (the
// last line of a file without a trailing newline) match, but "EOFx\n" and
// " EOF\n" don't.
bool LineEqualsHereDelim(const char* line, std::string& here_delim) {
  // Hack: skip leading tabs unconditionally, even though that's only allowed
  // in <<-   Really we should capture the operator and the delim?
  // <<- strips ALL leading tabs, not just the first one.
  while (*line == '\t') {
    line++;
  }

  // Log("Here delim=%s line=%s", here_delim.c_str(), line);

  // The line must start with the delimiter.  strncmp() stops at the line's
  // NUL terminator, so a line that is too short simply compares unequal.
  const size_t h = here_delim.size();
  if (strncmp(line, here_delim.c_str(), h) != 0) {
    return false;
  }

  // Whatever follows the delimiter must be whitespace up to the end of the
  // line.  There may be nothing at all, when the file doesn't end with a
  // newline.
  const char* rest = line + h;
  return rest[strspn(rest, " \t\r\n")] == '\0';
}
689
690void CppHook::TryPreprocess(char* line, std::vector<Token>* tokens) {
691 // Fills tokens, which can be checked for beginning and end tokens
692
693 Lexer<pp_mode_e> lexer(line);
694 Matcher<pp_mode_e> matcher;
695
696 while (true) { // tokens on each line
697 Token tok;
698 // Log("Match %d", lexer.p_current - lexer.line_);
699 bool eol = matcher.Match(&lexer, &tok);
700 // Log("EOL %d", eol);
701 if (eol) {
702 break;
703 }
704 // Log("TOK %s %d", Id_str(tok.id), tok.end_col);
705 tokens->push_back(tok); // make a copy
706 }
707}
708
709void FixShellComments(std::vector<Token>& tokens) {
710 int n = tokens.size();
711 for (int i = 0; i < n; ++i) {
712 // # comment at start of line
713 if (tokens[i].id == Id::MaybeComment) {
714 if (i == 0) {
715 tokens[i].id = Id::Comm;
716 }
717 if (i != 0 and tokens[i - 1].id == Id::WS) {
718 tokens[i].id = Id::Comm;
719 }
720 }
721 }
722}
723
724// This templated method causes some code expansion, but not too much. The
725// binary went from 38 KB to 42 KB, after being stripped.
726// We get a little type safety with py_mode_e vs cpp_mode_e.
727
728template <typename T>
729int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
730 Lexer<T> lexer(nullptr);
731 Matcher<T> matcher;
732
733 int line_num = 1;
734 int num_sig = 0;
735
736 std::vector<std::string> here_list; // delimiters to pop
737 std::vector<int> here_start_num;
738
739 // For multi-line strings. This has 0 or 1 entries, and the 1 entry can be
740 // the empty string.
741 std::vector<std::string> delim_begin;
742
743 while (true) { // read each line, handling errors
744 if (!reader->NextLine()) {
745 const char* name = reader->Filename() ?: "<stdin>";
746 Log("micro-syntax: getline() error on %s: %s", name,
747 strerror(reader->err_num_));
748 return 1;
749 }
750 char* line = reader->Current();
751 if (line == nullptr) {
752 break; // EOF
753 }
754
755 std::vector<Token> pre_tokens;
756
757 hook->TryPreprocess(line, &pre_tokens);
758
759 // e.g #define at beginning of line
760 if (pre_tokens.size() && pre_tokens[0].id == Id::MaybePreproc) {
761 pre_tokens[0].id = Id::PreprocCommand;
762
763 out->Line(line_num, line, pre_tokens);
764
765 line_num += 1;
766 num_sig += 1;
767
768 Token last = pre_tokens.back();
769 while (last.id == Id::LineCont) {
770 const char* blame = reader->Filename() ?: "<stdin>";
771 if (!reader->NextLine()) {
772 Log("micro-syntax: getline() error on %s: %s", blame,
773 strerror(reader->err_num_));
774 return 1;
775 }
776 char* line = reader->Current();
777 if (line == nullptr) {
778 Log("Unexpected end-of-file in preprocessor in %s", blame);
779 return 1;
780 }
781
782 pre_tokens.clear();
783 hook->TryPreprocess(line, &pre_tokens);
784
785 out->Line(line_num, line, pre_tokens);
786
787 line_num += 1;
788 num_sig += 1;
789
790 last = pre_tokens.back();
791 }
792 continue; // Skip the rest of the loop
793 }
794
795 //
796 // Main Loop for "normal" lines (not preprocessor or here doc)
797 //
798
799 std::vector<Token> tokens;
800 lexer.SetLine(line);
801
802 bool line_is_sig = false;
803 while (true) { // tokens on each line
804 Token tok;
805 bool eol = matcher.Match(&lexer, &tok);
806 if (eol) {
807 break;
808 }
809
810 switch (tok.id) {
811 case Id::HereBegin: {
812 // Put a copy on the stack
813 int n = tok.submatch_end - tok.submatch_start;
814 here_list.emplace_back(line + tok.submatch_start, n);
815 here_start_num.push_back(line_num);
816 } break;
817
818 case Id::DelimStrBegin: {
819 if (delim_begin.empty()) {
820 int n = tok.submatch_end - tok.submatch_start;
821 delim_begin.emplace_back(line + tok.submatch_start, n);
822 } else {
823 // We have entered cpp_mode_e::DelimStr, which means we should never
824 // return another DelimStrBegin
825 assert(0);
826 }
827 } break;
828
829 case Id::DelimStrEnd: {
830 if (delim_begin.empty()) {
831 // We should never get this unless we got a DelimStrBegin first
832 assert(0);
833 } else {
834 size_t n = tok.submatch_end - tok.submatch_start;
835 std::string end_delim(line + tok.submatch_start, n);
836
837 if (end_delim == delim_begin.back()) {
838 lexer.line_mode = T::Outer; // the string is ended
839 delim_begin.pop_back();
840 } else {
841 tok.id = Id::Str; // mismatched delimiter is just a string
842 }
843 }
844 } break;
845
846 default:
847 break;
848 }
849
850 tokens.push_back(tok); // make a copy
851
852 if (TokenIsSignificant(tok.id)) {
853 line_is_sig = true;
854 }
855 }
856
857#if 0
858 PrintTokens(tokens);
859 Log("%d tokens before", tokens.size());
860 Optimize(&tokens);
861 Log("%d tokens after", tokens.size());
862 PrintTokens(tokens);
863#endif
864
865 FixShellComments(tokens);
866
867 out->Line(line_num, line, tokens);
868 tokens.clear();
869
870 // Potentially multiple here docs for this line
871 int here_index = 0;
872 for (auto here_delim : here_list) {
873 // Log("HERE %s", here_delim.c_str());
874
875 while (true) {
876 const char* blame = reader->Filename() ?: "<stdin>";
877 if (!reader->NextLine()) {
878 Log("micro-syntax: getline() error on %s: %s", blame,
879 strerror(reader->err_num_));
880 return 1;
881 }
882 char* line = reader->Current();
883 if (line == nullptr) {
884 int start_line = here_start_num[here_index];
885 Log("Unexpected end-of-file in here doc in %s, start line %d", blame,
886 start_line);
887 return 1;
888 }
889
890 line_num++;
891
892 if (LineEqualsHereDelim(line, here_delim)) {
893 int n = strlen(line);
894 Token whole_line(Id::HereEnd, n);
895 tokens.push_back(whole_line);
896 out->Line(line_num, line, tokens);
897 tokens.clear();
898 break;
899
900 } else {
901 int n = strlen(line);
902 Token whole_line(Id::Str, n);
903 tokens.push_back(whole_line);
904 out->Line(line_num, line, tokens);
905 tokens.clear();
906
907 // Log(" not equal: %s", line);
908 }
909 }
910 here_index++;
911 }
912 here_list.clear();
913 here_start_num.clear();
914
915 line_num++;
916 num_sig += line_is_sig;
917 }
918
919 out->PathEnd(line_num - 1, num_sig);
920 return 0;
921}
922
923int ScanFiles(const Flags& flag, std::vector<char*> files, OutputStream* out,
924 Hook* hook) {
925 Reader* reader = nullptr;
926
927 int status = 0;
928 for (auto path : files) {
929 FILE* f;
930 if (path == nullptr) {
931 f = stdin;
932 } else {
933 f = fopen(path, "r");
934 if (f == nullptr) {
935 Log("Error opening %s: %s", path, strerror(errno));
936 return 1;
937 }
938 }
939 out->PathBegin(path);
940
941 reader = new Reader(f, path);
942
943 switch (flag.lang) {
944 case lang_e::PlainText:
945 status = ScanOne<text_mode_e>(reader, out, hook);
946 break;
947
948 case lang_e::Py:
949 status = ScanOne<py_mode_e>(reader, out, hook);
950 break;
951
952 case lang_e::Cpp:
953 status = ScanOne<cpp_mode_e>(reader, out, hook);
954 break;
955
956 case lang_e::Shell:
957 status = ScanOne<sh_mode_e>(reader, out, hook);
958 break;
959
960 case lang_e::Asdl:
961 status = ScanOne<asdl_mode_e>(reader, out, hook);
962 break;
963
964 case lang_e::R:
965 status = ScanOne<R_mode_e>(reader, out, hook);
966 break;
967
968 case lang_e::Html:
969 status = ScanOne<html_mode_e>(reader, out, hook);
970 break;
971
972 default:
973 assert(0);
974 }
975
976 delete reader;
977
978 if (path == nullptr) {
979 ;
980 } else {
981 fclose(f);
982 }
983
984 if (status != 0) {
985 break;
986 }
987 }
988
989 return status;
990}
991
// Prints the usage message to stdout.
void PrintHelp() {
  puts(R"(Usage: micro-syntax FLAGS* FILE*

Recognizes the syntax of each file, and prints it to stdout.

If there are no files, reads stdin.

Flags:
 -h --help This help

 -l --lang Language: py|cpp|shell|...
 -t Print tokens as TSV, instead of ANSI color
 -w Print HTML for the web

 -m More color, useful for debugging tokens

 -n --no-comments Omit comments
 -o --comments-only Only print comments
 -e --empty-strs Substitute string literals for empty strings
 --color on off always more

)");
}
1015
1016int main(int argc, char** argv) {
1017 Flags flag = {lang_e::PlainText};
1018
1019 // http://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
1020 // + means to be strict about flag parsing.
1021 int c;
1022 while ((c = getopt(argc, argv, "+hl:motw")) != -1) {
1023 switch (c) {
1024 case 'h':
1025 PrintHelp();
1026 return 0;
1027
1028 case 'l':
1029 if (strcmp(optarg, "cpp") == 0) {
1030 flag.lang = lang_e::Cpp;
1031
1032 } else if (strcmp(optarg, "py") == 0) {
1033 flag.lang = lang_e::Py;
1034
1035 } else if (strcmp(optarg, "shell") == 0) {
1036 flag.lang = lang_e::Shell;
1037
1038 } else if (strcmp(optarg, "asdl") == 0) {
1039 flag.lang = lang_e::Asdl;
1040
1041 } else if (strcmp(optarg, "R") == 0) {
1042 flag.lang = lang_e::R;
1043
1044 // TODO: implement all of these
1045 } else if (strcmp(optarg, "js") == 0) {
1046 flag.lang = lang_e::PlainText;
1047
1048 } else if (strcmp(optarg, "css") == 0) {
1049 flag.lang = lang_e::PlainText;
1050
1051 } else if (strcmp(optarg, "md") == 0) {
1052 flag.lang = lang_e::PlainText;
1053
1054 } else if (strcmp(optarg, "yaml") == 0) {
1055 flag.lang = lang_e::PlainText;
1056
1057 } else if (strcmp(optarg, "html") == 0) {
1058 flag.lang = lang_e::Html;
1059
1060 } else if (strcmp(optarg, "txt") == 0) {
1061 flag.lang = lang_e::PlainText;
1062
1063 } else if (strcmp(optarg, "other") == 0) {
1064 flag.lang = lang_e::PlainText;
1065
1066 } else {
1067 Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|html|txt, "
1068 "got %s",
1069 optarg);
1070 return 2;
1071 }
1072 break;
1073
1074 case 'm':
1075 flag.more_color = true;
1076 break;
1077
1078 case 'o':
1079 flag.comments_only = true;
1080 break;
1081
1082 case 't':
1083 flag.tsv = true;
1084 break;
1085
1086 case 'w':
1087 flag.web = true;
1088 break;
1089
1090 case '?': // getopt library will print error
1091 return 2;
1092
1093 default:
1094 abort(); // should never happen
1095 }
1096 }
1097
1098 int a = optind; // index into argv
1099 flag.argv = argv + a;
1100 flag.argc = argc - a;
1101
1102 std::vector<char*> files; // filename, or nullptr for stdin
1103 if (flag.argc != 0) {
1104 for (int i = 0; i < flag.argc; ++i) {
1105 files.push_back(flag.argv[i]);
1106 }
1107 } else {
1108 files.push_back(nullptr); // stands for stdin
1109 }
1110
1111 Printer* pr; // for each file
1112 OutputStream* out; // the entire stream
1113
1114 if (flag.tsv) {
1115 pr = new TsvPrinter();
1116 out = new NetStringOutput(pr);
1117 } else if (flag.web) {
1118 pr = new HtmlPrinter();
1119 out = new NetStringOutput(pr);
1120 } else {
1121 pr = new AnsiPrinter(flag);
1122 out = new AnsiOutput(pr);
1123 }
1124
1125 Hook* hook = nullptr;
1126 if (flag.lang == lang_e::Cpp) {
1127 hook = new CppHook();
1128 } else {
1129 hook = new Hook(); // default hook
1130 }
1131
1132 int status = ScanFiles(flag, files, out, hook);
1133
1134 delete hook;
1135 delete pr;
1136 delete out;
1137
1138 return status;
1139}