OILS / doctools / micro_syntax.cc View on Github | oils.pub

1157 lines, 733 significant
1// Micro Syntax
2//
3// See doctools/micro-syntax.md
4
5#include "micro_syntax.h" // requires -I $BASE_DIR
6
7#include <assert.h>
8#include <errno.h>
9#include <getopt.h>
10#include <stdarg.h> // va_list, etc.
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h> // free
14#include <string.h>
15
16#include <string>
17#include <vector>
18
// ANSI escape sequences for styling terminal output.

// Text attributes
const char* RESET = "\x1b[0;0m";
const char* BOLD = "\x1b[1m";
const char* UNDERLINE = "\x1b[4m";
const char* REVERSE = "\x1b[7m";  // reverse video

// Foreground colors, codes 30-37
const char* BLACK = "\x1b[30m";
const char* RED = "\x1b[31m";
const char* GREEN = "\x1b[32m";
const char* YELLOW = "\x1b[33m";
const char* BLUE = "\x1b[34m";
const char* PURPLE = "\x1b[35m";
const char* CYAN = "\x1b[36m";
const char* WHITE = "\x1b[37m";

// Foreground colors, codes 90-97 (only the ones used here)
const char* BLACK2 = "\x1b[90m";
const char* RED2 = "\x1b[91m";
const char* BLUE2 = "\x1b[94m";
36
// Writes a printf-style formatted message to stderr, followed by a newline.
void Log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);

  fputc('\n', stderr);
}
44
// The languages that can be selected for scanning.  PlainText is first, so it
// is the value-initialized default.
enum class lang_e {
  PlainText,

  Cpp,  // including C
  Py,
  Shell,
  Ysh,  // ''' etc.
  Html,
  Asdl,
  R,  // uses # comments

  // JS,  // uses // comments
};
58
// Reads a FILE* one line at a time with getline().
//
// The FILE* is borrowed: this class never closes it.  The line buffer is owned
// by this object and is released on EOF, on error, and on destruction.
class Reader {
  // We don't care about internal NUL, so this interface doesn't allow it

 public:
  Reader(FILE* f, const char* filename)
      : f_(f),
        filename_(filename),
        line_(nullptr),
        allocated_size_(0),
        err_num_(0) {
  }

  // The object owns line_, so a copy would free it twice.
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;

  ~Reader() {
    free(line_);  // no-op when line_ is nullptr
  }

  const char* Filename() {  // for error messages only, nullptr for stdin
    return filename_;
  }

  bool NextLine() {
    // Returns false if there was an error, and sets err_num_.
    // Returns true if not error, and Current() can be checked.

    // Note: getline() reuses / reallocates the previous line, so we don't
    // have to free it between calls
    errno = 0;
    ssize_t len = getline(&line_, &allocated_size_, f_);
    if (len >= 0) {
      return true;
    }

    // getline() returns -1 for both EOF and errors.  Save errno first,
    // because free() is allowed to change it.
    int saved_errno = errno;

    // man page says the buffer should be freed if getline() fails
    free(line_);
    line_ = nullptr;  // tell the caller not to continue
    allocated_size_ = 0;

    // Ask the stream what happened, rather than trusting errno alone: errno
    // can be non-zero for reasons unrelated to this read.
    if (feof(f_) && !ferror(f_)) {
      return true;  // clean EOF
    }

    err_num_ = (saved_errno != 0) ? saved_errno : EIO;
    return false;
  }

  char* Current() {
    // Returns nullptr on EOF.
    return line_;
  }

  FILE* f_;
  const char* filename_;

  char* line_;  // valid for one NextLine() call, nullptr on EOF or error
  size_t allocated_size_;  // unused, but must pass address to getline()
  int err_num_;            // set on error, 0 otherwise
};
104
105class Printer {
106 public:
107 virtual void PrintLineNumber(int line_num) = 0;
108 virtual void PrintLineEnd() {
109 }
110 virtual void PrintToken(const char* line, int line_num, int start_col,
111 Token token) = 0;
112 virtual void Swap(std::string* s) {
113 assert(0);
114 }
115 virtual ~Printer() {
116 }
117};
118
119class HtmlPrinter : public Printer {
120 public:
121 HtmlPrinter() : Printer(), out_() {
122 }
123
124 virtual void Swap(std::string* s) {
125 // assert(s != nullptr);
126 out_.swap(*s);
127 }
128
129 virtual void PrintLineNumber(int line_num) {
130 char buf[16];
131 snprintf(buf, 16, "%d", line_num);
132
133 out_.append("<tr><td class=num>"); // <tr> closed by PrintLineEnd()
134 out_.append(buf);
135 out_.append("</td><td id=L"); // jump to line with foo.html#L32
136 out_.append(buf);
137 out_.append(" class=line>"); // <td> closed by PrintLineEnd()
138 }
139
140 virtual void PrintLineEnd() {
141 out_.append("</td></tr>");
142 }
143
144 virtual void PrintToken(const char* line, int line_num, int start_col,
145 Token tok) {
146 const char* p_start = line + start_col;
147 int num_bytes = tok.end_col - start_col;
148
149 switch (tok.id) {
150 case Id::Comm:
151 PrintSpan("comm", p_start, num_bytes);
152 break;
153
154 case Id::Name:
155 PrintEscaped(p_start, num_bytes);
156 break;
157
158 case Id::PreprocCommand:
159 case Id::LineCont:
160 PrintSpan("preproc", p_start, num_bytes);
161 break;
162
163 case Id::Re2c:
164 PrintSpan("re2c", p_start, num_bytes);
165 break;
166
167 case Id::Other:
168 // PrintSpan("other", p_start, num_bytes);
169 PrintEscaped(p_start, num_bytes);
170 break;
171
172 // for now these are strings
173 case Id::HereBegin:
174 case Id::HereEnd:
175 case Id::Str:
176 PrintSpan("str", p_start, num_bytes);
177 break;
178
179 case Id::LBrace:
180 case Id::RBrace:
181 PrintSpan("brace", p_start, num_bytes);
182 break;
183
184 case Id::Unknown:
185 PrintSpan("x", p_start, num_bytes);
186 break;
187
188 default:
189 PrintEscaped(p_start, num_bytes);
190 break;
191 }
192 }
193
194 private:
195 void PrintEscaped(const char* s, int len) {
196 // HTML escape the code string
197 for (int i = 0; i < len; ++i) {
198 char c = s[i];
199
200 switch (c) {
201 case '<':
202 out_.append("&lt;");
203 break;
204 case '>':
205 out_.append("&gt;");
206 break;
207 case '&':
208 out_.append("&amp;");
209 break;
210 default:
211 // Is this inefficient? Fill 1 char
212 out_.append(1, s[i]);
213 break;
214 }
215 }
216 }
217
218 void PrintSpan(const char* css_class, const char* s, int len) {
219 out_.append("<span class=");
220 out_.append(css_class);
221 out_.append(">");
222
223 PrintEscaped(s, len);
224
225 out_.append("</span>");
226 }
227
228 std::string out_;
229};
230
231struct Flags {
232 lang_e lang;
233 bool tsv;
234 bool web;
235 bool more_color;
236 bool comments_only;
237
238 int argc;
239 char** argv;
240};
241
242class AnsiPrinter : public Printer {
243 public:
244 AnsiPrinter(const Flags& flag) : Printer(), flag_(flag) {
245 }
246
247 virtual void PrintLineNumber(int line_num) {
248 if (flag_.comments_only) {
249 return;
250 }
251 printf("%s%5d%s ", BLACK2, line_num, RESET);
252 }
253
254 virtual void PrintToken(const char* line, int line_num, int start_col,
255 Token tok) {
256 const char* p_start = line + start_col;
257 int num_bytes = tok.end_col - start_col;
258 switch (tok.id) {
259 case Id::Comm:
260 if (flag_.comments_only) {
261 PrintAlways(p_start, num_bytes);
262 } else {
263 PrintColor(BLUE, p_start, num_bytes);
264 }
265 break;
266
267 case Id::Name:
268 PrintText(p_start, num_bytes);
269 break;
270
271 case Id::PreprocCommand:
272 case Id::LineCont:
273 PrintColor(PURPLE, p_start, num_bytes);
274 break;
275
276 case Id::Re2c:
277 PrintColor(PURPLE, p_start, num_bytes);
278 break;
279
280 case Id::Other:
281 if (flag_.more_color) {
282 PrintColor(PURPLE, p_start, num_bytes);
283 } else {
284 PrintText(p_start, num_bytes);
285 }
286 break;
287
288 case Id::WS:
289 if (flag_.more_color) {
290 fputs(REVERSE, stdout);
291 PrintColor(WHITE, p_start, num_bytes);
292 } else {
293 PrintText(p_start, num_bytes);
294 }
295 break;
296
297 case Id::Str:
298 PrintColor(RED, p_start, num_bytes);
299 break;
300
301 case Id::HereBegin:
302 case Id::HereEnd: {
303 PrintColor(RED2, p_start, num_bytes);
304
305 // Debug submatch extraction
306#if 0
307 fputs(RED, stdout);
308 int n = tok.submatch_len;
309 fwrite(tok.submatch_start, 1, n, stdout);
310 fputs(RESET, stdout);
311#endif
312 } break;
313
314 case Id::DelimStrBegin:
315 case Id::DelimStrEnd: {
316 PrintColor(RED2, p_start, num_bytes);
317
318 // Debug submatch extraction
319#if 0
320 fputs(RED, stdout);
321 int n = tok.submatch_len;
322 fwrite(tok.submatch_start, 1, n, stdout);
323 fputs(RESET, stdout);
324#endif
325 } break;
326
327 case Id::LBrace:
328 case Id::RBrace:
329 PrintColor(GREEN, p_start, num_bytes);
330 break;
331
332 case Id::TagNameLeft:
333 case Id::TagNameRight:
334 PrintColor(PURPLE, p_start, num_bytes);
335 break;
336
337 case Id::SelfClose:
338 case Id::EndTag:
339 PrintColor(RED2, p_start, num_bytes);
340 break;
341
342 case Id::CharEscape:
343 PrintColor(BLUE, p_start, num_bytes);
344 break;
345
346 case Id::Unknown:
347 case Id::BadAmpersand:
348 case Id::BadGreaterThan:
349 case Id::BadLessThan:
350 // Make errors red
351 fputs(REVERSE, stdout);
352 PrintColor(RED, p_start, num_bytes);
353 break;
354
355 default:
356 PrintText(p_start, num_bytes);
357 break;
358 }
359 }
360
361 private:
362 void PrintColor(const char* color, const char* s, int n) {
363 fputs(color, stdout);
364 PrintText(s, n);
365 fputs(RESET, stdout);
366 }
367
368 void PrintText(const char* s, int n) {
369 if (flag_.comments_only) {
370 for (int i = 0; i < n; ++i) {
371 // Replace everything but newline with space
372 // TODO: I think we always want a newline token, including in comments.
373 // That will simplify this.
374 char c = (s[i] == '\n') ? '\n' : ' ';
375 fwrite(&c, 1, 1, stdout);
376 }
377 } else {
378 fwrite(s, 1, n, stdout);
379 }
380 }
381
382 void PrintAlways(const char* s, int n) {
383 fwrite(s, 1, n, stdout);
384 }
385
386 const Flags& flag_;
387};
388
389const char* Id_str(Id id) {
390 switch (id) {
391 case Id::Comm:
392 return "Comm";
393 case Id::MaybeComment: // fix-up doesn't guarantee this is gone
394 return "MaybeComment";
395 case Id::WS:
396 return "WS";
397 case Id::Re2c:
398 return "Re2c";
399
400 case Id::MaybePreproc: // fix-up doesn't guarantee this is gone
401 return "MaybePreproc";
402 case Id::PreprocCommand:
403 return "PreprocCommand";
404 case Id::PreprocOther:
405 return "PreprocOther";
406 case Id::LineCont:
407 return "LineCont";
408
409 case Id::Name:
410 return "Name";
411 case Id::Other:
412 return "Other";
413
414 case Id::Str:
415 return "Str";
416
417 case Id::HereBegin:
418 return "HereBegin";
419 case Id::HereEnd:
420 return "HereEnd";
421 case Id::DelimStrBegin:
422 return "DelimStrBegin";
423 case Id::DelimStrEnd:
424 return "DelimStrEnd";
425
426 case Id::LBrace:
427 return "LBrace";
428 case Id::RBrace:
429 return "RBrace";
430
431 case Id::Unknown:
432 return "Unknown";
433 default:
434 assert(0);
435 }
436}
437
438class TsvPrinter : public Printer {
439 public:
440 virtual void PrintLineNumber(int line_num) {
441 ;
442 }
443
444 virtual void Swap(std::string* s) {
445 // out_.swap(*s);
446 }
447
448 virtual void PrintToken(const char* line, int line_num, int start_col,
449 Token tok) {
450 printf("%d\t%s\t%d\t%d\n", line_num, Id_str(tok.id), start_col,
451 tok.end_col);
452 // printf(" -> mode %d\n", lexer.line_mode);
453 }
454 virtual ~TsvPrinter() {
455 }
456};
457
458bool TokenIsSignificant(Id id) {
459 switch (id) {
460 case Id::Name:
461 case Id::Other:
462 case Id::PreprocCommand:
463 case Id::PreprocOther:
464 case Id::Re2c:
465 return true;
466
467 // Comments, whitespace, and string literals aren't significant
468 // TODO: can abort on Id::Unknown?
469 default:
470 break;
471 }
472 return false;
473}
474
475class OutputStream {
476 // stdout contains either
477 // - netstrings of HTML, or TSV Token structs
478 // - ANSI text
479
480 public:
481 OutputStream(Printer* pr) : pr_(pr) {
482 }
483 virtual void PathBegin(const char* path) = 0;
484 virtual void Line(int line_num, const char* line,
485 const std::vector<Token>& tokens) = 0;
486 virtual void PathEnd(int num_lines, int num_sig_lines) = 0;
487 virtual ~OutputStream() {
488 }
489
490 protected:
491 Printer* pr_; // how to print each file
492};
493
494class NetStringOutput : public OutputStream {
495 public:
496 NetStringOutput(Printer* pr) : OutputStream(pr) {
497 }
498
499 virtual void PathBegin(const char* path) {
500 if (path == nullptr) {
501 path = "<stdin>";
502 }
503 PrintNetString(path, strlen(path));
504 }
505
506 virtual void Line(int line_num, const char* line,
507 const std::vector<Token>& tokens) {
508 pr_->PrintLineNumber(line_num);
509
510 int start_col = 0;
511 for (auto tok : tokens) {
512 pr_->PrintToken(line, line_num, start_col, tok);
513 start_col = tok.end_col;
514 }
515
516 pr_->PrintLineEnd();
517 }
518
519 virtual void PathEnd(int num_lines, int num_sig_lines) {
520 std::string string_for_file;
521 pr_->Swap(&string_for_file);
522
523 PrintNetString(string_for_file.c_str(), string_for_file.size());
524
525 // Output summary in JSON
526 // TODO: change this to a 4th column
527 char buf[64];
528 int n = snprintf(buf, 64, "{\"num_lines\": %d, \"num_sig_lines\": %d}",
529 num_lines, num_sig_lines);
530 PrintNetString(buf, n);
531 }
532
533 private:
534 void PrintNetString(const char* s, int len) {
535 fprintf(stdout, "%d:%*s,", len, len, s);
536 }
537};
538
539class AnsiOutput : public OutputStream {
540 public:
541 AnsiOutput(Printer* pr) : OutputStream(pr) {
542 }
543
544 // TODO: Can respect --comments-only
545 virtual void PathBegin(const char* path) {
546 if (path == nullptr) {
547 path = "<stdin>";
548 }
549 // diff uses +++ ---
550 printf("\n");
551 printf("=== %s%s%s%s ===\n", BOLD, PURPLE, path, RESET);
552 printf("\n");
553 }
554
555 virtual void Line(int line_num, const char* line,
556 const std::vector<Token>& tokens) {
557 pr_->PrintLineNumber(line_num);
558
559 int start_col = 0;
560 for (auto tok : tokens) {
561 pr_->PrintToken(line, line_num, start_col, tok);
562 start_col = tok.end_col;
563 }
564
565 pr_->PrintLineEnd();
566 };
567
568 // TODO: Can respect --comments-only
569 virtual void PathEnd(int num_lines, int num_sig_lines) {
570 fprintf(stdout, "%s%d lines, %d significant%s\n", GREEN, num_lines,
571 num_sig_lines, RESET);
572 };
573};
574
575void PrintTokens(std::vector<Token>& toks) {
576 int start_col = 0;
577 int i = 0;
578 Log("===");
579 for (auto tok : toks) {
580 Log("%2d %10s %2d %2d", i, Id_str(tok.id), start_col, tok.end_col);
581 start_col = tok.end_col;
582 ++i;
583 }
584 Log("===");
585}
586
587// BUGGY, needs unit tests
588
589// Fiddly function, reduces the size of the output a bit
590// "hi" becomes 1 Id::DQ token instead of 3 separate Id::DQ tokens
591void Optimize(std::vector<Token>* tokens) {
592 std::vector<Token>& toks = *tokens; // alias
593
594 // PrintTokens(toks);
595
596 int n = toks.size();
597 if (n < 1) { // nothing to de-duplicate
598 return;
599 }
600
601 int left = 0;
602 int right = 1;
603 while (right < n) {
604 Log("right ID = %s, end %d", Id_str(toks[right].id), toks[right].end_col);
605
606 if (toks[left].id == toks[right].id) {
607 // Join the tokens together
608 toks[left].end_col = toks[right].end_col;
609 } else {
610 toks[left] = toks[right];
611 left++;
612 Log(" not eq, left = %d", left);
613 }
614 right++;
615 }
616 Log("left = %d, right = %d", left, right);
617
618 // Fiddly condition: one more iteration. Need some unit tests for this.
619 toks[left] = toks[right - 1];
620 left++;
621 assert(left <= n);
622
623 // Erase the remaining ones
624 toks.resize(left);
625
626 // PrintTokens(toks);
627}
628
629// Version of the above that's not in-place, led to a bug fix
630void Optimize2(std::vector<Token>* tokens) {
631 std::vector<Token> optimized;
632
633 int n = tokens->size();
634 if (n < 1) {
635 return;
636 }
637
638 optimized.reserve(n);
639
640 int left = 0;
641 int right = 1;
642 while (right < n) {
643 optimized.push_back((*tokens)[left]);
644 left++;
645 right++;
646 }
647 optimized.push_back((*tokens)[left]);
648 left++;
649
650 tokens->swap(optimized);
651}
652
// Returns whether `line` terminates a here doc whose delimiter is
// `here_delim`.
//
// The line must be the delimiter, optionally preceded by one tab, and
// followed by nothing but whitespace.  With the delimiter EOF, all of these
// match:
//
//   EOF\n    EOF\t\n    EOF\r\n    \tEOF\n    EOF
//
// The last one has no newline at all, which is how the final line of a file
// looks when the file doesn't end with a newline.
bool LineEqualsHereDelim(const char* line, std::string& here_delim) {
  // Hack: skip leading tab unconditionally, even though that's only allowed
  // in <<-  Really we should capture the operator and the delim?
  if (*line == '\t') {
    line++;
  }

  const size_t n = strlen(line);
  const size_t h = here_delim.size();

  // Log("Here delim=%s line=%s", here_delim.c_str(), line);

  if (n < h) {
    // Log("  [0] line too short");
    return false;
  }

  if (memcmp(line, here_delim.data(), h) != 0) {
    // Log("  [1] delimiter not equal");
    return false;
  }

  // Anything after the delimiter must be whitespace
  for (size_t i = h; i < n; ++i) {
    switch (line[i]) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      break;
    default:
      // Log("  [2] byte %d not whitespace", i);
      return false;
    }
  }

  return true;
}
697
698void CppHook::TryPreprocess(char* line, std::vector<Token>* tokens) {
699 // Fills tokens, which can be checked for beginning and end tokens
700
701 Lexer<pp_mode_e> lexer(line);
702 Matcher<pp_mode_e> matcher;
703
704 while (true) { // tokens on each line
705 Token tok;
706 // Log("Match %d", lexer.p_current - lexer.line_);
707 bool eol = matcher.Match(&lexer, &tok);
708 // Log("EOL %d", eol);
709 if (eol) {
710 break;
711 }
712 // Log("TOK %s %d", Id_str(tok.id), tok.end_col);
713 tokens->push_back(tok); // make a copy
714 }
715}
716
717void FixShellComments(std::vector<Token>& tokens) {
718 int n = tokens.size();
719 for (int i = 0; i < n; ++i) {
720 // # comment at start of line
721 if (tokens[i].id == Id::MaybeComment) {
722 if (i == 0) {
723 tokens[i].id = Id::Comm;
724 }
725 if (i != 0 and tokens[i - 1].id == Id::WS) {
726 tokens[i].id = Id::Comm;
727 }
728 }
729 }
730}
731
732// This templated method causes some code expansion, but not too much. The
733// binary went from 38 KB to 42 KB, after being stripped.
734// We get a little type safety with py_mode_e vs cpp_mode_e.
735
736template <typename T>
737int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
738 Lexer<T> lexer(nullptr);
739 Matcher<T> matcher;
740
741 int line_num = 1;
742 int num_sig = 0;
743
744 std::vector<std::string> here_list; // delimiters to pop
745 std::vector<int> here_start_num;
746
747 // For multi-line strings. This has 0 or 1 entries, and the 1 entry can be
748 // the empty string.
749 std::vector<std::string> delim_begin;
750
751 while (true) { // read each line, handling errors
752 if (!reader->NextLine()) {
753 const char* name = reader->Filename() ?: "<stdin>";
754 Log("micro-syntax: getline() error on %s: %s", name,
755 strerror(reader->err_num_));
756 return 1;
757 }
758 char* line = reader->Current();
759 if (line == nullptr) {
760 break; // EOF
761 }
762
763 std::vector<Token> pre_tokens;
764
765 hook->TryPreprocess(line, &pre_tokens);
766
767 // e.g #define at beginning of line
768 if (pre_tokens.size() && pre_tokens[0].id == Id::MaybePreproc) {
769 pre_tokens[0].id = Id::PreprocCommand;
770
771 out->Line(line_num, line, pre_tokens);
772
773 line_num += 1;
774 num_sig += 1;
775
776 Token last = pre_tokens.back();
777 while (last.id == Id::LineCont) {
778 const char* blame = reader->Filename() ?: "<stdin>";
779 if (!reader->NextLine()) {
780 Log("micro-syntax: getline() error on %s: %s", blame,
781 strerror(reader->err_num_));
782 return 1;
783 }
784 char* line = reader->Current();
785 if (line == nullptr) {
786 Log("Unexpected end-of-file in preprocessor in %s", blame);
787 return 1;
788 }
789
790 pre_tokens.clear();
791 hook->TryPreprocess(line, &pre_tokens);
792
793 out->Line(line_num, line, pre_tokens);
794
795 line_num += 1;
796 num_sig += 1;
797
798 last = pre_tokens.back();
799 }
800 continue; // Skip the rest of the loop
801 }
802
803 //
804 // Main Loop for "normal" lines (not preprocessor or here doc)
805 //
806
807 std::vector<Token> tokens;
808 lexer.SetLine(line);
809
810 bool line_is_sig = false;
811 while (true) { // tokens on each line
812 Token tok;
813 bool eol = matcher.Match(&lexer, &tok);
814 if (eol) {
815 break;
816 }
817
818 switch (tok.id) {
819 case Id::HereBegin: {
820 // Put a copy on the stack
821 int n = tok.submatch_end - tok.submatch_start;
822 here_list.emplace_back(line + tok.submatch_start, n);
823 here_start_num.push_back(line_num);
824 } break;
825
826 case Id::DelimStrBegin: {
827 if (delim_begin.empty()) {
828 int n = tok.submatch_end - tok.submatch_start;
829 delim_begin.emplace_back(line + tok.submatch_start, n);
830 } else {
831 // We have entered cpp_mode_e::DelimStr, which means we should never
832 // return another DelimStrBegin
833 assert(0);
834 }
835 } break;
836
837 case Id::DelimStrEnd: {
838 if (delim_begin.empty()) {
839 // We should never get this unless we got a DelimStrBegin first
840 assert(0);
841 } else {
842 size_t n = tok.submatch_end - tok.submatch_start;
843 std::string end_delim(line + tok.submatch_start, n);
844
845 if (end_delim == delim_begin.back()) {
846 lexer.line_mode = T::Outer; // the string is ended
847 delim_begin.pop_back();
848 } else {
849 tok.id = Id::Str; // mismatched delimiter is just a string
850 }
851 }
852 } break;
853
854 // TODO: I think we need a mode to escape into strstr(), for
855 // C++ - ending */
856 // HTML - ending --> ?> ]]> </SCRipt>
857 //
858 // So instead of returning 'eol', we can return a string to search for?
859 // Then we keep looking for more lines.
860 //
861 // This is similar to the problems of here doc and C++ multi-line
862 // strings. The main difference is that we're not using a submatch.
863 default:
864 break;
865 }
866
867 tokens.push_back(tok); // make a copy
868
869 if (TokenIsSignificant(tok.id)) {
870 line_is_sig = true;
871 }
872 }
873
874#if 0
875 PrintTokens(tokens);
876 Log("%d tokens before", tokens.size());
877 Optimize(&tokens);
878 Log("%d tokens after", tokens.size());
879 PrintTokens(tokens);
880#endif
881
882 FixShellComments(tokens);
883
884 out->Line(line_num, line, tokens);
885 tokens.clear();
886
887 // Potentially multiple here docs for this line
888 int here_index = 0;
889 for (auto here_delim : here_list) {
890 // Log("HERE %s", here_delim.c_str());
891
892 while (true) {
893 const char* blame = reader->Filename() ?: "<stdin>";
894 if (!reader->NextLine()) {
895 Log("micro-syntax: getline() error on %s: %s", blame,
896 strerror(reader->err_num_));
897 return 1;
898 }
899 char* line = reader->Current();
900 if (line == nullptr) {
901 int start_line = here_start_num[here_index];
902 Log("Unexpected end-of-file in here doc in %s, start line %d", blame,
903 start_line);
904 return 1;
905 }
906
907 line_num++;
908
909 if (LineEqualsHereDelim(line, here_delim)) {
910 int n = strlen(line);
911 Token whole_line(Id::HereEnd, n);
912 tokens.push_back(whole_line);
913 out->Line(line_num, line, tokens);
914 tokens.clear();
915 break;
916
917 } else {
918 int n = strlen(line);
919 Token whole_line(Id::Str, n);
920 tokens.push_back(whole_line);
921 out->Line(line_num, line, tokens);
922 tokens.clear();
923
924 // Log(" not equal: %s", line);
925 }
926 }
927 here_index++;
928 }
929 here_list.clear();
930 here_start_num.clear();
931
932 line_num++;
933 num_sig += line_is_sig;
934 }
935
936 out->PathEnd(line_num - 1, num_sig);
937 return 0;
938}
939
940int ScanFiles(const Flags& flag, std::vector<char*> files, OutputStream* out,
941 Hook* hook) {
942 Reader* reader = nullptr;
943
944 int status = 0;
945 for (auto path : files) {
946 FILE* f;
947 if (path == nullptr) {
948 f = stdin;
949 } else {
950 f = fopen(path, "r");
951 if (f == nullptr) {
952 Log("Error opening %s: %s", path, strerror(errno));
953 return 1;
954 }
955 }
956 out->PathBegin(path);
957
958 reader = new Reader(f, path);
959
960 switch (flag.lang) {
961 case lang_e::PlainText:
962 status = ScanOne<text_mode_e>(reader, out, hook);
963 break;
964
965 case lang_e::Py:
966 status = ScanOne<py_mode_e>(reader, out, hook);
967 break;
968
969 case lang_e::Cpp:
970 status = ScanOne<cpp_mode_e>(reader, out, hook);
971 break;
972
973 case lang_e::Shell:
974 status = ScanOne<sh_mode_e>(reader, out, hook);
975 break;
976
977 case lang_e::Asdl:
978 status = ScanOne<asdl_mode_e>(reader, out, hook);
979 break;
980
981 case lang_e::R:
982 status = ScanOne<R_mode_e>(reader, out, hook);
983 break;
984
985 case lang_e::Html:
986 status = ScanOne<html_mode_e>(reader, out, hook);
987 break;
988
989 default:
990 assert(0);
991 }
992
993 delete reader;
994
995 if (path == nullptr) {
996 ;
997 } else {
998 fclose(f);
999 }
1000
1001 if (status != 0) {
1002 break;
1003 }
1004 }
1005
1006 return status;
1007}
1008
// Prints the usage message to stdout.
//
// NOTE(review): the text lists long options, and -n -e --color, but main()
// calls getopt() with the option string "+hl:motw", which accepts only the
// short flags -h -l -m -o -t -w.  The parser and this text should agree.
void PrintHelp() {
  puts(R"(Usage: micro-syntax FLAGS* FILE*

Recognizes the syntax of each file, and prints it to stdout.

If there are no files, reads stdin.

Flags:
 -h --help This help

 -l --lang Language: py|cpp|shell|...
 -t Print tokens as TSV, instead of ANSI color
 -w Print HTML for the web

 -m More color, useful for debugging tokens

 -n --no-comments Omit comments
 -o --comments-only Only print comments
 -e --empty-strs Substitute string literals for empty strings
 --color on off always more

)");
}
1032
1033int main(int argc, char** argv) {
1034 Flags flag = {lang_e::PlainText};
1035
1036 // http://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
1037 // + means to be strict about flag parsing.
1038 int c;
1039 while ((c = getopt(argc, argv, "+hl:motw")) != -1) {
1040 switch (c) {
1041 case 'h':
1042 PrintHelp();
1043 return 0;
1044
1045 case 'l':
1046 if (strcmp(optarg, "cpp") == 0) {
1047 flag.lang = lang_e::Cpp;
1048
1049 } else if (strcmp(optarg, "py") == 0) {
1050 flag.lang = lang_e::Py;
1051
1052 } else if (strcmp(optarg, "shell") == 0) {
1053 flag.lang = lang_e::Shell;
1054
1055 } else if (strcmp(optarg, "asdl") == 0) {
1056 flag.lang = lang_e::Asdl;
1057
1058 } else if (strcmp(optarg, "R") == 0) {
1059 flag.lang = lang_e::R;
1060
1061 // TODO: implement all of these
1062 } else if (strcmp(optarg, "js") == 0) {
1063 flag.lang = lang_e::PlainText;
1064
1065 } else if (strcmp(optarg, "css") == 0) {
1066 flag.lang = lang_e::PlainText;
1067
1068 } else if (strcmp(optarg, "md") == 0) {
1069 flag.lang = lang_e::PlainText;
1070
1071 } else if (strcmp(optarg, "yaml") == 0) {
1072 flag.lang = lang_e::PlainText;
1073
1074 } else if (strcmp(optarg, "html") == 0) {
1075 flag.lang = lang_e::Html;
1076
1077 } else if (strcmp(optarg, "txt") == 0) {
1078 flag.lang = lang_e::PlainText;
1079
1080 } else if (strcmp(optarg, "other") == 0) {
1081 flag.lang = lang_e::PlainText;
1082
1083 } else {
1084 Log("Expected -l LANG to be "
1085 "cpp|py|shell|asdl|R|js|css|md|yaml|html|txt, "
1086 "got %s",
1087 optarg);
1088 return 2;
1089 }
1090 break;
1091
1092 case 'm':
1093 flag.more_color = true;
1094 break;
1095
1096 case 'o':
1097 flag.comments_only = true;
1098 break;
1099
1100 case 't':
1101 flag.tsv = true;
1102 break;
1103
1104 case 'w':
1105 flag.web = true;
1106 break;
1107
1108 case '?': // getopt library will print error
1109 return 2;
1110
1111 default:
1112 abort(); // should never happen
1113 }
1114 }
1115
1116 int a = optind; // index into argv
1117 flag.argv = argv + a;
1118 flag.argc = argc - a;
1119
1120 std::vector<char*> files; // filename, or nullptr for stdin
1121 if (flag.argc != 0) {
1122 for (int i = 0; i < flag.argc; ++i) {
1123 files.push_back(flag.argv[i]);
1124 }
1125 } else {
1126 files.push_back(nullptr); // stands for stdin
1127 }
1128
1129 Printer* pr; // for each file
1130 OutputStream* out; // the entire stream
1131
1132 if (flag.tsv) {
1133 pr = new TsvPrinter();
1134 out = new NetStringOutput(pr);
1135 } else if (flag.web) {
1136 pr = new HtmlPrinter();
1137 out = new NetStringOutput(pr);
1138 } else {
1139 pr = new AnsiPrinter(flag);
1140 out = new AnsiOutput(pr);
1141 }
1142
1143 Hook* hook = nullptr;
1144 if (flag.lang == lang_e::Cpp) {
1145 hook = new CppHook();
1146 } else {
1147 hook = new Hook(); // default hook
1148 }
1149
1150 int status = ScanFiles(flag, files, out, hook);
1151
1152 delete hook;
1153 delete pr;
1154 delete out;
1155
1156 return status;
1157}