OILS / doctools / micro_syntax.cc View on Github | oils.pub

1152 lines, 736 significant
1// Micro Syntax
2//
3// See doctools/micro-syntax.md
4
5#include "micro_syntax.h" // requires -I $BASE_DIR
6
7#include <assert.h>
8#include <errno.h>
9#include <getopt.h>
10#include <stdarg.h> // va_list, etc.
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h> // free
14#include <string.h>
15
16#include <string>
17#include <vector>
18
// ANSI terminal escape sequences, written to stdout by AnsiPrinter and
// AnsiOutput.

// Text attributes
const char* RESET = "\x1b[0;0m";  // emitted after every colored span
const char* BOLD = "\x1b[1m";
const char* UNDERLINE = "\x1b[4m";
const char* REVERSE = "\x1b[7m";  // reverse video

// Foreground colors, codes 30-37
const char* BLACK = "\x1b[30m";
const char* RED = "\x1b[31m";
const char* GREEN = "\x1b[32m";
const char* YELLOW = "\x1b[33m";
const char* BLUE = "\x1b[34m";
const char* PURPLE = "\x1b[35m";
const char* CYAN = "\x1b[36m";
const char* WHITE = "\x1b[37m";

// Foreground colors, codes 90-97 (only three of them are defined here)
const char* BLACK2 = "\x1b[90m";
const char* RED2 = "\x1b[91m";
const char* BLUE2 = "\x1b[94m";
36
// Writes a printf-style formatted message to stderr, followed by a newline.
void Log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);

  fputc('\n', stderr);
}
44
// Language of the files being scanned.  main() sets it from the -l flag, and
// ScanFiles() uses it to choose which ScanOne<> instantiation to run.
enum class lang_e {
  PlainText,  // what main() starts with, and what -l js|css|md|yaml|txt|other
              // currently map to

  Cpp,  // including C
  Py,
  Shell,
  Ysh,  // ''' etc.  Not selectable with -l, and ScanFiles() has no case for it
  Html,
  Asdl,
  R,  // uses # comments

  // JS,  // uses // comments
};
58
// Reads a FILE* one line at a time with getline(), and tells a clean
// end-of-file apart from an I/O error.
//
// Protocol: call NextLine(); if it returns true, Current() is either the line
// that was read, or nullptr at end-of-file.
//
// We don't care about internal NUL, so this interface doesn't allow it
class Reader {
 public:
  // f:        stream to read from.  It is not closed by this class.
  // filename: for error messages only, nullptr for stdin
  Reader(FILE* f, const char* filename)
      : f_(f),
        filename_(filename),
        line_(nullptr),
        allocated_size_(0),
        err_num_(0) {
  }

  // The getline() buffer belongs to this object, so release it even if the
  // caller stops before reaching EOF.
  ~Reader() {
    free(line_);
  }

  // A copy would free the same getline() buffer twice
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;

  const char* Filename() {  // for error messages only, nullptr for stdin
    return filename_;
  }

  bool NextLine() {
    // Returns false if there was an error, and sets err_num_.
    // Returns true if not error, and Current() can be checked.

    // errno is never cleared by successful calls, so a stale value left by
    // unrelated code must not be mistaken for a getline() failure.
    errno = 0;

    // Note: getline() reuses the previous line's buffer, so we don't have to
    // free it between calls
    ssize_t len = getline(&line_, &allocated_size_, f_);
    if (len >= 0) {
      return true;
    }

    int saved_errno = errno;  // free() is allowed to change errno

    // man page says the buffer should be freed if getline() fails
    free(line_);
    line_ = nullptr;  // tell the caller not to continue
    allocated_size_ = 0;

    // -1 is returned for both EOF and errors; the stream indicators are what
    // distinguish them.
    if (feof(f_) && !ferror(f_)) {
      return true;  // clean EOF
    }

    err_num_ = (saved_errno != 0) ? saved_errno : EIO;
    return false;
  }

  char* Current() {
    // Returns nullptr on EOF.
    return line_;
  }

  FILE* f_;
  const char* filename_;

  char* line_;  // valid for one NextLine() call, nullptr on EOF or error
  size_t allocated_size_;  // unused, but must pass address to getline()
  int err_num_;            // set on error, 0 otherwise
};
104
105class Printer {
106 public:
107 virtual void PrintLineNumber(int line_num) = 0;
108 virtual void PrintLineEnd() {
109 }
110 virtual void PrintToken(const char* line, int line_num, int start_col,
111 Token token) = 0;
112 virtual void Swap(std::string* s) {
113 assert(0);
114 }
115 virtual ~Printer() {
116 }
117};
118
119class HtmlPrinter : public Printer {
120 public:
121 HtmlPrinter() : Printer(), out_() {
122 }
123
124 virtual void Swap(std::string* s) {
125 // assert(s != nullptr);
126 out_.swap(*s);
127 }
128
129 virtual void PrintLineNumber(int line_num) {
130 char buf[16];
131 snprintf(buf, 16, "%d", line_num);
132
133 out_.append("<tr><td class=num>"); // <tr> closed by PrintLineEnd()
134 out_.append(buf);
135 out_.append("</td><td id=L"); // jump to line with foo.html#L32
136 out_.append(buf);
137 out_.append(" class=line>"); // <td> closed by PrintLineEnd()
138 }
139
140 virtual void PrintLineEnd() {
141 out_.append("</td></tr>");
142 }
143
144 virtual void PrintToken(const char* line, int line_num, int start_col,
145 Token tok) {
146 const char* p_start = line + start_col;
147 int num_bytes = tok.end_col - start_col;
148
149 switch (tok.id) {
150 case Id::Comm:
151 PrintSpan("comm", p_start, num_bytes);
152 break;
153
154 case Id::Name:
155 PrintEscaped(p_start, num_bytes);
156 break;
157
158 case Id::PreprocCommand:
159 case Id::LineCont:
160 PrintSpan("preproc", p_start, num_bytes);
161 break;
162
163 case Id::Re2c:
164 PrintSpan("re2c", p_start, num_bytes);
165 break;
166
167 case Id::Other:
168 // PrintSpan("other", p_start, num_bytes);
169 PrintEscaped(p_start, num_bytes);
170 break;
171
172 // for now these are strings
173 case Id::HereBegin:
174 case Id::HereEnd:
175 case Id::Str:
176 PrintSpan("str", p_start, num_bytes);
177 break;
178
179 case Id::LBrace:
180 case Id::RBrace:
181 PrintSpan("brace", p_start, num_bytes);
182 break;
183
184 case Id::Unknown:
185 PrintSpan("x", p_start, num_bytes);
186 break;
187
188 default:
189 PrintEscaped(p_start, num_bytes);
190 break;
191 }
192 }
193
194 private:
195 void PrintEscaped(const char* s, int len) {
196 // HTML escape the code string
197 for (int i = 0; i < len; ++i) {
198 char c = s[i];
199
200 switch (c) {
201 case '<':
202 out_.append("&lt;");
203 break;
204 case '>':
205 out_.append("&gt;");
206 break;
207 case '&':
208 out_.append("&amp;");
209 break;
210 default:
211 // Is this inefficient? Fill 1 char
212 out_.append(1, s[i]);
213 break;
214 }
215 }
216 }
217
218 void PrintSpan(const char* css_class, const char* s, int len) {
219 out_.append("<span class=");
220 out_.append(css_class);
221 out_.append(">");
222
223 PrintEscaped(s, len);
224
225 out_.append("</span>");
226 }
227
228 std::string out_;
229};
230
// Command line options, filled in by main().  main() initializes only `lang`
// explicitly, so the remaining members start out value-initialized.
struct Flags {
  lang_e lang;         // -l LANG
  bool tsv;            // -t: TsvPrinter with NetStringOutput
  bool web;            // -w: HtmlPrinter with NetStringOutput
  bool more_color;     // -m: AnsiPrinter also colors Id::Other and Id::WS
  bool comments_only;  // -o: AnsiPrinter blanks out everything but comments

  // The arguments that follow the flags, i.e. the files to scan
  int argc;
  char** argv;
};
241
242class AnsiPrinter : public Printer {
243 public:
244 AnsiPrinter(const Flags& flag) : Printer(), flag_(flag) {
245 }
246
247 virtual void PrintLineNumber(int line_num) {
248 if (flag_.comments_only) {
249 return;
250 }
251 printf("%s%5d%s ", BLACK2, line_num, RESET);
252 }
253
254 virtual void PrintToken(const char* line, int line_num, int start_col,
255 Token tok) {
256 const char* p_start = line + start_col;
257 int num_bytes = tok.end_col - start_col;
258 switch (tok.id) {
259 case Id::Comm:
260 if (flag_.comments_only) {
261 PrintAlways(p_start, num_bytes);
262 } else {
263 PrintColor(BLUE, p_start, num_bytes);
264 }
265 break;
266
267 case Id::Name:
268 PrintText(p_start, num_bytes);
269 break;
270
271 case Id::PreprocCommand:
272 case Id::LineCont:
273 PrintColor(PURPLE, p_start, num_bytes);
274 break;
275
276 case Id::Re2c:
277 PrintColor(PURPLE, p_start, num_bytes);
278 break;
279
280 case Id::Other:
281 if (flag_.more_color) {
282 PrintColor(PURPLE, p_start, num_bytes);
283 } else {
284 PrintText(p_start, num_bytes);
285 }
286 break;
287
288 case Id::WS:
289 if (flag_.more_color) {
290 fputs(REVERSE, stdout);
291 PrintColor(WHITE, p_start, num_bytes);
292 } else {
293 PrintText(p_start, num_bytes);
294 }
295 break;
296
297 case Id::Str:
298 PrintColor(RED, p_start, num_bytes);
299 break;
300
301 case Id::HereBegin:
302 case Id::HereEnd: {
303 PrintColor(RED2, p_start, num_bytes);
304
305 // Debug submatch extraction
306#if 0
307 fputs(RED, stdout);
308 int n = tok.submatch_len;
309 fwrite(tok.submatch_start, 1, n, stdout);
310 fputs(RESET, stdout);
311#endif
312 } break;
313
314 case Id::DelimStrBegin:
315 case Id::DelimStrEnd: {
316 PrintColor(RED2, p_start, num_bytes);
317
318 // Debug submatch extraction
319#if 0
320 fputs(RED, stdout);
321 int n = tok.submatch_len;
322 fwrite(tok.submatch_start, 1, n, stdout);
323 fputs(RESET, stdout);
324#endif
325 } break;
326
327 case Id::LBrace:
328 case Id::RBrace:
329 PrintColor(GREEN, p_start, num_bytes);
330 break;
331
332 case Id::AttrName:
333 PrintColor(GREEN, p_start, num_bytes);
334 break;
335
336 case Id::TagNameLeft:
337 case Id::TagNameRight:
338 PrintColor(PURPLE, p_start, num_bytes);
339 break;
340
341 case Id::SelfClose:
342 case Id::EndTag:
343 PrintColor(RED2, p_start, num_bytes);
344 break;
345
346 case Id::CharEscape:
347 PrintColor(BLUE, p_start, num_bytes);
348 break;
349
350 case Id::Unknown:
351 case Id::BadAmpersand:
352 case Id::BadGreaterThan:
353 case Id::BadLessThan:
354 // Make errors red
355 fputs(REVERSE, stdout);
356 PrintColor(RED, p_start, num_bytes);
357 break;
358
359 default:
360 PrintText(p_start, num_bytes);
361 break;
362 }
363 }
364
365 private:
366 void PrintColor(const char* color, const char* s, int n) {
367 fputs(color, stdout);
368 PrintText(s, n);
369 fputs(RESET, stdout);
370 }
371
372 void PrintText(const char* s, int n) {
373 if (flag_.comments_only) {
374 for (int i = 0; i < n; ++i) {
375 // Replace everything but newline with space
376 // TODO: I think we always want a newline token, including in comments.
377 // That will simplify this.
378 char c = (s[i] == '\n') ? '\n' : ' ';
379 fwrite(&c, 1, 1, stdout);
380 }
381 } else {
382 fwrite(s, 1, n, stdout);
383 }
384 }
385
386 void PrintAlways(const char* s, int n) {
387 fwrite(s, 1, n, stdout);
388 }
389
390 const Flags& flag_;
391};
392
393const char* Id_str(Id id) {
394 switch (id) {
395 case Id::Comm:
396 return "Comm";
397 case Id::MaybeComment: // fix-up doesn't guarantee this is gone
398 return "MaybeComment";
399 case Id::WS:
400 return "WS";
401 case Id::Re2c:
402 return "Re2c";
403
404 case Id::MaybePreproc: // fix-up doesn't guarantee this is gone
405 return "MaybePreproc";
406 case Id::PreprocCommand:
407 return "PreprocCommand";
408 case Id::PreprocOther:
409 return "PreprocOther";
410 case Id::LineCont:
411 return "LineCont";
412
413 case Id::Name:
414 return "Name";
415 case Id::Other:
416 return "Other";
417
418 case Id::Str:
419 return "Str";
420
421 case Id::HereBegin:
422 return "HereBegin";
423 case Id::HereEnd:
424 return "HereEnd";
425 case Id::DelimStrBegin:
426 return "DelimStrBegin";
427 case Id::DelimStrEnd:
428 return "DelimStrEnd";
429
430 case Id::LBrace:
431 return "LBrace";
432 case Id::RBrace:
433 return "RBrace";
434
435 case Id::Unknown:
436 return "Unknown";
437 default:
438 assert(0);
439 }
440}
441
442class TsvPrinter : public Printer {
443 public:
444 virtual void PrintLineNumber(int line_num) {
445 ;
446 }
447
448 virtual void Swap(std::string* s) {
449 // out_.swap(*s);
450 }
451
452 virtual void PrintToken(const char* line, int line_num, int start_col,
453 Token tok) {
454 printf("%d\t%s\t%d\t%d\n", line_num, Id_str(tok.id), start_col,
455 tok.end_col);
456 // printf(" -> mode %d\n", lexer.line_mode);
457 }
458 virtual ~TsvPrinter() {
459 }
460};
461
462bool TokenIsSignificant(Id id) {
463 switch (id) {
464 case Id::Name:
465 case Id::Other:
466 case Id::PreprocCommand:
467 case Id::PreprocOther:
468 case Id::Re2c:
469 return true;
470
471 // Comments, whitespace, and string literals aren't significant
472 // TODO: can abort on Id::Unknown?
473 default:
474 break;
475 }
476 return false;
477}
478
479class OutputStream {
480 // stdout contains either
481 // - netstrings of HTML, or TSV Token structs
482 // - ANSI text
483
484 public:
485 OutputStream(Printer* pr) : pr_(pr) {
486 }
487 virtual void PathBegin(const char* path) = 0;
488 virtual void Line(int line_num, const char* line,
489 const std::vector<Token>& tokens) = 0;
490 virtual void PathEnd(int num_lines, int num_sig_lines) = 0;
491 virtual ~OutputStream() {
492 }
493
494 protected:
495 Printer* pr_; // how to print each file
496};
497
498class NetStringOutput : public OutputStream {
499 public:
500 NetStringOutput(Printer* pr) : OutputStream(pr) {
501 }
502
503 virtual void PathBegin(const char* path) {
504 if (path == nullptr) {
505 path = "<stdin>";
506 }
507 PrintNetString(path, strlen(path));
508 }
509
510 virtual void Line(int line_num, const char* line,
511 const std::vector<Token>& tokens) {
512 pr_->PrintLineNumber(line_num);
513
514 int start_col = 0;
515 for (auto tok : tokens) {
516 pr_->PrintToken(line, line_num, start_col, tok);
517 start_col = tok.end_col;
518 }
519
520 pr_->PrintLineEnd();
521 }
522
523 virtual void PathEnd(int num_lines, int num_sig_lines) {
524 std::string string_for_file;
525 pr_->Swap(&string_for_file);
526
527 PrintNetString(string_for_file.c_str(), string_for_file.size());
528
529 // Output summary in JSON
530 // TODO: change this to a 4th column
531 char buf[64];
532 int n = snprintf(buf, 64, "{\"num_lines\": %d, \"num_sig_lines\": %d}",
533 num_lines, num_sig_lines);
534 PrintNetString(buf, n);
535 }
536
537 private:
538 void PrintNetString(const char* s, int len) {
539 fprintf(stdout, "%d:%*s,", len, len, s);
540 }
541};
542
543class AnsiOutput : public OutputStream {
544 public:
545 AnsiOutput(Printer* pr) : OutputStream(pr) {
546 }
547
548 // TODO: Can respect --comments-only
549 virtual void PathBegin(const char* path) {
550 if (path == nullptr) {
551 path = "<stdin>";
552 }
553 // diff uses +++ ---
554 printf("\n");
555 printf("=== %s%s%s%s ===\n", BOLD, PURPLE, path, RESET);
556 printf("\n");
557 }
558
559 virtual void Line(int line_num, const char* line,
560 const std::vector<Token>& tokens) {
561 pr_->PrintLineNumber(line_num);
562
563 int start_col = 0;
564 for (auto tok : tokens) {
565 pr_->PrintToken(line, line_num, start_col, tok);
566 start_col = tok.end_col;
567 }
568
569 pr_->PrintLineEnd();
570 };
571
572 // TODO: Can respect --comments-only
573 virtual void PathEnd(int num_lines, int num_sig_lines) {
574 fprintf(stdout, "%s%d lines, %d significant%s\n", GREEN, num_lines,
575 num_sig_lines, RESET);
576 };
577};
578
579void PrintTokens(std::vector<Token>& toks) {
580 int start_col = 0;
581 int i = 0;
582 Log("===");
583 for (auto tok : toks) {
584 Log("%2d %10s %2d %2d", i, Id_str(tok.id), start_col, tok.end_col);
585 start_col = tok.end_col;
586 ++i;
587 }
588 Log("===");
589}
590
591// BUGGY, needs unit tests
592
593// Fiddly function, reduces the size of the output a bit
594// "hi" becomes 1 Id::DQ token instead of 3 separate Id::DQ tokens
595void Optimize(std::vector<Token>* tokens) {
596 std::vector<Token>& toks = *tokens; // alias
597
598 // PrintTokens(toks);
599
600 int n = toks.size();
601 if (n < 1) { // nothing to de-duplicate
602 return;
603 }
604
605 int left = 0;
606 int right = 1;
607 while (right < n) {
608 Log("right ID = %s, end %d", Id_str(toks[right].id), toks[right].end_col);
609
610 if (toks[left].id == toks[right].id) {
611 // Join the tokens together
612 toks[left].end_col = toks[right].end_col;
613 } else {
614 toks[left] = toks[right];
615 left++;
616 Log(" not eq, left = %d", left);
617 }
618 right++;
619 }
620 Log("left = %d, right = %d", left, right);
621
622 // Fiddly condition: one more iteration. Need some unit tests for this.
623 toks[left] = toks[right - 1];
624 left++;
625 assert(left <= n);
626
627 // Erase the remaining ones
628 toks.resize(left);
629
630 // PrintTokens(toks);
631}
632
633// Version of the above that's not in-place, led to a bug fix
634void Optimize2(std::vector<Token>* tokens) {
635 std::vector<Token> optimized;
636
637 int n = tokens->size();
638 if (n < 1) {
639 return;
640 }
641
642 optimized.reserve(n);
643
644 int left = 0;
645 int right = 1;
646 while (right < n) {
647 optimized.push_back((*tokens)[left]);
648 left++;
649 right++;
650 }
651 optimized.push_back((*tokens)[left]);
652 left++;
653
654 tokens->swap(optimized);
655}
656
// Returns true if `line` terminates the here doc whose delimiter is
// `here_delim`.
//
// The line matches when, after its leading tabs, it starts with the delimiter
// and has nothing but whitespace (space, tab, \r, \n) after it:
//
//   EOF matches "EOF\n", "EOF \t\n", "\t\tEOF\n", and "EOF" (the last line of
//   a file that doesn't end with a newline), but not "EOFx\n" or "x\n"
bool LineEqualsHereDelim(const char* line, std::string& here_delim) {
  // Hack: skip leading tabs unconditionally, even though that's only allowed
  // in <<-, which strips ALL leading tabs.  Really we should capture the
  // operator and the delim?
  while (*line == '\t') {
    line++;
  }

  const size_t n = strlen(line);
  const size_t h = here_delim.size();

  // Log("Here delim=%s line=%s", here_delim.c_str(), line);

  // The line is usually longer than the delimiter, because of the newline.
  // It has the same length when the file ends without one.
  if (n < h) {
    // Log(" [0] line too short");
    return false;
  }

  if (memcmp(line, here_delim.data(), h) != 0) {
    // Log(" [1] delimiter not equal");
    return false;
  }

  // Only whitespace may follow the delimiter
  for (size_t i = h; i < n; ++i) {
    switch (line[i]) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      break;
    default:
      // Log(" [2] byte %d not whitespace", i);
      return false;
    }
  }

  return true;
}
701
702void CppHook::TryPreprocess(char* line, std::vector<Token>* tokens) {
703 // Fills tokens, which can be checked for beginning and end tokens
704
705 Lexer<pp_mode_e> lexer(line);
706 Matcher<pp_mode_e> matcher;
707
708 while (true) { // tokens on each line
709 Token tok;
710 // Log("Match %d", lexer.p_current - lexer.line_);
711 bool eol = matcher.Match(&lexer, &tok);
712 // Log("EOL %d", eol);
713 if (eol) {
714 break;
715 }
716 // Log("TOK %s %d", Id_str(tok.id), tok.end_col);
717 tokens->push_back(tok); // make a copy
718 }
719}
720
721void FixShellComments(std::vector<Token>& tokens) {
722 int n = tokens.size();
723 for (int i = 0; i < n; ++i) {
724 // # comment at start of line
725 if (tokens[i].id == Id::MaybeComment) {
726 if (i == 0) {
727 tokens[i].id = Id::Comm;
728 }
729 if (i != 0 and tokens[i - 1].id == Id::WS) {
730 tokens[i].id = Id::Comm;
731 }
732 }
733 }
734}
735
// Scans one file: reads it line by line, lexes each line with the mode enum T,
// and sends the tokens of each line to `out`.
//
// Three kinds of lines are handled:
//
// 1. Preprocessor lines, which the hook recognizes, together with the lines
//    that continue them.  Each one counts as significant.
// 2. "Normal" lines, lexed with Lexer<T> / Matcher<T>.
// 3. Here doc bodies, which follow a normal line that had Id::HereBegin
//    tokens.  Each body line becomes one Id::Str token for the whole line, and
//    the terminating line becomes one Id::HereEnd token.  These lines are
//    numbered, but never counted as significant.
//
// Args:
//   reader: where the lines come from
//   out:    receives Line() for each line, and PathEnd() at the end
//   hook:   TryPreprocess() is called on every line that starts an iteration
//           of the outer loop
//
// Returns 0 on success.  Returns 1 after logging a message if getline()
// fails, or if the file ends inside a preprocessor continuation or a here
// doc; out->PathEnd() is NOT called in that case.
//
// This templated method causes some code expansion, but not too much.  The
// binary went from 38 KB to 42 KB, after being stripped.
// We get a little type safety with py_mode_e vs cpp_mode_e.

template <typename T>
int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
  // One lexer for the whole file, re-pointed at each normal line with
  // SetLine().  Its line_mode is assigned below when a delimited string ends.
  Lexer<T> lexer(nullptr);
  Matcher<T> matcher;

  int line_num = 1;  // number of the line about to be output
  int num_sig = 0;   // number of significant lines so far

  // Here docs that were begun on the current line.  The two vectors are
  // parallel: here_start_num[i] is the line number where here_list[i] began,
  // and it's only used in an error message.
  std::vector<std::string> here_list;  // delimiters to pop
  std::vector<int> here_start_num;

  // For multi-line strings.  This has 0 or 1 entries, and the 1 entry can be
  // the empty string.
  std::vector<std::string> delim_begin;

  while (true) {  // read each line, handling errors
    if (!reader->NextLine()) {
      // a ?: b is a GNU extension, meaning a ? a : b
      const char* name = reader->Filename() ?: "<stdin>";
      Log("micro-syntax: getline() error on %s: %s", name,
          strerror(reader->err_num_));
      return 1;
    }
    char* line = reader->Current();
    if (line == nullptr) {
      break;  // EOF
    }

    std::vector<Token> pre_tokens;

    hook->TryPreprocess(line, &pre_tokens);

    // e.g #define at beginning of line
    if (pre_tokens.size() && pre_tokens[0].id == Id::MaybePreproc) {
      pre_tokens[0].id = Id::PreprocCommand;

      out->Line(line_num, line, pre_tokens);

      line_num += 1;
      num_sig += 1;

      // As long as a line ends with a line continuation, the next line is
      // part of the same directive, so it's lexed by the hook as well.
      Token last = pre_tokens.back();
      while (last.id == Id::LineCont) {
        const char* blame = reader->Filename() ?: "<stdin>";
        if (!reader->NextLine()) {
          Log("micro-syntax: getline() error on %s: %s", blame,
              strerror(reader->err_num_));
          return 1;
        }
        // Shadows the outer `line`, which isn't used again in this iteration
        char* line = reader->Current();
        if (line == nullptr) {
          Log("Unexpected end-of-file in preprocessor in %s", blame);
          return 1;
        }

        pre_tokens.clear();
        hook->TryPreprocess(line, &pre_tokens);

        out->Line(line_num, line, pre_tokens);

        line_num += 1;
        num_sig += 1;

        // NOTE(review): assumes TryPreprocess() produced at least one token
        // for the continuation line; back() on an empty vector is undefined
        last = pre_tokens.back();
      }
      continue;  // Skip the rest of the loop
    }

    //
    // Main Loop for "normal" lines (not preprocessor or here doc)
    //

    std::vector<Token> tokens;
    lexer.SetLine(line);

    bool line_is_sig = false;
    while (true) {  // tokens on each line
      Token tok;
      bool eol = matcher.Match(&lexer, &tok);
      if (eol) {
        break;
      }

      switch (tok.id) {
      case Id::HereBegin: {
        // Put a copy on the stack
        // The submatch is the delimiter, as column offsets into the line
        int n = tok.submatch_end - tok.submatch_start;
        here_list.emplace_back(line + tok.submatch_start, n);
        here_start_num.push_back(line_num);
      } break;

      case Id::DelimStrBegin: {
        if (delim_begin.empty()) {
          // Remember the opening delimiter, to compare with closing ones
          int n = tok.submatch_end - tok.submatch_start;
          delim_begin.emplace_back(line + tok.submatch_start, n);
        } else {
          // We have entered cpp_mode_e::DelimStr, which means we should never
          // return another DelimStrBegin
          assert(0);
        }
      } break;

      case Id::DelimStrEnd: {
        if (delim_begin.empty()) {
          // We should never get this unless we got a DelimStrBegin first
          assert(0);
        } else {
          size_t n = tok.submatch_end - tok.submatch_start;
          std::string end_delim(line + tok.submatch_start, n);

          if (end_delim == delim_begin.back()) {
            lexer.line_mode = T::Outer;  // the string is ended
            delim_begin.pop_back();
          } else {
            tok.id = Id::Str;  // mismatched delimiter is just a string
          }
        }
      } break;

      default:
        break;
      }

      tokens.push_back(tok);  // make a copy

      if (TokenIsSignificant(tok.id)) {
        line_is_sig = true;
      }
    }

#if 0
    PrintTokens(tokens);
    Log("%d tokens before", tokens.size());
    Optimize(&tokens);
    Log("%d tokens after", tokens.size());
    PrintTokens(tokens);
#endif

    // Applied for every T, not only for shell
    FixShellComments(tokens);

    out->Line(line_num, line, tokens);
    tokens.clear();

    // Potentially multiple here docs for this line
    //
    // Their bodies follow one after another, in the order that the delimiters
    // appeared.  line_num was not incremented for the line that was just
    // output, so each body line increments it BEFORE being output, and the
    // increment at the bottom of the outer loop then steps past the last one.
    int here_index = 0;
    for (auto here_delim : here_list) {  // copies each delimiter
      // Log("HERE %s", here_delim.c_str());

      while (true) {
        const char* blame = reader->Filename() ?: "<stdin>";
        if (!reader->NextLine()) {
          Log("micro-syntax: getline() error on %s: %s", blame,
              strerror(reader->err_num_));
          return 1;
        }
        // Shadows the outer `line`, which isn't used again in this iteration
        char* line = reader->Current();
        if (line == nullptr) {
          int start_line = here_start_num[here_index];
          Log("Unexpected end-of-file in here doc in %s, start line %d", blame,
              start_line);
          return 1;
        }

        line_num++;

        if (LineEqualsHereDelim(line, here_delim)) {
          // The terminator: one HereEnd token that spans the line
          int n = strlen(line);
          Token whole_line(Id::HereEnd, n);
          tokens.push_back(whole_line);
          out->Line(line_num, line, tokens);
          tokens.clear();
          break;

        } else {
          // A body line: one Str token that spans the line
          int n = strlen(line);
          Token whole_line(Id::Str, n);
          tokens.push_back(whole_line);
          out->Line(line_num, line, tokens);
          tokens.clear();

          // Log(" not equal: %s", line);
        }
      }
      here_index++;
    }
    here_list.clear();
    here_start_num.clear();

    line_num++;
    num_sig += line_is_sig;  // bool converts to 0 or 1
  }

  // line_num is one past the last line
  out->PathEnd(line_num - 1, num_sig);
  return 0;
}
934
935int ScanFiles(const Flags& flag, std::vector<char*> files, OutputStream* out,
936 Hook* hook) {
937 Reader* reader = nullptr;
938
939 int status = 0;
940 for (auto path : files) {
941 FILE* f;
942 if (path == nullptr) {
943 f = stdin;
944 } else {
945 f = fopen(path, "r");
946 if (f == nullptr) {
947 Log("Error opening %s: %s", path, strerror(errno));
948 return 1;
949 }
950 }
951 out->PathBegin(path);
952
953 reader = new Reader(f, path);
954
955 switch (flag.lang) {
956 case lang_e::PlainText:
957 status = ScanOne<text_mode_e>(reader, out, hook);
958 break;
959
960 case lang_e::Py:
961 status = ScanOne<py_mode_e>(reader, out, hook);
962 break;
963
964 case lang_e::Cpp:
965 status = ScanOne<cpp_mode_e>(reader, out, hook);
966 break;
967
968 case lang_e::Shell:
969 status = ScanOne<sh_mode_e>(reader, out, hook);
970 break;
971
972 case lang_e::Asdl:
973 status = ScanOne<asdl_mode_e>(reader, out, hook);
974 break;
975
976 case lang_e::R:
977 status = ScanOne<R_mode_e>(reader, out, hook);
978 break;
979
980 case lang_e::Html:
981 status = ScanOne<html_mode_e>(reader, out, hook);
982 break;
983
984 default:
985 assert(0);
986 }
987
988 delete reader;
989
990 if (path == nullptr) {
991 ;
992 } else {
993 fclose(f);
994 }
995
996 if (status != 0) {
997 break;
998 }
999 }
1000
1001 return status;
1002}
1003
// Prints the usage message to stdout.
//
// Note that main() parses short options only, with the getopt() string
// "+hl:motw".  The long spellings, and -n, -e, and --color, are listed here
// but aren't parsed in this file.
void PrintHelp() {
  puts(R"(Usage: micro-syntax FLAGS* FILE*

Recognizes the syntax of each file, and prints it to stdout.

If there are no files, reads stdin.

Flags:
 -h --help This help

 -l --lang Language: py|cpp|shell|...
 -t Print tokens as TSV, instead of ANSI color
 -w Print HTML for the web

 -m More color, useful for debugging tokens

 -n --no-comments Omit comments
 -o --comments-only Only print comments
 -e --empty-strs Substitute string literals for empty strings
 --color on off always more

)");
}
1027
1028int main(int argc, char** argv) {
1029 Flags flag = {lang_e::PlainText};
1030
1031 // http://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
1032 // + means to be strict about flag parsing.
1033 int c;
1034 while ((c = getopt(argc, argv, "+hl:motw")) != -1) {
1035 switch (c) {
1036 case 'h':
1037 PrintHelp();
1038 return 0;
1039
1040 case 'l':
1041 if (strcmp(optarg, "cpp") == 0) {
1042 flag.lang = lang_e::Cpp;
1043
1044 } else if (strcmp(optarg, "py") == 0) {
1045 flag.lang = lang_e::Py;
1046
1047 } else if (strcmp(optarg, "shell") == 0) {
1048 flag.lang = lang_e::Shell;
1049
1050 } else if (strcmp(optarg, "asdl") == 0) {
1051 flag.lang = lang_e::Asdl;
1052
1053 } else if (strcmp(optarg, "R") == 0) {
1054 flag.lang = lang_e::R;
1055
1056 // TODO: implement all of these
1057 } else if (strcmp(optarg, "js") == 0) {
1058 flag.lang = lang_e::PlainText;
1059
1060 } else if (strcmp(optarg, "css") == 0) {
1061 flag.lang = lang_e::PlainText;
1062
1063 } else if (strcmp(optarg, "md") == 0) {
1064 flag.lang = lang_e::PlainText;
1065
1066 } else if (strcmp(optarg, "yaml") == 0) {
1067 flag.lang = lang_e::PlainText;
1068
1069 } else if (strcmp(optarg, "html") == 0) {
1070 flag.lang = lang_e::Html;
1071
1072 } else if (strcmp(optarg, "txt") == 0) {
1073 flag.lang = lang_e::PlainText;
1074
1075 } else if (strcmp(optarg, "other") == 0) {
1076 flag.lang = lang_e::PlainText;
1077
1078 } else {
1079 Log("Expected -l LANG to be "
1080 "cpp|py|shell|asdl|R|js|css|md|yaml|html|txt, "
1081 "got %s",
1082 optarg);
1083 return 2;
1084 }
1085 break;
1086
1087 case 'm':
1088 flag.more_color = true;
1089 break;
1090
1091 case 'o':
1092 flag.comments_only = true;
1093 break;
1094
1095 case 't':
1096 flag.tsv = true;
1097 break;
1098
1099 case 'w':
1100 flag.web = true;
1101 break;
1102
1103 case '?': // getopt library will print error
1104 return 2;
1105
1106 default:
1107 abort(); // should never happen
1108 }
1109 }
1110
1111 int a = optind; // index into argv
1112 flag.argv = argv + a;
1113 flag.argc = argc - a;
1114
1115 std::vector<char*> files; // filename, or nullptr for stdin
1116 if (flag.argc != 0) {
1117 for (int i = 0; i < flag.argc; ++i) {
1118 files.push_back(flag.argv[i]);
1119 }
1120 } else {
1121 files.push_back(nullptr); // stands for stdin
1122 }
1123
1124 Printer* pr; // for each file
1125 OutputStream* out; // the entire stream
1126
1127 if (flag.tsv) {
1128 pr = new TsvPrinter();
1129 out = new NetStringOutput(pr);
1130 } else if (flag.web) {
1131 pr = new HtmlPrinter();
1132 out = new NetStringOutput(pr);
1133 } else {
1134 pr = new AnsiPrinter(flag);
1135 out = new AnsiOutput(pr);
1136 }
1137
1138 Hook* hook = nullptr;
1139 if (flag.lang == lang_e::Cpp) {
1140 hook = new CppHook();
1141 } else {
1142 hook = new Hook(); // default hook
1143 }
1144
1145 int status = ScanFiles(flag, files, out, hook);
1146
1147 delete hook;
1148 delete pr;
1149 delete out;
1150
1151 return status;
1152}