OILS / doctools / help_gen.py View on Github | oils.pub

685 lines, 357 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4"""help_gen.py
5
6Ideas for HTML -> ANSI converter:
7
8- `ls` -> <code>ls</code> -> is reverse video?
9- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10 - could also be bright blue
11- <pre> is also indented 4 spaces, like the markdown
12- red X <span class="X">X</span>
13
14- comments in code examples could be green?
15
16What about:
17
18- headings h1, h2, h3, h4
19 - Right now cards use reverse video. Centering didn't look great.
20
21- <ul> - you could use a Unicode bullet here
22- <ol>
23
24Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25thing.
26"""
27
28import cStringIO
29import HTMLParser
30import os
31import pprint
32import re
33import sys
34
35from typing import AnyStr
36
37from _devbuild.gen.htm8_asdl import h8_id
38from doctools import html_lib
39from doctools.util import log
40from lazylex import html
41
42#from typing import List, Tuple
43
# Sections have alphabetical characters, spaces, and '/' for I/O. They are
# turned into anchors.
#
# The pattern matches optional leading whitespace followed by a bracketed
# name, e.g. "[ysh:upgrade]".  Group 1 is the text between the brackets.
SECTION_RE = re.compile(
    r'''
 \s*
 \[
 ([a-zA-Z0-9 /:-]+) # allow ysh:upgrade, byo-server-lib, etc.
 \]
''', re.VERBOSE)

# Complex heuristic to highlight topics.
#
# Capture groups:
#   1. optional "X " marker in front of the topic
#   2. the topic name (a letter or underscore followed by at least one more
#      name character)
#   3. optional trailer: a space plus a non-alphanumeric word, or "()"
#   4. the whitespace that ends the topic (spaces + newline, or spaces)
TOPIC_RE = re.compile(
    r'''
 (X[ ])? # optional deprecation symbol X, then a single space
 @? # optional @array, e.g. @BASH_SOURCE

 ([a-zA-Z_][a-zA-Z0-9/:_-]+)
 # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
 # List/append, cmd/append

 ( [ ] [^a-zA-Z0-9 ] \S*
 # trailer like >> or (make)
 |
 \(\) # optional () for func()
 )?

 ( # order of these 2 clauses matters
 [ ]*\n # spaces/newline
 |
 [ ]+ # 1 or more spaces
 )
''', re.VERBOSE)
# NOTE(review): the bare string literal below is an expression statement with
# no effect; it looks like leftover text -- confirm before removing.
"""
''', re.VERBOSE)
"""
79
80
81def _StringToHref(s):
82 # lower case to match what doctools/cmark.py does
83 return s.lower().replace(' ', '-')
84
85
# Opening tag printed in front of an 'X ' marker so that it shows up in dark
# red; the matching '</span>' is printed by the code that uses this constant.
X_LEFT_SPAN = '<span style="color: darkred">'
87
88
class TopicHtmlRenderer(object):
    """Adds links and highlighting to the lines of a topic index.

    Render() handles one line at a time.  An optional [Section] at the start
    of the line becomes a link with class "level2", each topic name becomes a
    link into the chapter page, and 'X ' markers are wrapped in a dark red
    span.  One dict describing every processed line is appended to debug_out.
    """

    # Run of spaces after the [Section], or the leading indent when there is
    # no section.  Compiled once here rather than on every Render() call.
    _SPACE_1 = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point at 'chap-<chapter>.html'
          debug_out: list that Render() appends one dict per processed line to
          linkify_stop_col: topics whose end column is greater than this are
            not linked, and scanning stops there.  -1 disables the limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, html.Output, Dict[str, Any]) -> None
        """Emit the markup for one TOPIC_RE match.

        Args:
          m: match object from TOPIC_RE; group 1 is the optional 'X ' marker
            and group 2 is the topic name
          out: the html.Output for the line being rendered
          line_info: debug dict for the line; (topic, topic_impl) is appended
            to its 'topics' list
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are linked and X is made red.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor a space is returned unchanged, and nothing is recorded in
          debug_out for it.

        Raises:
          AssertionError: if no spaces follow the section / leading prefix.
        """
        f = cStringIO.StringIO()
        out = html.Output(line, f)

        pos = 0  # position within line

        section_impl = True

        if line.startswith('X '):
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')
            pos = 2
            section_impl = False
        # NOTE(review): the 'X ' branch consumes two characters and this
        # branch also advances by two -- confirm the prefix literal tested
        # here is the intended one.
        elif line.startswith(' '):
            pos = 2
        else:
            return line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACE_1.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)

            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                # Everything from here on is left as plain text.
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
203
204
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream starting at each of the heading tags.

    For *-help.html.

    Every heading tag opens a new group, a tuple of

        (tag, attrs, heading_parts, body_parts)

    Text inside the heading goes to heading_parts, text after it goes to
    body_parts.  Finished groups are appended to the `out` list; call end()
    after feed() to flush the last group and check that tags were balanced.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - ** ** for bold
      - * * for emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.heading_tags = heading_tags
        self.out = out

        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]
        self.in_heading = False

        # Nesting depth: +1 per start tag, -1 per end tag.
        self.indent = 0

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        """Debug trace, indented by nesting depth (disabled by the `if 0`)."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _AppendText(self, s):
        # type: (AnyStr) -> None
        """Route text to the heading or the body of the current group.

        Text seen before the first heading is dropped.
        """
        if self.in_heading:
            self.cur_group[2].append(s)
        elif self.cur_group:
            self.cur_group[3].append(s)

    def handle_starttag(self, tag, attrs):
        # type: (AnyStr, List[Tuple[AnyStr, AnyStr]]) -> None
        is_heading = tag in self.heading_tags
        if is_heading:
            self.in_heading = True
            # A new heading finishes the previous group, if there was one.
            if self.cur_group:
                self.out.append(self.cur_group)
            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (AnyStr) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (AnyStr) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').

        The entity is looked up in html.CHAR_ENTITY and its replacement is
        treated like ordinary text.
        """
        self._AppendText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (AnyStr) -> None
        self.log('data %r', data)
        self._AppendText(data)

    def end(self):
        # type: () -> None
        """Flush the last group; raise RuntimeError on unbalanced tags."""
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
296
297
def ExtractBody(s):
    # type: (str) -> str
    """Return what is between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.
    Nothing is written to the result unless a <body> start tag is found.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tok_start = 0  # where the current token begins
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # Start the output right after <body> ...
                out.SkipTo(tok_end)
                # ... and stop at the position ReadUntilEndTag reports for
                # the closing body tag.
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
331
332
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Yield one (tag, attrs, heading, text) tuple per heading in the page.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML page; only its <body> is examined

    Yields:
      tag: the heading tag name
      attrs: the heading's attributes as (name, value) pairs
      heading: the heading text, stripped of surrounding whitespace
      text: the text after the heading, with leading and trailing newlines
        removed and exactly one newline added at the end

    Raises:
      RuntimeError: from Splitter.end() when the tags are not balanced.
    """
    body = ExtractBody(contents)

    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(body)
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are trimmed, so leading spaces on the first line are
        # kept.
        text = ''.join(body_parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
354
355
def HelpTopics(s):
    # type: (str) -> Iterator
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    section_id is the id= attribute of an <h2>, section_name is the heading
    content up to its first '<', and the text comes from the next <code>
    element, whose class must start with 'language-chapter-links-'.
    """
    tag_lexer = html.TagLexer(s)

    tok_start = 0
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = tok_end

                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

                # Keep only the text in front of the first tag, which removes
                # the HTML link inside the heading.
                heading_html = s[h2_start_right:h2_end_left]
                lt_pos = heading_html.find('<')
                if lt_pos != -1:
                    section_name = heading_html[:lt_pos].strip()
                else:
                    section_name = heading_html

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    tokens, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

        tok_start = tok_end
408
409
class DocNode(object):
    """A tree node used to visualize doc structure.

    A node has a name, optional attributes and text, and a list of child
    nodes that starts out empty and is filled in by the code building the
    tree.
    """

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        self.children = []  # a fresh list for every node
        self.text = text
        self.attrs = attrs  # for h2 and h3 links
        self.name = name
419
420
def CardsFromIndex(sh, out_prefix):
    # type: (str, str) -> None
    """Write one card file per section of the TOC page read from stdin.

    Args:
      sh: prefix for the file names, e.g. 'osh' or 'ysh'
      out_prefix: directory the cards are written to

    Each file is named '<sh>-<section_id>' and holds the section name, a
    blank line, and then the section's text.
    """
    num_sections = 0
    for section_id, section_name, text in HelpTopics(sys.stdin.read()):
        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

        card_path = os.path.join(out_prefix, topic)
        with open(card_path, 'w') as card_f:
            # section_id is printed dynamically
            card_f.write('%s\n\n' % section_name)
            card_f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
443
444
def CardsFromChapters(
        out_dir,  # type: Optional[str]
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Split chapter pages into cards and build a topic -> chapter table.

    Args:
      out_dir: directory for the card files, or None to write nothing
      tag_level: heading tag whose cards are recorded, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)

      topic_to_chap maps each topic id at tag_level to its chapter name, or
      to None when the heading carries the attribute oils-embed="1".  Only
      those embedded topics are written to out_dir.

      root_node is a DocNode named '/' with one child per page; pages hold
      their <h2> nodes, which hold their <h3> nodes.

    Raises:
      RuntimeError: if an <h3> is seen before any <h2>.
      AssertionError: if a file name does not start with 'chap-'.
    """
    topic_to_chap = {}

    root_node = DocNode('/')
    cur_h2_node = None

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-')
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # Use an explicit id= when there is exactly one, otherwise derive
            # the id from the heading text.
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    # Report the file instead of failing with an
                    # AttributeError on None.
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (path, topic_id))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                # Distinct names so the loop's `path` and `f` are not
                # rebound.
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    # Each page's children are its <h2> nodes.
    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
519
520
class StrPool(object):
    """Gives every distinct string a generated variable name.

    Names are 'gStr1', 'gStr2', ... in the order the strings are first added.
    For each new string a 'GLOBAL_STR(name, literal)' line is recorded too.
    """

    def __init__(self):
        # type: () -> None
        self.var_names = {}  # string -> variable name
        self.global_strs = []  # GLOBAL_STR(...) lines, in insertion order
        self.unique_id = 1  # number for the next new variable

    def Add(self, s):
        # type: (str) -> None
        """Register s; adding a string that is already known does nothing."""
        if s not in self.var_names:
            import json

            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            literal = json.dumps(s)
            self.global_strs.append('GLOBAL_STR(%s, %s)' % (name, literal))

            self.var_names[s] = name
543
544
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Write the topic table as a C++ header and source file.

    Args:
      topic_dict: topic name -> chapter name, or None
      header_f: receives the declaration of help_meta::TopicMetadata()
      cc_f: receives the string globals, the gTopics dict and the function
        that returns it.  A None value is written as nullptr.
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # A string gets its variable name on the first Add() and keeps it, so
    # names can be looked up in the same pass that registers them.
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    cc_template = '''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
 return gTopics;
}
}
'''
    cc_f.write(cc_template % (
        '\n'.join(pool.global_strs),
        len(topic_dict),
        ' COMMA '.join(key_names),
        ' COMMA '.join(val_names),
    ))
590
591
def main(argv):
    # type: (List[str]) -> None
    """Command-line entry point; argv[1] selects the action.

    Actions:
      cards-from-index SH OUT_PREFIX
        Read HTML from stdin and write the cards found in it.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGE...
        Write cards, then the topic dict as Python (PY_OUT) and as C++
        (CC_PREFIX.h and CC_PREFIX.cc).
      ref-check PATH...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: for an unknown action, or a ref-check path that is
        neither *.md nor *.html.
    """
    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)
        return

    if action == 'cards-from-chapters':
        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as py_f:
            py_f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            py_f.write('''

from typing import Dict

def TopicMetadata():
 # type: () -> Dict[str, str]
 return TOPICS
''')

        with open(cc_prefix + '.h', 'w') as header_f:
            with open(cc_prefix + '.cc', 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)
        return

    if action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-* blocks.  Only box_nodes is used
                # afterwards, not the returned HTML.
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)
        return

    raise RuntimeError('Invalid action %r' % action)
678
679
if __name__ == '__main__':
    # A RuntimeError is reported as a one-line message with exit status 1;
    # any other exception propagates.
    try:
        main(sys.argv)
    except RuntimeError as err:
        sys.stderr.write('FATAL: %s\n' % err)
        sys.exit(1)