OILS / doctools / help_gen.py View on Github | oils.pub

692 lines, 360 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4"""help_gen.py
5
6Ideas for HTML -> ANSI converter:
7
8- `ls` -> <code>ls</code> -> is reverse video?
9- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10 - could also be bright blue
11- <pre> is also indented 4 spaces, like the markdown
12- red X <span class="X">X</span>
13
14- comments in code examples could be green?
15
16What about:
17
18- headings h1, h2, h3, h4
19 - Right now cards use reverse video. Centering didn't look great.
20
21- <ul> - you could use a Unicode bullet here
22- <ol>
23
24Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25thing.
26"""
27
28import cStringIO
29import HTMLParser
30import os
31import pprint
32import re
33import sys
34
35from typing import AnyStr
36
37from _devbuild.gen.htm8_asdl import h8_id
38from data_lang import htm8
39from doctools import html_lib
40from doctools.util import log
41from lazylex import html
42
43#from typing import List, Tuple
44
# Matches a bracketed section name such as [Section Name] after optional
# whitespace.  Group 1 is the name: letters, digits, spaces, and '/', ':', '-'
# ('/' is used for I/O).  Section names are turned into anchors.
SECTION_RE = re.compile(
    r'''
  \s*
  \[
  ([a-zA-Z0-9 /:-]+)  # allow ysh:upgrade, byo-server-lib, etc.
  \]
''', re.VERBOSE)
54
# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' deprecation marker
#   2: the topic name itself (the part that becomes a link)
#   3: optional trailer after the name, or '()'
#   4: the whitespace that terminates the topic
TOPIC_RE = re.compile(
    r'''
  (X[ ])?        # optional deprecation symbol X, then a single space
  @?             # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                 # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                 # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                 # trailer like >> or (make)
  |
    \(\)         # optional () for func()
  )?

  (              # order of these 2 clauses matters
    [ ]*\n       # spaces/newline
  |
    [ ]+         # 1 or more spaces
  )
''', re.VERBOSE)
# NOTE(review): the bare string below is an expression statement with no
# runtime effect.  It looks like a workaround for editors whose syntax
# highlighting is confused by the ''' inside the pattern above -- confirm
# before removing.
"""
''', re.VERBOSE)
"""
80
81
def _StringToHref(s):
    """Turn a heading string into an anchor name.

    Every space becomes '-', and the result is lower-cased to match what
    doctools/cmark.py does.
    """
    dashed = '-'.join(s.split(' '))
    return dashed.lower()
85
86
# Opening tag wrapped around the 'X' marker; the caller prints the matching
# </span> itself.
X_LEFT_SPAN = '<span style="color: darkred">'
88
89
class TopicHtmlRenderer(object):
    """Insert links into lines of a topic index.

    Render() takes one line at a time.  A [Section] at the start of the line
    and each following topic name are wrapped in <a> tags that point into
    chap-<chapter>.html, and a leading 'X' marker is wrapped in a colored
    span.
    """

    # A run of spaces: the whitespace after a [Section], or the leading
    # whitespace of a line without one.  Compiled once here instead of on
    # every Render() call.
    _SPACES_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point at 'chap-<chapter>.html'
          debug_out: list; Render() appends one dict of section/topic info
            for each line it linkifies
          linkify_stop_col: 1-based column; a topic whose name ends past it
            stops the linkifying of that line.  -1 means no limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, htm8.Output, Dict[str, Any]) -> None
        """Emit the markup for one TOPIC_RE match and record the topic.

        Args:
          m: a TOPIC_RE match object
          out: output positioned somewhere before the match
          line_info: per-line dict; (topic, topic_impl) is appended to
            its 'topics' list
        """
        # The X: its presence means the topic is recorded with impl=False
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X is wrapped in a colored span.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged.
        """
        f = cStringIO.StringIO()
        out = htm8.Output(line, f)

        pos = 0  # position within line

        section_impl = True

        # Both accepted prefixes are exactly 2 characters wide, which is why
        # pos becomes 2 in either case.
        if line.startswith('X '):
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')
            pos = 2
            section_impl = False
        elif line.startswith('  '):
            pos = 2
        else:
            return line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACES_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)

            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
204
205
# One card under construction: (tag, attrs, heading_parts, body_parts).
# Both trailing elements are lists that text fragments are appended to.
CurGroup = Tuple[AnyStr, List[Tuple[AnyStr, AnyStr]], List[AnyStr], List[Any]]
207
208
class Splitter(HTMLParser.HTMLParser):
    """Cut an HTML stream into groups, one per heading tag.

    For *-help.html.

    Every start tag named in heading_tags opens a new group
    (tag, attrs, heading_parts, body_parts); the previous group, if any, is
    appended to `out`.  Text and named entities inside the heading go to
    heading_parts, everything after it goes to body_parts.  Text seen before
    the first heading is dropped.

    Call end() after feed() to flush the last group.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - ** ** for bold
      - * * for emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List[CurGroup]) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.heading_tags = heading_tags
        self.out = out

        self.cur_group = None  # type: CurGroup
        self.in_heading = False

        # Nesting depth: +1 per start tag, -1 per end tag; checked in end()
        self.indent = 0

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        """Debug logging, indented by nesting depth (disabled by the `if 0`)."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _AppendText(self, text):
        # type: (AnyStr) -> None
        """Route a text fragment to the heading or the body of the group."""
        if self.in_heading:
            self.cur_group[2].append(text)
        elif self.cur_group:
            self.cur_group[3].append(text)

    def handle_starttag(self, tag, attrs):
        # type: (AnyStr, List[Tuple[AnyStr, AnyStr]]) -> None
        if tag in self.heading_tags:
            self.in_heading = True
            # Flush the group that this heading terminates
            if self.cur_group:
                self.out.append(self.cur_group)

            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (AnyStr) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (AnyStr) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        self._AppendText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (AnyStr) -> None
        self.log('data %r', data)
        self._AppendText(data)

    def end(self):
        # type: () -> None
        """Flush the final group, then verify start and end tags balanced.

        Raises:
          RuntimeError: if the nesting depth did not return to zero.
        """
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
300
301
def ExtractBody(s):
    # type: (str) -> str
    """Return the text strictly between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.
    Only the first <body> start tag is considered; if there is none, the
    result is the empty string.
    """
    buf = cStringIO.StringIO()
    out = htm8.Output(s, buf)
    tag_lexer = htm8.TagLexer(s)

    tok_start = 0  # left edge of the token being examined
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.GetTagName() == 'body':
                # tok_end is right after <body>
                out.SkipTo(tok_end)

                # Consumes tokens from the same iterator, up to </body>
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
335
336
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator[Tuple[str, Any, str, str]]
    """Split an HTML page into cards, one per heading.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML page; only what is inside <body> is used

    Yields:
      (tag, attrs, heading, text) tuples.  heading is the heading's text with
      surrounding whitespace stripped.  text is the body with leading and
      trailing newlines removed and exactly one newline appended; leading
      spaces are kept.

    Raises:
      RuntimeError: from Splitter.end(), if the tags in the body don't
        balance.
    """
    body = ExtractBody(contents)

    groups = []
    sp = Splitter(heading_tags, groups)
    sp.feed(body)
    sp.end()

    for tag, attrs, heading_parts, parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are stripped, so leading indentation survives
        text = ''.join(parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
358
359
def HelpTopics(s):
    # type: (str) -> Iterator[Tuple[str, str, str]]
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    For each <h2 id="..."> the section name is the heading's content up to
    its first nested tag, and the block of text is taken from the next <code>
    element, whose class must start with 'language-chapter-links-'.
    """
    tag_lexer = htm8.TagLexer(s)

    pos = 0  # left edge of the token being examined
    it = html.ValidTokens(s)
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break

        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(pos, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.GetTagName() == 'h2':
                h2_start_right = end_pos

                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, (
                    'Expected id= in %r' % tag_lexer.WholeTagString())

                h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

                anchor_html = s[h2_start_right:h2_end_left]
                tag_pos = anchor_html.find('<')  # remove HTML link
                if tag_pos == -1:
                    section_name = anchor_html
                else:
                    section_name = anchor_html[:tag_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    it, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert (css_class.startswith('language-chapter-links-')
                       ), tag_lexer.WholeTagString()

                code_end_left, code_end_right = html.ReadUntilEndTag(
                    it, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

                # The reads above consumed tokens through </code>, so the
                # next token starts there, not at the end of the <h2> tag.
                # NOTE(review): assumes the second value returned by
                # ReadUntilEndTag is the right edge of the end tag, the same
                # way ReadUntilStartTag's second value is used above --
                # confirm.
                end_pos = code_end_right

        pos = end_pos
414
415
class DocNode(object):
    """A node in a tree that mirrors the doc structure, for visualization."""

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        """
        Args:
          name: label of the node
          attrs: tag attributes, for h2 and h3 links
          text: optional text attached to the node
        """
        # Starts out empty; callers append child DocNodes directly
        self.children = []

        self.name = name
        self.attrs = attrs
        self.text = text
425
426
def CardsFromIndex(sh, out_prefix):
    # type: (str, str) -> None
    """Write one help card per section of the index HTML read from stdin.

    Args:
      sh: name used as the card prefix, e.g. 'osh' or 'ysh'
      out_prefix: directory the cards are written to

    Each card is the file <out_prefix>/<sh>-<section_id>.  It holds the
    section name, a blank line, and then the section's text.
    """
    index_html = sys.stdin.read()

    num_sections = 0
    for section_id, section_name, text in HelpTopics(index_html):
        if 0:
            log('section_id = %r', section_id)
            log('section_name = %r', section_name)
            log('')

        # e.g. ysh-overview
        card_name = '%s-%s' % (sh, section_id)
        card_path = os.path.join(out_prefix, card_name)

        with open(card_path, 'w') as f:
            # section_id is printed dynamically
            f.write('%s\n\n' % section_name)
            f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
450
451
def CardsFromChapters(
        out_dir,  # type: str
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Split chapters into cards, and build a tree of their headings.

    Args:
      out_dir: directory that embedded cards are written to, or None to
        write nothing
      tag_level: heading tag that defines a card, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)

      topic_to_chap maps each topic ID to its chapter name, or to None when
      the heading carries the attribute oils-embed="1".

      root_node is a DocNode tree: root -> page -> <h2> -> <h3>.

    Raises:
      RuntimeError: if an <h3> is found before any <h2>, since there is no
        section to attach it to.
    """
    topic_to_chap = {}

    root_node = DocNode('/')
    cur_h2_node = None

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-')
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # An explicit id= is used only when there is exactly one;
            # otherwise the ID is derived from the heading text.
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (filename, topic_id))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            if 0:
                log('tag = %r', tag)
                log('topic_id = %r', topic_id)
                log('heading = %r', heading)
                log('text = %r', text[:20])

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as f:
                    f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    # Children of a page node are its <h2> sections
    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
526
527
class StrPool(object):
    """Assigns a C++ global variable name to each distinct string.

    Attributes:
      var_names: maps each string to its variable name, gStr1, gStr2, ...
      global_strs: one 'GLOBAL_STR(name, "literal")' line per distinct
        string, in the order the strings were first added
      unique_id: the number the next new string will get
    """

    def __init__(self):
        # type: () -> None
        self.var_names = {}
        self.global_strs = []
        self.unique_id = 1

    def Add(self, s):
        # type: (str) -> None
        """Register s.  Adding a string that is already known does nothing."""
        import json

        if s not in self.var_names:
            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            literal = json.dumps(s)
            self.global_strs.append('GLOBAL_STR(%s, %s)' % (name, literal))

            self.var_names[s] = name
550
551
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Generate C++ source that defines the topic dict.

    Args:
      topic_dict: maps topic to chapter name; a None value becomes nullptr
      header_f: receives the declaration of help_meta::TopicMetadata()
      cc_f: receives the global strings, the dict, and the definition
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # One pass: intern the key and value, then look up their variable names.
    # Strings are added to the pool in the order key, value, key, value, ...
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    global_defs = '\n'.join(pool.global_strs)
    keys_cc = ' COMMA '.join(key_names)
    vals_cc = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
  return gTopics;
}
}
''' % (global_defs, len(topic_dict), keys_cc, vals_cc))
597
598
def main(argv):
    # type: (List[str]) -> None
    """Dispatch on the action named by argv[1].

    Actions:
      cards-from-index SH OUT_PREFIX
        Read index HTML from stdin and write one card per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGES...
        Write embedded cards, plus the topic dict as Python and C++.
      ref-check PATHS...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: for an unknown action, or a ref-check path that is
        neither *.md nor *.html.
    """
    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':
        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as py_f:
            py_f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            py_f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

        with open(cc_prefix + '.h', 'w') as header_f:
            with open(cc_prefix + '.cc', 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
685
686
# Script entry point.
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        # RuntimeError is reported as a one-line message on stderr with exit
        # status 1.  Other exception types are not caught here.
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)