OILS / doctools / help_gen.py View on Github | oils.pub

683 lines, 356 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4"""help_gen.py
5
6Ideas for HTML -> ANSI converter:
7
8- `ls` -> <code>ls</code> -> is reverse video?
9- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10 - could also be bright blue
11- <pre> is also indented 4 spaces, like the markdown
12- red X <span class="X">X</span>
13
14- comments in code examples could be green?
15
16What about:
17
18- headings h1, h2, h3, h4
19 - Right now cards use reverse video. Centering didn't look great.
20
21- <ul> - you could use a Unicode bullet here
22- <ol>
23
24Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25thing.
26"""
27
28import cStringIO
29import HTMLParser
30import os
31import pprint
32import re
33import sys
34
35from _devbuild.gen.htm8_asdl import h8_id
36from doctools import html_lib
37from doctools.util import log
38from lazylex import html
39
40#from typing import List, Tuple
41
# Section headers look like "[Name]".  The name may contain ASCII letters,
# digits, spaces, '/', ':' and '-'.  Leading whitespace before the '[' is
# skipped.  Group 1 is the name without the brackets; callers turn it into an
# anchor.
SECTION_RE = re.compile(
    r'''
  \s*
  \[
  ([a-zA-Z0-9 /:-]+)  # allow ysh:upgrade, byo-server-lib, etc.
  \]
''', re.VERBOSE)

# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional "X " deprecation marker
#   2: the topic name (at least 2 characters)
#   3: optional trailer, e.g. " >>" or "()"
#   4: the whitespace that terminates the topic
TOPIC_RE = re.compile(
    r'''
  (X[ ])?       # optional deprecation symbol X, then a single space
  @?            # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                # trailer like >> or (make)
  |
    \(\)        # optional () for func()
  )?

  (             # order of these 2 clauses matters
    [ ]*\n      # spaces/newline
  |
    [ ]+        # 1 or more spaces
  )
''', re.VERBOSE)
77
78
def _StringToHref(s):
    """Return s lower-cased with every space replaced by a '-'.

    Lower case is used to match what doctools/cmark.py does.
    """
    words = s.lower().split(' ')
    return '-'.join(words)
82
83
# Opening tag printed before an "X " marker so that it is shown in dark red.
# The code that prints it is responsible for emitting the closing '</span>'.
X_LEFT_SPAN = '<span style="color: darkred">'
85
86
class TopicHtmlRenderer(object):
    """Adds links and "X" highlighting to lines of a chapter's topic index.

    Each call to Render() handles one line.  A line is only processed when it
    starts with a 2-character prefix: either 'X ' or two spaces.  After the
    prefix comes an optional [Section] header and then a run of topics.
    """

    # Whitespace after the section, or leading whitespace.  Compiled once here
    # instead of on every Render() call.
    _SPACES_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point into 'chap-<chapter>.html'
          debug_out: list that receives one dict of structured data per
            rendered line
          linkify_stop_col: topics whose 1-based end column is greater than
            this are not linked.  -1 disables the limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, html.Output, Dict[str, Any]) -> None
        """Emit HTML for one TOPIC_RE match and record it in line_info.

        Args:
          m: a TOPIC_RE match object
          out: html.Output positioned at or before the match
          line_info: dict for this line; (topic, implemented) is appended to
            its 'topics' list
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.  A dict with keys 'section',
        'impl' and 'topics' is appended to self.debug_out for every line that
        is processed.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged.
        """
        # Both accepted prefixes are exactly 2 characters long, which is what
        # the fixed offset below relies on.  Checking the prefix first avoids
        # building an output buffer for lines that are passed through.
        if line.startswith('X '):
            section_impl = False
        elif line.startswith('  '):
            section_impl = True
        else:
            return line

        f = cStringIO.StringIO()
        out = html.Output(line, f)

        if not section_impl:
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')

        pos = 2  # position within line, just past the prefix

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACES_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
201
202
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream starting at each of the heading tags.

    For *-help.html.

    Each group is a tuple (tag, attrs, heading_parts, body_parts).  A new group
    starts at every tag listed in heading_tags; finished groups are appended
    to the `out` list.  Text seen before the first heading is dropped.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - ** ** for bold
      - * * for emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        # Depth of currently open tags; must be back at 0 in end()
        self.indent = 0

        self.in_heading = False
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        # Debug tracing, disabled by default.
        if 0:
            log(self.indent * ' ' + msg, *args)

    def _AddText(self, s):
        """Append s to the heading or the body of the current group."""
        if self.in_heading:
            self.cur_group[2].append(s)
        elif self.cur_group:
            self.cur_group[3].append(s)

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        if tag in self.heading_tags:
            self.in_heading = True
            # A heading closes the previous group and opens a new one
            if self.cur_group:
                self.out.append(self.cur_group)
            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (str) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').

        The entity is replaced by its entry in html.CHAR_ENTITY.
        """
        self._AddText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (str) -> None
        self.log('data %r', data)
        self._AddText(data)

    def end(self):
        # type: () -> None
        """Flush the last group; raise RuntimeError on unbalanced tags."""
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
294
295
def ExtractBody(s):
    # type: (str) -> str
    """Return the part of document s that lies between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    Only the first <body> start tag is considered.  Nothing is written to the
    result buffer when there is no such tag.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tokens = html.ValidTokens(s)
    pos = 0  # start of the current token
    for tok_id, end_pos in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'body':
                # Drop everything up to and including <body> ...
                out.SkipTo(end_pos)

                # ... and copy what comes before </body>.  This consumes
                # tokens from the same iterator.
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        pos = end_pos

    return buf.getvalue()
329
330
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Yield one (tag, attrs, heading, text) tuple per heading in an HTML doc.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML document; only its <body> is examined

    The heading is stripped of surrounding whitespace.  The text has leading
    and trailing newlines removed and then ends with exactly one newline;
    other leading whitespace is kept.
    """
    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(ExtractBody(contents))
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()
        body = ''.join(body_parts).strip('\n')
        yield tag, attrs, heading, body + '\n'
352
353
def HelpTopics(s):
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    For every <h2 id="..."> this reads the heading up to its first nested tag
    as the section name, then takes the next <code> element, whose class must
    start with 'language-chapter-links-', and converts its contents to text.

    An <h2> without an id=, or a <code> without the expected class, fails an
    assertion.
    """
    tag_lexer = html.TagLexer(s)

    pos = 0  # start of the current token
    it = html.ValidTokens(s)
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break

        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(pos, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = end_pos

                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

                anchor_html = s[h2_start_right:h2_end_left]
                tag_pos = anchor_html.find('<')  # remove HTML link
                if tag_pos == -1:
                    section_name = anchor_html
                else:
                    section_name = anchor_html[:tag_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    it, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, code_end_right = html.ReadUntilEndTag(
                    it, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

                # The helpers above consumed tokens from `it`, so the end of
                # the <h2> start tag is no longer where the next token begins.
                # Resync to the end of </code>; otherwise a start tag directly
                # after it would be lexed over a wrong span.
                # NOTE(review): assumes ReadUntilEndTag() returns
                # (left, right) positions of the end tag -- confirm in
                # lazylex/html.py.
                end_pos = code_end_right

        pos = end_pos
406
407
class DocNode(object):
    """A node in a tree that mirrors the doc structure, for visualization.

    Attributes:
      name: label of the node
      attrs: HTML attributes, for h2 and h3 links; may be None
      text: body text belonging to the node; may be None
      children: list of child nodes, initially empty
    """

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        self.children = []  # a fresh list per node
        self.name, self.attrs, self.text = name, attrs, text
417
418
def CardsFromIndex(sh, out_prefix):
    """Write one help card per section of the TOC page read from stdin.

    Args:
      sh: prefix for the card names, 'osh' or 'ysh'
      out_prefix: directory that receives the files '<sh>-<section_id>'

    Each file holds the section name, a blank line, and the section's text.
    """
    num_sections = 0
    for section_id, section_name, text in HelpTopics(sys.stdin.read()):
        # e.g. ysh-overview
        card_path = os.path.join(out_prefix, '%s-%s' % (sh, section_id))

        with open(card_path, 'w') as card_f:
            # section_id is printed dynamically
            card_f.write('%s\n\n' % section_name)
            card_f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
441
442
def CardsFromChapters(
        out_dir,  # type: Optional[str]
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Collect help topics from chapter pages, optionally writing cards.

    Args:
      out_dir: directory for embedded cards, or None to write no files
      tag_level: heading tag that defines a topic, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)
      topic_to_chap maps a topic ID to its chapter name, or to None when the
      heading has the attribute oils-embed="1".
      root_node is a DocNode tree: one child per page, with <h2> nodes below
      the page and <h3> nodes below the <h2>.

    Raises:
      RuntimeError: if an <h3> is found before any <h2>
    """
    topic_to_chap = {}

    root_node = DocNode('/')
    cur_h2_node = None

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-')
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # Use the id= attribute when there is exactly one; otherwise
            # derive the ID from the heading text.
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    # Previously an AttributeError on None
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (path, topic_id))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
517
518
class StrPool(object):
    """Assigns a C++ global variable name to each distinct string.

    Attributes:
      var_names: dict from string to its variable name, e.g. 'gStr1'
      global_strs: GLOBAL_STR(...) declarations, in order of first Add()
      unique_id: number used for the next variable name
    """

    def __init__(self):
        # type: () -> None
        self.unique_id = 1
        self.global_strs = []
        self.var_names = {}

    def Add(self, s):
        # type: (str) -> None
        """Register s, unless it was added before."""
        if s not in self.var_names:
            import json

            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = name
541
542
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Write topic_dict as a C++ header and source file.

    Args:
      topic_dict: topic name -> chapter name, or None
      header_f: receives the declaration of help_meta::TopicMetadata()
      cc_f: receives the string globals, the gTopics dict, and the definition
        of TopicMetadata().  A None value is written as nullptr.
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # Single pass: strings are pooled in the order key, value, key, value ...
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    cc_template = '''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
 return gTopics;
}
}
'''
    str_decls = '\n'.join(pool.global_strs)
    keys = ' COMMA '.join(key_names)
    vals = ' COMMA '.join(val_names)
    cc_f.write(cc_template % (str_decls, len(topic_dict), keys, vals))
588
589
def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    Actions:
      cards-from-index SH OUT_PREFIX
        Read a rendered TOC page from stdin and write one card per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX [chap-*.html ...]
        Write embedded cards, plus the topic dict as Python (PY_OUT) and C++
        (CC_PREFIX.h and CC_PREFIX.cc).
      ref-check [toc-*.md | chap-*.html ...]
        Compare the topics in the TOC pages with those in the chapters.

    Raises:
      RuntimeError: on a missing or invalid action, missing arguments, or an
        unexpected file name
    """
    if len(argv) < 2:
        raise RuntimeError(
            'Expected an action: cards-from-index, cards-from-chapters, '
            'or ref-check')
    action = argv[1]

    if action == 'cards-from-index':
        if len(argv) < 4:
            raise RuntimeError('Usage: %s SH OUT_PREFIX' % action)
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':
        if len(argv) < 5:
            raise RuntimeError(
                'Usage: %s OUT_DIR PY_OUT CC_PREFIX [PAGE ...]' % action)
        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            f.write('''

from typing import Dict

def TopicMetadata():
 # type: () -> Dict[str, str]
 return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML.  The local is not called `html`, so
                # it doesn't hide the lazylex `html` module.
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*
                # Only box_nodes is needed; the returned HTML is discarded.
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
676
677
if __name__ == '__main__':
    # RuntimeError is the expected failure mode: report it in one line on
    # stderr and exit with status 1.  Other exceptions propagate.
    try:
        main(sys.argv)
    except RuntimeError as err:
        sys.stderr.write('FATAL: %s\n' % err)
        sys.exit(1)