OILS / doctools / help_gen.py View on Github | oilshell.org

659 lines, 351 significant
1#!/usr/bin/env python2
2from __future__ import print_function
3"""help_gen.py
4
5Ideas for HTML -> ANSI converter:
6
7- `ls` -> <code>ls</code> -> is reverse video?
8- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
9 - could also be bright blue
10- <pre> is also indented 4 spaces, like the markdown
11- red X <span class="X">X</span>
12
13- comments in code examples could be green?
14
15What about:
16
17- headings h1, h2, h3, h4
18 - Right now cards use reverse video. Centering didn't look great.
19
20- <ul> - you could use a Unicode bullet here
21- <ol>
22
23Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
24thing.
25"""
26
27import cStringIO
28import HTMLParser
29import os
30import pprint
31import re
32import sys
33
34from doctools import html_lib
35from doctools.util import log
36from lazylex import html
37
38#from typing import List, Tuple
39
# Section names appear in square brackets, e.g. [I/O].  They may contain
# letters, digits, spaces, and the punctuation '/', ':' and '-'.  They are
# turned into anchors.
SECTION_RE = re.compile(
    r'''
  \s*
  \[
  ([a-zA-Z0-9 /:-]+)   # allow ysh:upgrade, byo-server-lib, etc.
  \]
''', re.VERBOSE)

# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' deprecation marker
#   2: the topic name (at least two characters)
#   3: optional trailer, e.g. ' >>' or '()'
#   4: the whitespace that terminates the topic
TOPIC_RE = re.compile(
    r'''
  (X[ ])?             # optional deprecation symbol X, then a single space
  @?                  # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                      # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                      # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                      # trailer like >> or (make)
  |
    \(\)              # optional () for func()
  )?

  (                   # order of these 2 clauses matters
    [ ]*\n            # spaces/newline
  |
    [ ]+              # 1 or more spaces
  )
''', re.VERBOSE)
75
76
77def _StringToHref(s):
78 # lower case to match what doctools/cmark.py does
79 return s.lower().replace(' ', '-')
80
81
# Opening tag printed before the 'X' deprecation marker so that it is shown in
# dark red; callers print the matching '</span>' themselves.
X_LEFT_SPAN = '<span style="color: darkred">'
83
84
class TopicHtmlRenderer(object):
    """Insert links into one line of a help index at a time.

    [Section] names and topic names are wrapped in <a> tags that point into
    chap-<chapter>.html, and the 'X' marker is wrapped in a dark red <span>.
    """

    # Whitespace after a [Section], or the leading whitespace of a line.
    # Compiled once for the class rather than on every Render() call.
    _SPACE_1 = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        """
        Args:
          chapter: chapter name; links point at 'chap-<chapter>.html'
          debug_out: list; Render() appends one dict per rendered line
          linkify_stop_col: 1-based column; a topic that ends past it is not
            linked, and neither is anything after it.  -1 means no limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        """Emit the markup for a single TOPIC_RE match.

        Args:
          m: match object from TOPIC_RE
          out: the html.Output for the current line
          line_info: dict whose 'topics' list receives (topic, topic_impl)
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.  As a side effect, a dict with
        keys 'section', 'impl' and 'topics' is appended to self.debug_out for
        every line that is not returned unchanged.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor a space is returned unchanged.
        """
        # Decide whether the line is handled at all before allocating the
        # output buffer.
        # NOTE(review): the second test checks a 1-character prefix, but pos
        # then advances by 2 -- confirm the intended indent is two spaces.
        if line.startswith('X '):
            section_impl = False
        elif line.startswith(' '):
            section_impl = True
        else:
            return line

        f = cStringIO.StringIO()
        out = html.Output(line, f)

        if not section_impl:
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')

        pos = 2  # position within line, past the 2-column prefix

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name,
                                       preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACE_1.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
196
197
class Splitter(HTMLParser.HTMLParser):
    """Break an HTML stream into groups, one per heading tag.

    For *-help.html.

    Each group is a tuple (tag, attrs, heading_parts, body_parts).  A group is
    appended to `out` when the next heading starts, or when end() is called.
    Text seen before the first heading is dropped.

    TODO: Rewrite with this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - ** ** for bold
      - * * for emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        self.indent = 0  # open tags minus close tags seen so far
        self.in_heading = False
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]

    def log(self, msg, *args):
        """Debug logging, indented by tag depth.  Disabled by the 'if 0'."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _AddText(self, s):
        """Append s to the heading parts or body parts of the current group."""
        if self.in_heading:
            self.cur_group[2].append(s)
        elif self.cur_group:
            self.cur_group[3].append(s)

    def handle_starttag(self, tag, attrs):
        if tag in self.heading_tags:
            # A new heading closes the previous group, if any.
            if self.cur_group:
                self.out.append(self.cur_group)
            self.cur_group = (tag, attrs, [], [])
            self.in_heading = True

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # Looked up unconditionally, so an unknown name raises KeyError even
        # when there is no current group.
        self._AddText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        self.log('data %r', data)
        self._AddText(data)

    def end(self):
        """Flush the last group, then check that the tags were balanced.

        Raises:
          RuntimeError: if more tags were opened than closed, or vice versa.
        """
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
282
283
def ExtractBody(s):
    """Return the part of an HTML document between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced, so
    everything outside the body is skipped.

    Args:
      s: the HTML document, as a string

    Returns:
      Whatever was written to the output buffer -- presumably the empty string
      when there is no <body> start tag, since nothing is printed in that
      case.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    # The same token iterator is handed to ReadUntilEndTag() below, which
    # consumes tokens from it.
    tokens = html.ValidTokens(s)

    tag_start = 0  # where the current token begins
    for tok_id, end_pos in tokens:
        if tok_id == html.StartTag:
            tag_lexer.Reset(tag_start, end_pos)
            if tag_lexer.TagName() == 'body':
                out.SkipTo(end_pos)  # right after <body>
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tag_start = end_pos

    return buf.getvalue()
316
317
def SplitIntoCards(heading_tags, contents):
    """Yield one card per heading found in the <body> of an HTML page.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: the whole HTML page, as a string

    Yields:
      (tag, attrs, heading, text) tuples.  heading has surrounding whitespace
      stripped; text has leading and trailing newlines removed and ends with
      exactly one newline.
    """
    body = ExtractBody(contents)

    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(body)
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are stripped, so leading spaces are kept.
        text = '%s\n' % ''.join(body_parts).strip('\n')

        yield tag, attrs, heading, text
338
339
def HelpTopics(s):
    """Find each <h2> section and the <code> block that follows it.

    Args:
      s: a rendered toc-{osh,ysh}.html, as a string

    Yields:
      (section_id, section_name, block of text) tuples.  section_id is the raw
      id= attribute of the <h2>.  section_name is the heading content up to
      the first '<', which removes a trailing HTML link.
    """
    tag_lexer = html.TagLexer(s)

    # The helpers below consume tokens from this same iterator.
    tokens = html.ValidTokens(s)

    tag_start = 0  # where the current token begins
    for tok_id, end_pos in tokens:
        if tok_id == html.StartTag:
            tag_lexer.Reset(tag_start, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = end_pos

                # Read the attribute before the lexer is moved to other tags.
                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

                anchor_html = s[h2_start_right:h2_end_left]
                lt_pos = anchor_html.find('<')  # remove HTML link
                section_name = (anchor_html if lt_pos == -1 else
                                anchor_html[:lt_pos].strip())

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    tokens, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

        tag_start = end_pos
392
393
class DocNode(object):
    """One node of a tree that mirrors the doc structure, for visualization."""

    def __init__(self, name, attrs=None, text=None):
        # A fresh list per node; callers append child DocNodes to it.
        self.children = []
        self.text = text
        self.attrs = attrs  # for h2 and h3 links
        self.name = name
402
403
def CardsFromIndex(sh, out_prefix):
    """Write one help card file per <h2> section of the index read from stdin.

    Args:
      sh: prefix for the file names; each file is named '<sh>-<section_id>',
        e.g. ysh-overview
      out_prefix: directory that the files are written into
    """
    index_html = sys.stdin.read()

    num_sections = 0
    for section_id, section_name, text in HelpTopics(index_html):
        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview
        path = os.path.join(out_prefix, topic)

        with open(path, 'w') as f:
            # section_id is printed dynamically, so only the name goes here
            f.write('%s\n\n' % section_name)
            f.write(text)

        log(' Wrote %s', path)
        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
426
427
def CardsFromChapters(out_dir, tag_level, paths):
    """Split chapter pages into cards, and build a tree of their headings.

    Args:
      out_dir: directory to write embedded cards into, or None to write
        nothing
      tag_level: only cards with this heading tag (e.g. 'h3') are recorded in
        the returned dict and written to out_dir
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)
      topic_to_chap: dict of topic_id -> chapter name, or None for cards that
        carry the oils-embed="1" attribute
      root_node: DocNode tree of pages -> h2 nodes -> h3 nodes

    Raises:
      RuntimeError: if an <h3> is seen before any <h2>
    """
    topic_to_chap = {}

    root_node = DocNode('/')

    # NOTE(review): this is not reset per page, so an <h3> that precedes the
    # first <h2> of a later page is attached to the last <h2> of the previous
    # page -- confirm whether that is intended.
    cur_h2_node = None

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-')
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # Use the id= attribute only when there is exactly one.
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    # Without this check the append below fails with an
                    # AttributeError on None that names neither file nor topic.
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (path, topic_id))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                # A separate name, so the loop variable `path` is not clobbered.
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as f:
                    f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    # Children of each page node are its h2 nodes.
    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
497
498
class StrPool(object):
    """Give each distinct string a global variable name and a declaration.

    Attributes:
      var_names: dict of string -> variable name, e.g. 'gStr1'
      global_strs: list of 'GLOBAL_STR(name, literal)' lines, in the order
        the strings were first added
      unique_id: the number that the next variable name will use
    """

    def __init__(self):
        self.unique_id = 1
        self.global_strs = []
        self.var_names = {}

    def Add(self, s):
        """Register s; adding a string that is already registered is a no-op."""
        if s not in self.var_names:
            import json

            var_name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (var_name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = var_name
519
520
def WriteTopicDict(topic_dict, header_f, cc_f):
    """Write the topic dict as a C++ header and source file.

    Args:
      topic_dict: dict of topic name -> chapter name or None.  A None value is
        written as nullptr.
      header_f: file object for the header, which declares TopicMetadata()
      cc_f: file object for the source, which defines the global strings, the
        gTopics dict, and TopicMetadata()
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # One pass: register each key and value in the pool, then record the
    # variable names.  Strings enter the pool in key, value order per item.
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    global_decls = '\n'.join(pool.global_strs)
    keys_str = ' COMMA '.join(key_names)
    vals_str = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
 return gTopics;
}
}
''' % (global_decls, len(topic_dict), keys_str, vals_str))
565
566
def main(argv):
    """Dispatch on the action named by argv[1].

    Actions:
      cards-from-index SH OUT_PREFIX
        Read an index page from stdin and write one card per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGES...
        Write embedded cards, plus the topic dict as Python and C++.
      ref-check PATHS...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: for an unknown action, or for a ref-check path that is
        neither .md nor .html
    """
    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':
        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as py_f:
            py_f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            py_f.write('''

from typing import Dict

def TopicMetadata():
 # type: () -> Dict[str, str]
 return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f, open(cc_path, 'w') as cc_f:
            WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*
                box_nodes = []
                toc_html = oils_doc.HighlightCode(toc_html,
                                                  None,
                                                  debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        # Only the tree is needed here, and nothing is written (out_dir=None).
        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
652
653
if __name__ == '__main__':
    # Script entry point.  A RuntimeError raised by main() is reported as a
    # one-line FATAL message on stderr, and the process exits with status 1.
    # Any other exception propagates.
    try:
        main(sys.argv)
    except RuntimeError as e:
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)