doctools/help

OILS / doctools / help_gen.py View on Github | oils.pub

685 lines, 357 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4	"""help_gen.py
5
6	Ideas for HTML -> ANSI converter:
7
8	- `ls` -> <code>ls</code> -> is reverse video?
9	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10	- could also be bright blue
11	- <pre> is also indented 4 spaces, like the markdown
12	- red X <span class="X">X</span>
13
14	- comments in code examples could be green?
15
16	What about:
17
18	- headings h1, h2, h3, h4
19	- Right now cards use reverse video. Centering didn't look great.
20
21	- <ul> - you could use a Unicode bullet here
22	- <ol>
23
24	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25	thing.
26	"""
27
28	import cStringIO
29	import HTMLParser
30	import os
31	import pprint
32	import re
33	import sys
34
35	from typing import AnyStr
36
37	from _devbuild.gen.htm8_asdl import h8_id
38	from doctools import html_lib
39	from doctools.util import log
40	from lazylex import html
41
42	#from typing import List, Tuple
43
# A section looks like [Name], optionally preceded by whitespace.  The name may
# contain ASCII letters, digits, spaces, and the punctuation  /  :  -
# (e.g. '/' for I/O).  Section names are turned into anchors.
SECTION_RE = re.compile(
    r'''
    \s*                     # optional leading whitespace
    \[                      # opening bracket
    ([a-zA-Z0-9 /:-]+)      # group 1: name; allow ysh:upgrade, byo-server-lib, etc.
    \]                      # closing bracket
    ''', re.VERBOSE)
53
# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' marker in front of the topic
#   2: the topic name (this is what gets linked)
#   3: optional trailer after the name, e.g. ' >>' or '()'
#   4: the whitespace that terminates the topic
#
# The two alternations below must be a bare '|'.  An escaped '\|' would match a
# literal pipe character, and then neither the trailer nor the terminator could
# match ordinary topic text.
TOPIC_RE = re.compile(
    r'''
    (X[ ])?                 # optional deprecation symbol X, then a single space
    @?                      # optional @array, e.g. @BASH_SOURCE

    ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                            # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                            # List/append, cmd/append

    ( [ ] [^a-zA-Z0-9 ] \S*
                            # trailer like >> or (make)
    |
      \(\)                  # optional () for func()
    )?

    (                       # order of these 2 clauses matters
      [ ]*\n                # spaces/newline
    |
      [ ]+                  # 1 or more spaces
    )
    ''', re.VERBOSE)
79
80
def _StringToHref(s):
    """Turn a string into an anchor name.

    The string is lower-cased (to match what doctools/cmark.py does) and each
    space is replaced with a hyphen.
    """
    lowered = s.lower()
    return lowered.replace(' ', '-')
84
85
# Opening tag printed in front of an 'X ' marker so that it renders dark red.
# Code that prints this is responsible for printing the closing '</span>'.
X_LEFT_SPAN = '<span style="color: darkred">'
87
88
class TopicHtmlRenderer(object):
    """Inserts links and highlighting into the lines of a topic index.

    Render() handles one line at a time.  For every line it processes, a dict
    describing the section and topics found is appended to debug_out.
    """

    # The run of spaces expected after the line prefix or after a [Section].
    _SPACES_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point at 'chap-<chapter>.html'
          debug_out: list that receives one dict per rendered line
          linkify_stop_col: 1-based column; a topic that ends past it stops
            the linkifying of the line.  -1 disables the limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, html.Output, Dict[str, Any]) -> None
        """Emit the markup for one TOPIC_RE match and record it in line_info.

        Args:
          m: a TOPIC_RE match object
          out: the html.Output for the line being rendered
          line_info: the per-line dict; (topic, impl) is appended to its
            'topics' list, where impl is False when the topic has an X marker
        """
        has_x = bool(m.group(1))
        if has_x:
            # Wrap the X marker in the red span
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, not has_x))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.  A line that starts with
        neither 'X ' nor a space is returned unchanged, and nothing is
        recorded for it.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.

        Raises:
          AssertionError: if no run of spaces follows the prefix / section.
        """
        starts_with_x = line.startswith('X ')
        # NOTE(review): this tests for a single leading space, yet the code
        # below skips 2 characters -- confirm whether the intended prefix is
        # two spaces.
        if not starts_with_x and not line.startswith(' '):
            return line

        f = cStringIO.StringIO()
        out = html.Output(line, f)

        if starts_with_x:
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')

        pos = 2  # position within line, just past the 2-character prefix
        section_impl = not starts_with_x

        # Highlight [Section] at the start of a line.
        section_name = None
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name,
                                       preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACES_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        # Keep matching topics until it doesn't match, or until a topic ends
        # past the stop column.
        stop_col = self.linkify_stop_col
        m = TOPIC_RE.match(line, m.end())
        while m:
            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if stop_col != -1 and col > stop_col:
                break

            self._PrintTopic(m, out, line_info)
            m = TOPIC_RE.match(line, m.end())

        out.PrintTheRest()
        return f.getvalue()
203
204
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream starting at each of the heading tags.

    For *-help.html.

    Each group appended to `out` is a tuple

        (tag, attrs, heading_parts, body_parts)

    where heading_parts holds the text seen inside the heading element, and
    body_parts holds the text seen after it, up to the next heading.  Text
    that comes before the first heading is discarded.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - inline <code></code>, bold, emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        """
        Args:
          heading_tags: tag names that start a new group, e.g. ['h2', 'h3']
          out: list that finished groups are appended to
        """
        HTMLParser.HTMLParser.__init__(self)
        self.heading_tags = heading_tags
        self.out = out

        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]
        self.in_heading = False

        # Start tags seen minus end tags seen; checked in end()
        self.indent = 0

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        """Debug logging, indented by nesting depth.  Disabled by default."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _AppendText(self, s):
        """Add text to the heading or the body of the current group."""
        if self.in_heading:
            self.cur_group[2].append(s)
        elif self.cur_group:
            self.cur_group[3].append(s)

    def handle_starttag(self, tag, attrs):
        # type: (AnyStr, List[Tuple[AnyStr, AnyStr]]) -> None
        if tag in self.heading_tags:
            self.in_heading = True
            # A new heading finishes the previous group, if there was one
            if self.cur_group:
                self.out.append(self.cur_group)

            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (AnyStr) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (AnyStr) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').

        The entity is looked up in html.CHAR_ENTITY; an unknown name raises
        KeyError.
        """
        self._AppendText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (AnyStr) -> None
        self.log('data %r', data)
        self._AppendText(data)

    def end(self):
        # type: () -> None
        """Flush the last group.  Call this after feeding all the input.

        Raises:
          RuntimeError: if start and end tags did not balance.
        """
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent != 0:
            raise RuntimeError(
                'Unbalanced HTML tags: indent=%d, cur_group=%s' %
                (self.indent, self.cur_group))
296
297
def ExtractBody(s):
    # type: (str) -> str
    """Extract what's in between <body></body>

    The splitter needs balanced tags, and what's in <head> isn't
    balanced.

    Only the first <body> start tag is considered; scanning stops once its
    end tag has been found.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tok_start = 0  # where the current token begins
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # tok_end is right after <body>
                out.SkipTo(tok_end)

                # This consumes tokens from the same iterator
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')

                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
331
332
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Split an HTML document into cards, one per heading.

    Args:
      heading_tags: tag names that start a card, e.g. ['h2', 'h3', 'h4']
      contents: the whole HTML document; only its <body> is examined

    Yields:
      (tag, attrs, heading, text) tuples.  heading is stripped of surrounding
      whitespace; text has leading and trailing newlines removed, and then
      exactly one trailing newline added.
    """
    body = ExtractBody(contents)

    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(body)
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are stripped, so leading spaces are kept
        text = ''.join(body_parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
354
355
def HelpTopics(s):
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    Each group comes from an <h2 id="..."> element and the <code> element
    that follows it.  The <code> element must have a class starting with
    'language-chapter-links-'.
    """
    tag_lexer = html.TagLexer(s)

    tok_start = 0
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = tok_end

                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                # The helpers below consume tokens from the same iterator
                h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

                # The section name is the heading text up to the first tag
                # inside it, i.e. the HTML link is removed
                anchor_html = s[h2_start_right:h2_end_left]
                tag_pos = anchor_html.find('<')
                if tag_pos == -1:
                    section_name = anchor_html
                else:
                    section_name = anchor_html[:tag_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    tokens, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

        # NOTE(review): after a section is yielded, tok_end is still the end
        # of the <h2> start tag, not the end of the tokens consumed above.
        # That looks harmless unless the very next token is a start tag --
        # confirm.
        tok_start = tok_end
408
409
class DocNode(object):
    """A tree node, used to visualize doc structure."""

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        """
        Args:
          name: label for this node
          attrs: HTML attributes as (name, value) pairs, for h2 and h3 links
          text: body text attached to the node, if any
        """
        # Every node starts out as a leaf; callers append to this list
        self.children = []

        self.name = name
        self.attrs = attrs
        self.text = text
419
420
def CardsFromIndex(sh, out_prefix):
    """Write one card file per section of the index HTML read from stdin.

    Args:
      sh: prefix for the card file names, e.g. 'osh' or 'ysh'
      out_prefix: directory to write the cards to

    Each file is named '<sh>-<section_id>' (e.g. ysh-overview).  It contains
    the section name, a blank line, and then the section text.
    """
    num_sections = 0
    for section_id, section_name, text in HelpTopics(sys.stdin.read()):
        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview
        card_path = os.path.join(out_prefix, topic)

        with open(card_path, 'w') as card_f:
            # section_id is printed dynamically, so only the name goes here
            card_f.write('%s\n\n' % section_name)
            card_f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
443
444
def CardsFromChapters(
        out_dir,  # type: str
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Split chapters into cards, and build a tree of their headings.

    Args:
      out_dir: directory that embedded cards are written to, or None to not
        write any files
      tag_level: the heading tag whose cards are recorded, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)

      topic_to_chap maps each topic ID to its chapter name, or to None when
      the card has the attribute oils-embed="1".

      root_node is a DocNode tree: root -> pages -> h2 nodes -> h3 nodes.
    """
    topic_to_chap = {}

    root_node = DocNode('/')

    # NOTE(review): this is not reset for each page.  An <h3> that comes
    # before any <h2> in the first page fails on None, and in later pages it
    # is attached to the previous page's last <h2> -- confirm that chapters
    # always start with an <h2>.
    cur_h2_node = None

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        stem, _ = os.path.splitext(filename)
        assert stem.startswith('chap-')
        chapter_name = stem[len('chap-'):]

        page_node = DocNode(filename)

        for tag, attrs, heading, text in SplitIntoCards(['h2', 'h3', 'h4'],
                                                        contents):
            # Use the id= attribute when there is exactly one non-empty one;
            # otherwise derive the ID from the heading text
            id_values = [v for k, v in attrs if k == 'id']
            if len(id_values) == 1 and id_values[0]:
                topic_id = id_values[0]
            else:
                topic_id = html_lib.PrettyHref(heading,
                                               preserve_anchor_case=True)

            if tag == 'h2':
                cur_h2_node = DocNode(topic_id, attrs=attrs)
                page_node.children.append(cur_h2_node)
            elif tag == 'h3':
                # attach text so we can see which topics have empty bodies
                h3_node = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3_node)

            if tag != tag_level:
                continue  # we only care about h3 now

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            if embed:
                topic_to_chap[topic_id] = None
            else:
                topic_to_chap[topic_id] = chapter_name

        root_node.children.append(page_node)

    # Count the h2 nodes across all pages
    num_sections = 0
    for page in root_node.children:
        num_sections += len(page.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
519
520
class StrPool(object):
    """Assigns a C++ global variable name to each distinct string.

    Attributes:
      var_names: dict of string -> variable name, like 'gStr1'
      global_strs: list of 'GLOBAL_STR(name, literal)' declarations, in the
        order that the strings were first added
      unique_id: the number that the next new string will get
    """

    def __init__(self):
        # type: () -> None
        self.var_names = {}
        self.global_strs = []
        self.unique_id = 1

    def Add(self, s):
        # type: (str) -> None
        """Register a string.  Adding the same string again does nothing."""
        if s not in self.var_names:
            import json

            var_name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            literal = json.dumps(s)
            self.global_strs.append('GLOBAL_STR(%s, %s)' %
                                    (var_name, literal))

            self.var_names[s] = var_name
543
544
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Write the topic dict as C++: a header, and a source file.

    Args:
      topic_dict: topic -> chapter name, or None.  A None value is written as
        nullptr.
      header_f: file to write the declaration of help_meta::TopicMetadata() to
      cc_f: file to write the global strings, the global dict, and the
        definition of TopicMetadata() to
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # Register every key and value, and collect the variable names in dict
    # order, so that keys and values line up
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    global_decls = '\n'.join(pool.global_strs)
    keys_str = ' COMMA '.join(key_names)
    vals_str = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % (global_decls, len(topic_dict), keys_str, vals_str))
590
591
def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    Usage:
      help_gen.py cards-from-index SH OUT_PREFIX            (HTML on stdin)
      help_gen.py cards-from-chapters OUT_DIR PY_OUT CC_PREFIX [PAGE...]
      help_gen.py ref-check [toc-*.md | chap-*.html]...

    Args:
      argv: the full argument list, including the program name at argv[0]

    Raises:
      RuntimeError: if the action is missing or invalid, if an action is
        missing required arguments, or if ref-check is given a file that is
        neither .md nor .html.
    """
    # Report a missing action as RuntimeError rather than IndexError, so the
    # caller can print a one-line FATAL message instead of a traceback.
    if len(argv) < 2:
        raise RuntimeError(
            'Expected an action argument: cards-from-index, '
            'cards-from-chapters, or ref-check')
    action = argv[1]

    if action == 'cards-from-index':
        if len(argv) < 4:
            raise RuntimeError('Usage: %s cards-from-index SH OUT_PREFIX' %
                               argv[0])
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':
        if len(argv) < 5:
            raise RuntimeError(
                'Usage: %s cards-from-chapters OUT_DIR PY_OUT CC_PREFIX '
                '[PAGE...]' % argv[0])
        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # This is Python source, so the function body must be indented
            f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML.  The local is NOT named 'html', so
                # that it doesn't shadow the module imported at the top of
                # the file.
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-* blocks.  Only the debug output is
                # used; the highlighted HTML is discarded.
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        # out_dir is None, so no card files are written
        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
678
679
if __name__ == '__main__':
    # RuntimeError is the expected kind of failure: report it on stderr in
    # one line and exit with status 1.  Other exceptions propagate.
    try:
        main(sys.argv)
    except RuntimeError as err:
        print('FATAL: %s' % err, file=sys.stderr)
        sys.exit(1)