doctools/help

OILS / doctools / help_gen.py View on Github | oils.pub

686 lines, 358 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4	"""help_gen.py
5
6	Ideas for HTML -> ANSI converter:
7
8	- `ls` -> <code>ls</code> -> is reverse video?
9	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10	- could also be bright blue
11	- <pre> is also indented 4 spaces, like the markdown
12	- red X <span class="X">X</span>
13
14	- comments in code examples could be green?
15
16	What about:
17
18	- headings h1, h2, h3, h4
19	- Right now cards use reverse video. Centering didn't look great.
20
21	- <ul> - you could use a Unicode bullet here
22	- <ol>
23
24	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25	thing.
26	"""
27
28	import cStringIO
29	import HTMLParser
30	import os
31	import pprint
32	import re
33	import sys
34
35	from typing import AnyStr
36
37	from _devbuild.gen.htm8_asdl import h8_id
38	from data_lang import htm8
39	from doctools import html_lib
40	from doctools.util import log
41	from lazylex import html
42
43	#from typing import List, Tuple
44
# Sections have alphabetical characters, spaces, and '/' for I/O.  They are
# turned into anchors.
#
# Group 1 is the section name between the square brackets.
SECTION_RE = re.compile(
    r'''
  \s*
  \[
  ([a-zA-Z0-9 /:-]+)  # allow ysh:upgrade, byo-server-lib, etc.
  \]
''', re.VERBOSE)

# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' deprecation marker
#   2: the topic name (this is what gets linked)
#   3: optional trailer, e.g. ' >>' or '()'
#   4: the whitespace that terminates the topic
#
# NOTE: the alternations below must be a bare '|'.  An escaped '\|' is a
# literal pipe character, which makes the trailer and terminator groups
# require a '|' in the input, so ordinary topics would never match.
TOPIC_RE = re.compile(
    r'''
  (X[ ])?           # optional deprecation symbol X, then a single space
  @?                # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                    # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                    # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                    # trailer like >> or (make)
  |
    \(\)            # optional () for func()
  )?

  (                 # order of these 2 clauses matters
    [ ]*\n          # spaces/newline
  |
    [ ]+            # 1 or more spaces
  )
''', re.VERBOSE)
80
81
def _StringToHref(s):
    """Return an anchor name for s: lower-cased, with each space as a dash.

    Lower case is used to match what doctools/cmark.py does.
    """
    lowered = s.lower()
    return '-'.join(lowered.split(' '))
85
86
# Opening tag printed before an 'X' marker so that it renders in dark red.
# The code that prints it is responsible for emitting the closing '</span>'.
X_LEFT_SPAN = '<span style="color: darkred">'
88
89
class TopicHtmlRenderer(object):
    """Insert links and 'X' highlighting into lines of a topic index.

    Each section and topic name found on a line is wrapped in an <a> that
    points into chap-<chapter>.html.  A summary of what was found on every
    rendered line is appended to debug_out.
    """

    # Whitespace after a [Section], or leading whitespace.  Compiled once here
    # rather than on every Render() call.
    _SPACES_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point to 'chap-<chapter>.html'
          debug_out: list that receives one dict per rendered line, with keys
            'section', 'impl' and 'topics'
          linkify_stop_col: 1-based column after which topics are no longer
            linked, or -1 for no limit
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, htm8.Output, Dict[str, Any]) -> None
        """Emit the markup for a single TOPIC_RE match.

        Args:
          m: match object from TOPIC_RE
          out: htm8.Output for the line being rendered
          line_info: debug dict for this line; (topic, topic_impl) is appended
            to its 'topics' list
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged, and nothing is appended
          to debug_out for it.

        Raises:
          AssertionError: if no spaces follow the prefix / [Section].
        """
        # Both recognized prefixes are 2 characters wide, which is why
        # scanning resumes at position 2 below.  Checking for a single leading
        # space here would skip over a character that was never examined.
        if line.startswith('X '):
            section_impl = False
        elif line.startswith('  '):
            section_impl = True
        else:
            return line

        f = cStringIO.StringIO()
        out = htm8.Output(line, f)

        if not section_impl:
            # Make the leading X red
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')

        pos = 2  # position within line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACES_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
204
205
class Splitter(HTMLParser.HTMLParser):
    """Break an HTML stream into groups, one per heading tag.

    For *-help.html.

    Every group appended to `out` is a tuple

        (tag, attrs, heading_parts, body_parts)

    where heading_parts holds the text seen inside the heading element and
    body_parts holds the text seen after it, up to the next heading.  Text
    that appears before the first heading is dropped.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - markup for bold
      - * * for emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        # Nesting depth: +1 per start tag, -1 per end tag.  Checked by end().
        self.indent = 0

        self.in_heading = False
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        """Debug logging, indented by nesting depth.  Disabled by default."""
        if 0:
            log(self.indent * ' ' + msg, *args)

    def _AppendText(self, text):
        """Route text to the heading or the body of the current group."""
        if self.in_heading:
            self.cur_group[2].append(text)
        elif self.cur_group:
            self.cur_group[3].append(text)

    def handle_starttag(self, tag, attrs):
        # type: (AnyStr, List[Tuple[AnyStr, AnyStr]]) -> None
        if tag in self.heading_tags:
            # A new heading finishes the previous group, if there was one.
            if self.cur_group:
                self.out.append(self.cur_group)
            self.cur_group = (tag, attrs, [], [])
            self.in_heading = True

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (AnyStr) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (AnyStr) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        self._AppendText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (AnyStr) -> None
        self.log('data %r', data)
        self._AppendText(data)

    def end(self):
        # type: () -> None
        """Flush the last group, then verify that every tag was closed.

        Raises:
          RuntimeError: if start and end tags did not balance.
        """
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
297
298
def ExtractBody(s):
    # type: (str) -> str
    """Return the part of an HTML document between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    If no <body> start tag is found, nothing is written and the result is the
    empty string.
    """
    buf = cStringIO.StringIO()
    out = htm8.Output(s, buf)
    tag_lexer = htm8.TagLexer(s)

    tokens = html.ValidTokens(s)
    tok_start = 0  # each token begins where the previous one ended
    while True:
        tok = next(tokens, None)
        if tok is None:
            break
        tok_id, tok_end = tok

        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # tok_end is right after <body>
                out.SkipTo(tok_end)

                # This consumes tokens from the same iterator.
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
332
333
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Yield one (tag, attrs, heading, text) tuple per heading in a page.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML document; only its <body> is examined

    Yields:
      tag: the heading tag name
      attrs: the heading's attributes, as collected by Splitter
      heading: heading text with surrounding whitespace stripped
      text: the text after the heading, with leading and trailing newlines
        removed and exactly one trailing newline added
    """
    body_html = ExtractBody(contents)

    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(body_html)
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are trimmed, so leading spaces on the first line
        # survive.
        body = ''.join(body_parts).strip('\n')

        yield tag, attrs, heading, body + '\n'
353
354	#log('make_help.py: Parsed %d parts', len(groups))
355
356
def HelpTopics(s):
    # type: (str) -> Iterator[Tuple[str, str, str]]
    """Split a rendered toc-{osh,ysh}.html into its sections.

    For every <h2 id="..."> this looks for the next <code> element, which must
    have a class starting with 'language-chapter-links-'.

    Args:
      s: the HTML document

    Yields:
      (section_id, section_name, text) where
        section_id: raw value of the <h2>'s id= attribute
        section_name: the <h2> contents up to the first '<' (stripped), or all
          of the contents if there is no nested tag
        text: the contents of the <code> element, converted by html.ToText()

    Raises:
      AssertionError: if an <h2> has no id, or the <code> element does not
        have the expected class.
    """
    tag_lexer = htm8.TagLexer(s)

    pos = 0  # start of the token about to be read
    it = html.ValidTokens(s)
    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break

        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(pos, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = end_pos

                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

                anchor_html = s[h2_start_right:h2_end_left]
                tag_pos = anchor_html.find('<')  # remove HTML link
                if tag_pos == -1:
                    section_name = anchor_html
                else:
                    section_name = anchor_html[:tag_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    it, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, code_end_right = html.ReadUntilEndTag(
                    it, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

                # The helpers above pulled tokens from `it`, so the end of the
                # <h2> open tag is no longer where the next token starts.
                # Resume right after </code> instead.
                # NOTE(review): assumes the second value returned by
                # ReadUntilEndTag() is the position just past the end tag,
                # mirroring how ReadUntilStartTag()'s second value is used as
                # code_start_right above -- confirm against lazylex/html.py.
                end_pos = code_end_right

        pos = end_pos
407
408	pos = end_pos
409
410
class DocNode(object):
    """A node in a tree that mirrors the structure of the docs.

    Used to visualize doc structure.
    """

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        """
        Args:
          name: label for this node
          attrs: tag attributes, kept for h2 and h3 links
          text: optional body text attached to the node
        """
        self.children = []  # child DocNode instances, in document order
        self.name = name
        self.text = text
        self.attrs = attrs  # for h2 and h3 links
420
421
def CardsFromIndex(sh, out_prefix):
    """Write one help card per section of the index HTML read from stdin.

    Args:
      sh: prefix for the card names, e.g. 'osh' or 'ysh'
      out_prefix: directory that the '<sh>-<section_id>' files are written to
    """
    num_sections = 0
    index_html = sys.stdin.read()

    for section_id, section_name, text in HelpTopics(index_html):
        if 0:
            log('section_id = %r', section_id)
            log('section_name = %r', section_name)
            log('')

        card_name = '%s-%s' % (sh, section_id)  # e.g. ysh-overview
        card_path = os.path.join(out_prefix, card_name)

        with open(card_path, 'w') as card_f:
            # Only the name goes in the file; section_id is printed
            # dynamically.
            card_f.write('%s\n\n' % section_name)
            card_f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
444
445
def CardsFromChapters(
        out_dir,  # type: Optional[str]
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Extract help cards and a doc tree from chapter pages.

    Args:
      out_dir: directory to write embedded cards to, or None to write nothing
      tag_level: heading tag that defines a card, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      topic_to_chap: maps each topic ID at tag_level to its chapter name, or
        to None when the heading carries the attribute oils-embed="1"
      root_node: DocNode tree of pages -> <h2> sections -> <h3> topics

    Raises:
      RuntimeError: if a page has an <h3> before its first <h2>
      AssertionError: if a file name doesn't start with 'chap-'
    """
    topic_to_chap = {}  # type: Dict[str, Optional[str]]

    root_node = DocNode('/')

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-'), path
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        # Reset for every page, so that an <h3> can never be attached to an
        # <h2> that belongs to the previous file.
        cur_h2_node = None

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # An explicit id= wins, but only when it is unambiguous and
            # non-empty.  Otherwise the ID is derived from the heading text.
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (path, topic_id))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            if 0:
                log('tag = %r', tag)
                log('topic_id = %r', topic_id)
                log('heading = %r', heading)
                log('text = %r', text[:20])

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    # Children of a page node are its <h2> sections.
    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
520
521
class StrPool(object):
    """Assigns a C++ global variable name to each distinct string.

    Attributes:
      var_names: maps each string to its variable name, e.g. 'gStr1'
      global_strs: GLOBAL_STR(...) declarations, in the order of first Add()
      unique_id: number that the next variable name will use
    """

    def __init__(self):
        # type: () -> None
        self.unique_id = 1
        self.global_strs = []
        self.var_names = {}

    def Add(self, s):
        # type: (str) -> None
        """Register s, unless it was already registered."""
        if s not in self.var_names:
            import json

            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = name
544
545
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Generate C++ source that exposes topic_dict as help_meta::TopicMetadata().

    Args:
      topic_dict: topic ID -> chapter name; a None value becomes nullptr
      header_f: file to write the declaration to
      cc_f: file to write the string globals and the dict definition to
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # A single pass: strings are registered in the order key, value, key,
    # value, ... and looked up as soon as they've been registered.
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    num_items = len(topic_dict)
    str_decls = '\n'.join(pool.global_strs)
    keys_cpp = ' COMMA '.join(key_names)
    vals_cpp = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % (str_decls, num_items, keys_cpp, vals_cpp))
591
592
def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    Actions:
      cards-from-index SH OUT_PREFIX
        Read a rendered index from stdin, and write one card per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX [PAGE...]
        Write embedded cards to OUT_DIR, and the topic metadata as a Python
        module (PY_OUT) and as C++ (CC_PREFIX.h and CC_PREFIX.cc).
      ref-check PATH...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: for a missing or unknown action, missing arguments, or an
        unexpected input file name.  The script entry point reports these as
        'FATAL: ...' rather than with a traceback.
    """

    def _Require(num_args, usage):
        # type: (int, str) -> None
        """Fail with a usage message, not an IndexError, if args are missing."""
        if len(argv) < num_args:
            raise RuntimeError('Usage: %s' % usage)

    _Require(2, 'help_gen.py ACTION [ARG...]')
    action = argv[1]

    if action == 'cards-from-index':
        _Require(4, 'help_gen.py cards-from-index SH OUT_PREFIX')
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':
        _Require(
            5,
            'help_gen.py cards-from-chapters OUT_DIR PY_OUT CC_PREFIX [PAGE...]'
        )
        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # The body of the generated function has to be indented, or the
            # generated module isn't valid Python.
            f.write('''

from typing import Dict

def TopicMetadata():
    # type: () -> Dict[str, str]
    return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f, open(cc_path, 'w') as cc_f:
            WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML.  This is deliberately not named
                # 'html', which would shadow the module imported at the top
                # of the file.
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*
                box_nodes = []
                toc_html = oils_doc.HighlightCode(toc_html,
                                                  None,
                                                  debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        # Only the tree is needed here, and nothing is written (out_dir=None).
        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
679
680
if __name__ == '__main__':
    # RuntimeError is how main() and its helpers report fatal problems with
    # the arguments or input, so it gets a short message on stderr and exit
    # status 1.
    try:
        main(sys.argv)
    except RuntimeError as e:
        sys.stderr.write('FATAL: %s\n' % e)
        sys.exit(1)