doctools/help

OILS / doctools / help_gen.py View on Github | oils.pub

670 lines, 354 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	from typing import List
4	from typing import Any
5	from typing import Dict
6	from typing import Iterator
7	"""help_gen.py
8
9	Ideas for HTML -> ANSI converter:
10
11	- `ls` -> <code>ls</code> -> is reverse video?
12	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
13	- could also be bright blue
14	- <pre> is also indented 4 spaces, like the markdown
15	- red X <span class="X">X</span>
16
17	- comments in code examples could be green?
18
19	What about:
20
21	- headings h1, h2, h3, h4
22	- Right now cards use reverse video. Centering didn't look great.
23
24	- <ul> - you could use a Unicode bullet here
25	- <ol>
26
27	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
28	thing.
29	"""
30
31	import cStringIO
32	import HTMLParser
33	import os
34	import pprint
35	import re
36	import sys
37
38	from doctools import html_lib
39	from doctools.util import log
40	from lazylex import html
41
42	#from typing import List, Tuple
43
# Sections have alphabetical characters, spaces, and '/' for I/O.  They are
# turned into anchors.
#
# Group 1 is the section name between the square brackets.
SECTION_RE = re.compile(
    r'''
  \s*
  \[
  ([a-zA-Z0-9 /:-]+)  # allow ysh:upgrade, byo-server-lib, etc.
  \]
''', re.VERBOSE)

# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' marker
#   2: the topic name (the part that gets linked)
#   3: optional trailer, or '()' for a function
#   4: the whitespace that terminates the topic
#
# The alternations use a bare '|'.  An escaped bar would match a literal pipe
# character, which makes 'name()' topics and space-separated topics fail.
TOPIC_RE = re.compile(
    r'''
  (X[ ])?             # optional deprecation symbol X, then a single space
  @?                  # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                      # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                      # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                      # trailer like >> or (make)
  |
    \(\)              # optional () for func()
  )?

  (                   # order of these 2 clauses matters
    [ ]*\n            # spaces/newline
  |
    [ ]+              # 1 or more spaces
  )
''', re.VERBOSE)
79
80
def _StringToHref(s):
    """Return s lower-cased with every space replaced by a hyphen.

    Lower-casing matches what doctools/cmark.py does.
    """
    lowered = s.lower()
    return '-'.join(lowered.split(' '))
84
85
# Opening tag wrapped around the 'X ' marker; the matching '</span>' is
# printed by the renderer.
X_LEFT_SPAN = '<span style="color: darkred">'


class TopicHtmlRenderer(object):
    """Render lines of a topic index as HTML with section and topic links."""

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point to 'chap-<chapter>.html'
          debug_out: list that receives one dict per rendered line
          linkify_stop_col: topics ending past this 1-based column are not
            linked; -1 disables the limit
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, html.Output, Dict[str, Any]) -> None
        """Emit one TOPIC_RE match and record it in line_info['topics'].

        Args:
          m: a TOPIC_RE match; group 1 is the optional 'X ', group 2 the name
          out: output cursor over the line being rendered
          line_info: per-line debug dict; (name, impl) is appended to 'topics'
        """
        has_x = bool(m.group(1))
        if has_x:
            # Wrap the X marker in the colored span
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')

        name = m.group(2)
        # The second element is False exactly when the topic carried an X
        line_info['topics'].append((name, not has_x))

        name_start, name_end = m.span(2)
        out.PrintUntil(name_start)
        out.Print('<a href="%s#%s">' % (self.html_page, name))
        out.PrintUntil(name_end)
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are linked and a leading X is wrapped in a colored span.  A
        dict describing the line is appended to self.debug_out.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted, or the line unchanged when it
          starts with neither 'X ' nor a space.
        """
        buf = cStringIO.StringIO()
        out = html.Output(line, buf)

        has_x = line.startswith('X ')
        if has_x:
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')
        elif not line.startswith(' '):
            return line

        # Both accepted prefixes are skipped as two characters.
        # NOTE(review): the non-X test above only checks one leading space as
        # shown here -- confirm the intended prefix is two spaces.
        pos = 2

        # Highlight [Section] at the start of a line.
        section_name = None
        section_m = SECTION_RE.match(line, pos)
        if section_m:
            section_name = section_m.group(1)
            href = html_lib.PrettyHref(section_name,
                                       preserve_anchor_case=True)

            out.PrintUntil(section_m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(section_m.end(1))  # anchor
            out.Print('</a>')

            pos = section_m.end(0)  # ADVANCE

        line_info = {
            'section': section_name,
            'impl': not has_x,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        spaces_re = re.compile(r'[ ]+')
        space_m = spaces_re.match(line, pos)
        assert space_m, 'Expected whitespace %r' % line
        pos = space_m.end()

        # Keep matching topics until one fails to match or crosses the
        # column limit.
        stop_col = self.linkify_stop_col
        topic_m = TOPIC_RE.match(line, pos)
        while topic_m:
            pos = topic_m.end()

            # The 1-based column number of the end of this topic
            end_col = topic_m.end(2) + 1
            if stop_col != -1 and end_col > stop_col:
                break

            self._PrintTopic(topic_m, out, line_info)
            topic_m = TOPIC_RE.match(line, pos)

        out.PrintTheRest()
        return buf.getvalue()
203
204
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream into groups, one starting at each heading tag.

    For *-help.html.

    Each group appended to `out` is a tuple
    (tag, attrs, heading_parts, body_parts): the heading tag and its
    attributes, then the text pieces seen inside the heading and the text
    pieces seen after it, up to the next heading.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - bold and emphasis markers
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        """
        Args:
          heading_tags: tag names that start a new group, e.g. ['h2', 'h3']
          out: list that finished groups are appended to
        """
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        # Nesting depth; incremented per start tag, decremented per end tag
        self.indent = 0

        self.in_heading = False
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]

    def log(self, msg, *args):
        """Debug logging, indented by nesting depth.  Currently disabled."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _AppendText(self, text):
        """Add text to the heading or the body of the current group.

        Text seen before the first heading is dropped.
        """
        if self.in_heading:
            self.cur_group[2].append(text)
        elif self.cur_group is not None:
            self.cur_group[3].append(text)

    def handle_starttag(self, tag, attrs):
        if tag in self.heading_tags:
            self.in_heading = True
            # A new heading closes the previous group
            if self.cur_group is not None:
                self.out.append(self.cur_group)

            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        self._AppendText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        self.log('data %r', data)
        self._AppendText(data)

    def end(self):
        # type: () -> None
        """Flush the last group and check that start/end tags balanced.

        Raises:
          RuntimeError: if the counts of start and end tags differ
        """
        if self.cur_group is not None:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
291
292
def ExtractBody(s):
    # type: (str) -> str
    """Return the HTML between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    Only the first <body> start tag is considered.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tok_start = 0  # where the current token begins
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == html.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # Skip to right after <body>, then copy until just before
                # </body>.  The helper advances the same token iterator.
                out.SkipTo(tok_end)
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')

                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
326
327
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Yield one (tag, attrs, heading, text) tuple per heading in an HTML page.

    Args:
      heading_tags: tag names that start a card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML document; only its <body> is split

    Yields:
      tag and attrs of the heading, the heading text stripped of surrounding
      whitespace, and the following text with leading/trailing newlines
      replaced by exactly one trailing newline.
    """
    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(ExtractBody(contents))
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are stripped, so leading spaces on the first line
        # are kept.
        body = ''.join(body_parts).strip('\n')

        yield tag, attrs, heading, body + '\n'
349
350
def HelpTopics(s):
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    For each <h2 id=...>, section_name is the heading's HTML up to the first
    '<', and the text comes from the next <code> element, whose class must
    start with 'language-chapter-links-'.
    """
    tag_lexer = html.TagLexer(s)

    tok_start = 0  # where the current token begins
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == html.StartTag:
            tag_lexer.Reset(tok_start, tok_end)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

                # Heading contents; cut at the first tag to remove HTML link
                anchor_html = s[tok_end:h2_end_left]
                first_tag = anchor_html.find('<')
                if first_tag != -1:
                    section_name = anchor_html[:first_tag].strip()
                else:
                    section_name = anchor_html

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    tokens, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'code')

                yield (section_id, section_name,
                       html.ToText(s, code_start_right, code_end_left))

        tok_start = tok_end
403
404
class DocNode(object):
    """To visualize doc structure.

    A plain tree node: a name, optional attrs and text, and a list of
    children that starts out empty.
    """

    def __init__(self, name, attrs=None, text=None):
        self.children = []
        self.name, self.text = name, text
        # for h2 and h3 links
        self.attrs = attrs
413
414
def CardsFromIndex(sh, out_prefix):
    """Write one help card per <h2> section of the index HTML read from stdin.

    Args:
      sh: prefix for the card file names, e.g. 'osh' or 'ysh'
      out_prefix: directory the '<sh>-<section_id>' files are written to
    """
    num_sections = 0
    for section_id, section_name, text in HelpTopics(sys.stdin.read()):
        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

        with open(os.path.join(out_prefix, topic), 'w') as f:
            # section_id is printed dynamically
            f.write('%s\n\n' % section_name)
            f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
437
438
def CardsFromChapters(out_dir, tag_level, paths):
    """Split chapter pages into cards and build a tree of their headings.

    Args:
      out_dir: directory for cards marked with the attribute oils-embed="1",
        or None to write no files
      tag_level: heading tag whose cards become topics, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node).  topic_to_chap maps a topic id to its
      chapter name, or to None for embedded topics.  root_node is a DocNode
      tree: page -> h2 -> h3.
    """
    topic_to_chap = {}
    root_node = DocNode('/')

    # NOTE(review): this is shared across pages and starts as None, so an h3
    # seen before any h2 fails, and an h3 seen before the first h2 of a later
    # page attaches to the previous page's h2.
    cur_h2_node = None

    for chap_path in paths:
        with open(chap_path) as f:
            contents = f.read()

        filename = os.path.basename(chap_path)

        stem, _ = os.path.splitext(filename)
        assert stem.startswith('chap-')
        chapter_name = stem[len('chap-'):]

        page_node = DocNode(filename)

        for tag, attrs, heading, text in SplitIntoCards(['h2', 'h3', 'h4'],
                                                        contents):
            # Use the id attribute only when there is exactly one
            ids = [v for k, v in attrs if k == 'id']
            id_value = ids[0] if len(ids) == 1 else None

            if id_value:
                topic_id = id_value
            else:
                topic_id = html_lib.PrettyHref(heading,
                                               preserve_anchor_case=True)

            if tag == 'h2':
                cur_h2_node = DocNode(topic_id, attrs=attrs)
                page_node.children.append(cur_h2_node)
            elif tag == 'h3':
                # attach text so we can see which topics have empty bodies
                cur_h2_node.children.append(
                    DocNode(topic_id, attrs=attrs, text=text))

            if tag != tag_level:
                continue  # only cards at tag_level become topics

            embed = ('oils-embed', '1') in attrs

            if embed and out_dir is not None:
                # indices start with _
                with open(os.path.join(out_dir, topic_id), 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    num_sections = sum(len(page.children) for page in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
508
509
class StrPool(object):
    """Assign a unique C++ global variable name to each distinct string."""

    def __init__(self):
        self.unique_id = 1  # number used for the next variable name
        self.global_strs = []  # GLOBAL_STR(...) definitions, in Add() order
        self.var_names = {}  # string -> variable name

    def Add(self, s):
        """Register s unless it is already in the pool."""
        if s not in self.var_names:
            import json

            var_name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            literal = json.dumps(s)
            self.global_strs.append('GLOBAL_STR(%s, %s)' %
                                    (var_name, literal))

            self.var_names[s] = var_name
530
531
def WriteTopicDict(topic_dict, header_f, cc_f):
    """Write C++ source that defines the topic -> chapter dict.

    Args:
      topic_dict: maps topic id to chapter name or None; None is emitted as
        nullptr
      header_f: file object that receives the declaration of TopicMetadata()
      cc_f: file object that receives the string globals, the dict, and the
        definition of TopicMetadata()
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # One pass: register each key, then its value, and record the variable
    # names in the same order.
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    keys_str = ' COMMA '.join(key_names)
    vals_str = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % ('\n'.join(pool.global_strs), len(topic_dict), keys_str, vals_str))
576
577
def main(argv):
    """Command line entry point.

    Actions, selected by argv[1]:

      cards-from-index SH OUT_PREFIX
        Read index HTML from stdin and write one card per section.

      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGE...
        Write embedded cards to OUT_DIR, and the topic dict as a Python
        module (PY_OUT) and as C++ (CC_PREFIX.h and CC_PREFIX.cc).

      ref-check PATH...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: on an unknown action, or a ref-check path that is
        neither *.md nor *.html
    """
    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':

        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # The function body must be indented, or the generated module
            # does not import.
            f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML.  The local is not called 'html', so
                # it doesn't shadow the imported html module.
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*.  Only that debug output is used;
                # the returned HTML is discarded.
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)