doctools/help

OILS / doctools / help_gen.py View on Github | oils.pub

671 lines, 355 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	from typing import List
4	from typing import Any
5	from typing import Dict
6	from typing import Iterator
7	"""help_gen.py
8
9	Ideas for HTML -> ANSI converter:
10
11	- `ls` -> <code>ls</code> -> is reverse video?
12	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
13	- could also be bright blue
14	- <pre> is also indented 4 spaces, like the markdown
15	- red X <span class="X">X</span>
16
17	- comments in code examples could be green?
18
19	What about:
20
21	- headings h1, h2, h3, h4
22	- Right now cards use reverse video. Centering didn't look great.
23
24	- <ul> - you could use a Unicode bullet here
25	- <ol>
26
27	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
28	thing.
29	"""
30
31	import cStringIO
32	import HTMLParser
33	import os
34	import pprint
35	import re
36	import sys
37
38	from _devbuild.gen.htm8_asdl import h8_id
39	from doctools import html_lib
40	from doctools.util import log
41	from lazylex import html
42
43	#from typing import List, Tuple
44
# Matches a bracketed section label such as "[Front End]", optionally preceded
# by whitespace.  Group 1 is the text between the brackets, which may contain
# letters, digits, spaces, and the characters '/', ':' and '-'.  Section names
# are turned into anchors.
SECTION_RE = re.compile(
    r'''
\s*
\[
([a-zA-Z0-9 /:-]+) # allow ysh:upgrade, byo-server-lib, etc.
\]
''', re.VERBOSE)
54
# Complex heuristic to highlight topics.
#
# Groups:
#   1 - optional 'X ' deprecation marker
#   2 - the topic name (at least two characters)
#   3 - optional trailer: either ' ' + punctuation (e.g. ' >>'), or '()'
#   4 - the separator after the topic: end of line, or a run of spaces
#
# NOTE: the alternations below must be a bare '|'.  An escaped '\|' is a
# literal pipe character under re.VERBOSE, which would stop the pattern from
# matching ordinary topic text.
TOPIC_RE = re.compile(
    r'''
  (X[ ])?           # optional deprecation symbol X, then a single space
  @?                # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                    # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                    # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                    # trailer like >> or (make)
  |
    \(\)            # optional () for func()
  )?

  (                 # order of these 2 clauses matters
    [ ]*\n          # spaces/newline
  |
    [ ]+            # 1 or more spaces
  )
  ''', re.VERBOSE)
80
81
def _StringToHref(s):
    """Turn a heading string into an anchor: lower-cased, spaces become '-'."""
    # lower case to match what doctools/cmark.py does
    lowered = s.lower()
    return '-'.join(lowered.split(' '))
85
86
# Opening tag printed before an 'X ' marker so that it renders in dark red.
# Code that prints this is responsible for printing the closing '</span>'.
X_LEFT_SPAN = '<span style="color: darkred">'
88
89
class TopicHtmlRenderer(object):
    """Turns lines of a topic index into HTML with linked sections and topics.

    Each rendered line also appends a dict describing what was found on it
    (section name, whether it is implemented, and its topics) to debug_out.
    """

    # Whitespace after a [Section], or the leading whitespace of a line.
    # Compiled once here rather than on every Render() call.
    _SPACE_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point into 'chap-<chapter>.html'
          debug_out: list that receives one dict per rendered line
          linkify_stop_col: 1-based column after which topics are no longer
            linked, or -1 for no limit
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, html.Output, Dict[str, Any]) -> None
        """Emit one TOPIC_RE match: a red X if present, then the linked name.

        Also records (topic, is_implemented) in line_info['topics'].
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.  A dict with keys 'section',
        'impl' and 'topics' is appended to self.debug_out for every line that
        is processed.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged.
        """
        f = cStringIO.StringIO()
        out = html.Output(line, f)

        section_impl = True

        # The first two columns are either 'X ' (not implemented) or blank.
        if line.startswith('X '):
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')
            pos = 2  # position within line
            section_impl = False
        elif line.startswith('  '):
            # Blank prefix of the same width as 'X ', so skipping 2 columns
            # never skips a non-space character.
            pos = 2
        else:
            return line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name,
                                       preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACE_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        # Anything not consumed above is copied through unchanged.
        out.PrintTheRest()
        return f.getvalue()
204
205
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream starting at each of the heading tags.

    For *-help.html.

    Every group appended to `out` is a tuple
    (tag, attrs, heading text parts, body text parts).

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - bold and emphasis markup
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        self.indent = 0  # open-tag depth; must be back to 0 in end()
        self.in_heading = False
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]

    def log(self, msg, *args):
        # Debug tracing, disabled by default.
        if 0:
            log(self.indent * ' ' + msg, *args)

    def _FlushGroup(self):
        """Append the group being built, if any, to the output list."""
        if self.cur_group:
            self.out.append(self.cur_group)

    def _AddText(self, text):
        """Route text to the heading parts or body parts of the current group.

        Text seen before the first heading is dropped.
        """
        group = self.cur_group
        if self.in_heading:
            group[2].append(text)
        elif group:
            group[3].append(text)

    def handle_starttag(self, tag, attrs):
        if tag in self.heading_tags:
            # A heading closes the previous group and opens a new one.
            self.in_heading = True
            self._FlushGroup()
            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        self._AddText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        self.log('data %r', data)
        self._AddText(data)

    def end(self):
        # type: () -> None
        """Flush the last group; raise RuntimeError if tags were unbalanced."""
        self._FlushGroup()

        # Maybe detect nesting?
        if self.indent != 0:
            raise RuntimeError(
                'Unbalanced HTML tags: indent=%d, cur_group=%s' %
                (self.indent, self.cur_group))
292
293
def ExtractBody(s):
    # type: (str) -> str
    """Return the text between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.
    If no <body> start tag is found, nothing is printed and the result is
    empty.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tok_start = 0  # where the current token begins
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # tok_end is right after <body>
                out.SkipTo(tok_end)

                # Consumes tokens from the same iterator up to </body>
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
327
328
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Yield (tag, attrs, heading, text) for each heading in an HTML page.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3']
      contents: the whole HTML document; only its <body> is examined
    """
    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(ExtractBody(contents))
    splitter.end()

    for tag, attrs, heading_parts, body_parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are trimmed, so leading spaces on the first line
        # survive.  The text always ends with exactly one newline.
        text = ''.join(body_parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
348
349	#log('make_help.py: Parsed %d parts', len(groups))
350
351
def HelpTopics(s):
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    Each group comes from an <h2 id="..."> followed by a
    <code class="language-chapter-links-*"> span.
    """
    tag_lexer = html.TagLexer(s)

    tok_start = 0
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = tok_end

                section_id = tag_lexer.GetAttrRaw('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

                # The section name is the heading text before any nested tag
                # (this removes the HTML link).
                anchor_html = s[h2_start_right:h2_end_left]
                lt_pos = anchor_html.find('<')
                if lt_pos == -1:
                    section_name = anchor_html
                else:
                    section_name = anchor_html[:lt_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    tokens, tag_lexer, 'code')
                css_class = tag_lexer.GetAttrRaw('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

        # NOTE(review): after an <h2> group this is the end of the <h2> start
        # tag, not of the tokens consumed above -- presumably harmless as
        # long as the next token is not a start tag; confirm.
        tok_start = tok_end
404
405
class DocNode(object):
    """A node in a tree used to visualize doc structure.

    Attributes:
      name: label of the node
      attrs: optional tag attributes, kept for h2 and h3 links
      text: optional body text
      children: child nodes; starts out empty
    """

    def __init__(self, name, attrs=None, text=None):
        self.children = []
        self.text = text
        self.attrs = attrs  # for h2 and h3 links
        self.name = name
414
415
def CardsFromIndex(sh, out_prefix):
    """Write one help card per section of the index HTML read from stdin.

    Args:
      sh: prefix for card names, e.g. 'osh' or 'ysh'
      out_prefix: directory where the '<sh>-<section_id>' files are written
    """
    num_sections = 0
    for section_id, section_name, text in HelpTopics(sys.stdin.read()):
        card_name = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

        with open(os.path.join(out_prefix, card_name), 'w') as card_f:
            # section_id is printed dynamically
            card_f.write('%s\n\n' % section_name)
            card_f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
438
439
def CardsFromChapters(out_dir, tag_level, paths):
    """Split chapters into cards, and build a tree of their headings.

    Args:
      out_dir: directory for embedded cards, or None to write nothing
      tag_level: heading tag that defines a card, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node).  topic_to_chap maps a topic ID to its
      chapter name, or to None when the card is embedded.  root_node is a
      DocNode tree: root -> pages -> h2 sections -> h3 topics.
    """
    topic_to_chap = {}

    root_node = DocNode('/')

    # NOTE(review): not reset per page, so an <h3> that precedes the first
    # <h2> of a page attaches to the previous page's section (or fails on the
    # first page) -- confirm chapters always open with an <h2>.
    cur_h2_node = None

    for chap_path in paths:
        with open(chap_path) as chap_f:
            contents = chap_f.read()

        filename = os.path.basename(chap_path)

        stem, _ = os.path.splitext(filename)
        assert stem.startswith('chap-')
        chapter_name = stem[len('chap-'):]

        page_node = DocNode(filename)

        for tag, attrs, heading, text in SplitIntoCards(['h2', 'h3', 'h4'],
                                                        contents):
            # Use the id= attribute only when there is exactly one; otherwise
            # derive the ID from the heading text.
            id_values = [v for k, v in attrs if k == 'id']
            id_value = id_values[0] if len(id_values) == 1 else None

            topic_id = id_value or html_lib.PrettyHref(
                heading, preserve_anchor_case=True)

            if tag == 'h2':
                cur_h2_node = DocNode(topic_id, attrs=attrs)
                page_node.children.append(cur_h2_node)
            elif tag == 'h3':
                # attach text so we can see which topics have empty bodies
                cur_h2_node.children.append(
                    DocNode(topic_id, attrs=attrs, text=text))

            if tag != tag_level:
                continue  # we only care about h3 now

            embed = ('oils-embed', '1') in attrs

            if embed and out_dir is not None:
                # indices start with _
                with open(os.path.join(out_dir, topic_id), 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    num_sections = sum(len(page.children) for page in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
509
510
class StrPool(object):
    """Assigns a unique C++ global variable name to each distinct string.

    Attributes:
      var_names: maps each string to its variable name, e.g. 'gStr1'
      global_strs: GLOBAL_STR(...) declarations, in the order strings were
        first added
      unique_id: number used for the next variable name
    """

    def __init__(self):
        self.unique_id = 1
        self.global_strs = []
        self.var_names = {}

    def Add(self, s):
        """Register s; a string that was already added is ignored."""
        import json

        if s not in self.var_names:
            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = name
531
532
def WriteTopicDict(topic_dict, header_f, cc_f):
    """Write topic_dict as a C++ global dict: a header and a .cc file.

    Args:
      topic_dict: maps topic IDs to a string, or to None (emitted as nullptr)
      header_f: file object that receives the declaration
      cc_f: file object that receives the string and dict definitions
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # Strings are pooled in iteration order (key, then value), so a value
    # shared by several topics is declared only once.
    for key, val in topic_dict.iteritems():
        pool.Add(key)
        key_names.append(pool.var_names[key])

        if val is None:
            val_names.append('nullptr')
        else:
            pool.Add(val)
            val_names.append(pool.var_names[val])

    str_decls = '\n'.join(pool.global_strs)
    keys_cpp = ' COMMA '.join(key_names)
    vals_cpp = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % (str_decls, len(topic_dict), keys_cpp, vals_cpp))
577
578
def main(argv):
    """Command line entry point.

    Actions, selected by argv[1]:
      cards-from-index SH OUT_PREFIX
        Read index HTML from stdin and write one card per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGES...
        Write embedded cards, plus the topic dict as Python and C++.
      ref-check PATHS...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: if the action is missing or invalid, or if a ref-check
        path is neither toc-*.md nor chap-*.html.
    """
    if len(argv) < 2:
        # Reported as FATAL by the caller, like an invalid action.
        raise RuntimeError(
            'Expected an action: cards-from-index, cards-from-chapters, '
            'or ref-check')

    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':

        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # The function body must be indented, or the generated module
            # is not valid Python.
            f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML
                with open(path) as in_file:
                    # Not named 'html', which would shadow the imported
                    # module of that name inside this function.
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-* blocks.  Only that debug output
                # is used; the returned HTML is discarded.
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
664
665
# Script entry point.
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        # A RuntimeError is reported as a one-line message on stderr with
        # exit status 1.  Other exceptions are not caught here.
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)