doctools/help

OILS / doctools / help_gen.py View on Github | oils.pub

683 lines, 356 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4	"""help_gen.py
5
6	Ideas for HTML -> ANSI converter:
7
8	- `ls` -> <code>ls</code> -> is reverse video?
9	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10	- could also be bright blue
11	- <pre> is also indented 4 spaces, like the markdown
12	- red X <span class="X">X</span>
13
14	- comments in code examples could be green?
15
16	What about:
17
18	- headings h1, h2, h3, h4
19	- Right now cards use reverse video. Centering didn't look great.
20
21	- <ul> - you could use a Unicode bullet here
22	- <ol>
23
24	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25	thing.
26	"""
27
28	import cStringIO
29	import HTMLParser
30	import os
31	import pprint
32	import re
33	import sys
34
35	from _devbuild.gen.htm8_asdl import h8_id
36	from doctools import html_lib
37	from doctools.util import log
38	from lazylex import html
39
40	#from typing import List, Tuple
41
# Sections have alphabetical characters, spaces, and '/' for I/O. They are
# turned into anchors.
#
# The pattern skips optional leading whitespace and then requires a bracketed
# name such as [Section Name].  Group 1 is the text between the brackets;
# digits, ':' and '-' are accepted in it as well.
SECTION_RE = re.compile(
    r'''
\s*
\[
([a-zA-Z0-9 /:-]+) # allow ysh:upgrade, byo-server-lib, etc.
\]
''', re.VERBOSE)
51
# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' deprecation marker
#   2: the topic name, which is what gets linked
#   3: optional trailer (' >>', ' (make)') or '()' for functions
#   4: the whitespace that terminates the topic
#
# The alternations must be written with a bare '|'.  An escaped '\|' matches a
# literal pipe character, which would stop ordinary topics from matching.
TOPIC_RE = re.compile(
    r'''
  (X[ ])?             # optional deprecation symbol X, then a single space
  @?                  # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                      # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                      # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                      # trailer like >> or (make)
  |
    \(\)              # optional () for func()
  )?

  (                   # order of these 2 clauses matters
    [ ]*\n            # spaces/newline
  |
    [ ]+              # 1 or more spaces
  )
''', re.VERBOSE)
77
78
79	def _StringToHref(s):
80	# lower case to match what doctools/cmark.py does
81	return s.lower().replace(' ', '-')
82
83
# Opening tag printed before a deprecation marker 'X' to color it dark red.
# The code that prints it is responsible for emitting the matching '</span>'.
X_LEFT_SPAN = '<span style="color: darkred">'
85
86
class TopicHtmlRenderer(object):
    """Insert links and highlighting into lines of a help index.

    Render() is called once per line.  Every line it processes (i.e. does not
    return unchanged) also gets a dict appended to debug_out.
    """

    # Whitespace after a [Section], or the indent before the first topic.
    # Compiled once here rather than on every Render() call.
    _SPACE_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point at 'chap-<chapter>.html'
          debug_out: list that Render() appends one dict per processed line to
          linkify_stop_col: 1-based column; a topic ending past it is not
            linked and ends the scan of that line.  -1 disables the limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, html.Output, Dict[str, Any]) -> None
        """Emit markup for one TOPIC_RE match and record it in line_info.

        Args:
          m: a TOPIC_RE match object
          out: output cursor over the line being rendered
          line_info: dict whose 'topics' list receives (topic, implemented)
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged, and nothing is appended
          to debug_out for it.
        """
        # Both recognized prefixes are exactly two columns wide, which is why
        # scanning always resumes at position 2.  Checking the full prefix
        # keeps a line with a single leading space from being sliced at the
        # wrong offset.
        deprecated = line.startswith('X ')
        if not deprecated and not line.startswith('  '):
            return line

        f = cStringIO.StringIO()
        out = html.Output(line, f)

        section_impl = True
        if deprecated:
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')
            section_impl = False

        pos = 2  # position within line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACE_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
201
202
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream into groups, one per heading tag.

    For *-help.html.

    Each group is a tuple (tag, attrs, heading_parts, body_parts).  A group is
    appended to `out` when the next heading starts, and the last one when
    end() is called.  Text seen before the first heading is dropped.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - inline <code></code>, bold and emphasis markup
      - <p> needs word wrapping, or at least space between paragraphs
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        self.in_heading = False
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]

        # Nesting depth: +1 per start tag, -1 per end tag
        self.indent = 0

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        """Debug logging, indented by nesting depth.  Disabled by default."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _Flush(self):
        # type: () -> None
        """Hand the group in progress, if there is one, to the output list."""
        if self.cur_group:
            self.out.append(self.cur_group)

    def _Collect(self, s):
        # type: (str) -> None
        """File text under the heading or the body of the current group."""
        if self.in_heading:
            self.cur_group[2].append(s)
        elif self.cur_group:
            self.cur_group[3].append(s)

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        if tag in self.heading_tags:
            self.in_heading = True
            # A new heading closes the previous group and opens a fresh one
            self._Flush()
            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (str) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        self._Collect(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (str) -> None
        self.log('data %r', data)
        self._Collect(data)

    def end(self):
        # type: () -> None
        """Flush the last group; raise RuntimeError if tags were unbalanced."""
        self._Flush()

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
294
295
def ExtractBody(s):
    # type: (str) -> str
    """Return the text between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.
    If no <body> start tag is found, the result is the empty string.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tok_start = 0  # where the token about to be read begins
    tokens = html.ValidTokens(s)
    while True:
        tok = next(tokens, None)
        if tok is None:  # token stream exhausted
            break
        tok_id, end_pos = tok

        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, end_pos)
            if tag_lexer.TagName() == 'body':
                # end_pos is right after <body>: drop everything before it
                out.SkipTo(end_pos)
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')

                out.PrintUntil(body_end_left)
                break

        tok_start = end_pos

    return buf.getvalue()
329
330
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator
    """Yield (tag, attrs, heading, text) for each heading in an HTML page.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML document; only its <body> is examined

    The heading has surrounding whitespace stripped.  The text has leading and
    trailing newlines stripped and always ends with exactly one newline.
    """
    body = ExtractBody(contents)

    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(body)
    splitter.end()

    for group in groups:
        tag, attrs, heading_parts, body_parts = group

        heading = ''.join(heading_parts).strip()
        # Only newlines are stripped, so leading spaces on the first line stay
        text = ''.join(body_parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
350
351	#log('make_help.py: Parsed %d parts', len(groups))
352
353
def HelpTopics(s):
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    For every <h2>, the id attribute is the section_id, the heading text up to
    the first '<' is the section_name, and the text comes from the next <code>
    element, whose class must start with 'language-chapter-links-'.

    NOTE(review): after a section is yielded, the start offset for the next
    token is still the end of the <h2> start tag.  That looks harmless as long
    as the token following </code> is not itself a start tag -- confirm.
    """
    tag_lexer = html.TagLexer(s)

    pos = 0
    it = html.ValidTokens(s)
    while True:
        tok = next(it, None)
        if tok is None:
            break
        tok_id, end_pos = tok

        # Remember where this token began, then advance for the next round
        tok_start, pos = pos, end_pos

        if tok_id != h8_id.StartTag:
            continue

        tag_lexer.Reset(tok_start, end_pos)

        # Capture <h2 id="foo"> first
        if tag_lexer.TagName() != 'h2':
            continue

        section_id = tag_lexer.GetAttrRaw('id')
        assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

        h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

        # Keep only the text before the first nested tag (the HTML link)
        anchor_html = s[end_pos:h2_end_left]
        lt_pos = anchor_html.find('<')
        if lt_pos != -1:
            section_name = anchor_html[:lt_pos].strip()
        else:
            section_name = anchor_html

        # Now find the <code></code> span
        _, code_start_right = html.ReadUntilStartTag(it, tag_lexer, 'code')
        css_class = tag_lexer.GetAttrRaw('class')
        assert css_class is not None
        assert css_class.startswith(
            'language-chapter-links-'), tag_lexer.TagString()

        code_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'code')

        text = html.ToText(s, code_start_right, code_end_left)
        yield section_id, section_name, text
406
407
class DocNode(object):
    """A node in a tree that mirrors the doc structure, for visualization."""

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        """
        Args:
          name: label for this node
          attrs: HTML attributes of the heading, for h2 and h3 links
          text: body text under the heading, if it was captured
        """
        self.children = []  # child DocNode instances, in document order
        self.text = text
        self.attrs = attrs  # for h2 and h3 links
        self.name = name
417
418
def CardsFromIndex(sh, out_prefix):
    """Write one help card per section of the index HTML read from stdin.

    Args:
      sh: shell name used as the file name prefix, e.g. osh or ysh
      out_prefix: directory the cards are written to

    Each card is named '<sh>-<section_id>' and holds the section name, a blank
    line, and the section's text.
    """
    num_sections = 0
    for section_id, section_name, text in HelpTopics(sys.stdin.read()):
        if 0:
            log('section_id = %r', section_id)
            log('section_name = %r', section_name)
            log('')

        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

        with open(os.path.join(out_prefix, topic), 'w') as card_f:
            # section_id is printed dynamically, so only the name goes here
            card_f.write('%s\n\n' % section_name)
            card_f.write(text)
        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
441
442
def CardsFromChapters(
        out_dir,  # type: str
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Collect topics from chapter pages, optionally writing embedded cards.

    Args:
      out_dir: where embedded cards are written; None writes nothing
      tag_level: heading tag that defines a topic, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node).  topic_to_chap maps a topic id to its
      chapter name, or to None when the heading has the attribute
      oils-embed="1".  root_node is a DocNode tree of pages -> h2 -> h3.

    NOTE(review): the current h2 node carries over from one page to the next
    and is None until the first h2 is seen, so an h3 that appears before any
    h2 raises AttributeError.
    """
    topic_to_chap = {}

    root_node = DocNode('/')
    cur_h2_node = None

    for chap_path in paths:
        with open(chap_path) as chap_f:
            contents = chap_f.read()

        filename = os.path.basename(chap_path)

        stem = os.path.splitext(filename)[0]
        assert stem.startswith('chap-')
        chapter_name = stem[len('chap-'):]

        page_node = DocNode(filename)

        for tag, attrs, heading, text in SplitIntoCards(['h2', 'h3', 'h4'],
                                                        contents):
            # An explicit id= wins, but only if there is exactly one
            ids = [v for k, v in attrs if k == 'id']
            id_value = ids[0] if len(ids) == 1 else None

            if id_value:
                topic_id = id_value
            else:
                topic_id = html_lib.PrettyHref(heading,
                                               preserve_anchor_case=True)

            if tag == 'h2':
                cur_h2_node = DocNode(topic_id, attrs=attrs)
                page_node.children.append(cur_h2_node)
            elif tag == 'h3':
                # attach text so we can see which topics have empty bodies
                cur_h2_node.children.append(
                    DocNode(topic_id, attrs=attrs, text=text))

            if tag != tag_level:
                continue  # we only care about h3 now

            if 0:
                log('tag = %r', tag)
                log('topic_id = %r', topic_id)
                log('heading = %r', heading)
                log('text = %r', text[:20])

            embed = ('oils-embed', '1') in attrs

            if embed and out_dir is not None:
                # indices start with _
                with open(os.path.join(out_dir, topic_id), 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            if embed:
                topic_to_chap[topic_id] = None
            else:
                topic_to_chap[topic_id] = chapter_name

        root_node.children.append(page_node)

    # Each page node's children are its h2 sections
    num_sections = sum(len(page.children) for page in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
517
518
class StrPool(object):
    """Assign each distinct string a C++ global name: gStr1, gStr2, ..."""

    def __init__(self):
        # type: () -> None
        self.unique_id = 1  # number used for the next new string
        self.global_strs = []  # GLOBAL_STR(...) declarations, in order added
        self.var_names = {}  # string -> name of its C++ global

    def Add(self, s):
        # type: (str) -> None
        """Register s; adding a string that is already pooled does nothing."""
        if s not in self.var_names:
            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            import json
            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = name
541
542
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Write topic_dict as a C++ header and source file.

    Args:
      topic_dict: topic -> chapter name, or None
      header_f: receives the declaration of help_meta::TopicMetadata()
      cc_f: receives the string globals, the gTopics dict and the definition

    A None value is emitted as nullptr.
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # One pass: intern the key and value, then record the names they got
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    global_decls = '\n'.join(pool.global_strs)
    keys_cc = ' COMMA '.join(key_names)
    vals_cc = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % (global_decls, len(topic_dict), keys_cc, vals_cc))
588
589
def main(argv):
    # type: (List[str]) -> None
    """Run the action named by argv[1].

    Actions:
      cards-from-index SH OUT_PREFIX
        Read index HTML from stdin and write one card per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGES...
        Write embedded cards, plus the topic dict as Python (PY_OUT) and as
        C++ (CC_PREFIX.h and CC_PREFIX.cc).
      ref-check PATHS...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: the action is missing or unknown, or ref-check was given
        a file that is neither *.md nor *.html.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'help_gen.py'
        raise RuntimeError(
            'Usage: %s ACTION [ARGS...]  (cards-from-index, '
            'cards-from-chapters, ref-check)' % prog)

    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':

        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # The function body must be indented, or the generated module
            # does not parse.
            f.write('''

from typing import Dict

def TopicMetadata():
    # type: () -> Dict[str, str]
    return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML.  The local is not called 'html' so
                # that it does not shadow the imported html module.
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-* blocks.  Only that debug output is
                # used; the highlighted HTML is discarded.
                box_nodes = []
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
676
677
# Script entry point.  RuntimeError is caught here and reported as a one-line
# FATAL message on stderr with exit status 1; any other exception propagates
# with a normal traceback.
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)