doctools/help

OILS / doctools / help_gen.py View on Github | oils.pub

692 lines, 360 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	from typing import List, Any, Dict, Iterator, Tuple, Optional, IO
4	"""help_gen.py
5
6	Ideas for HTML -> ANSI converter:
7
8	- `ls` -> <code>ls</code> -> is reverse video?
9	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
10	- could also be bright blue
11	- <pre> is also indented 4 spaces, like the markdown
12	- red X <span class="X">X</span>
13
14	- comments in code examples could be green?
15
16	What about:
17
18	- headings h1, h2, h3, h4
19	- Right now cards use reverse video. Centering didn't look great.
20
21	- <ul> - you could use a Unicode bullet here
22	- <ol>
23
24	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
25	thing.
26	"""
27
28	import cStringIO
29	import HTMLParser
30	import os
31	import pprint
32	import re
33	import sys
34
35	from typing import AnyStr
36
37	from _devbuild.gen.htm8_asdl import h8_id
38	from data_lang import htm8
39	from doctools import html_lib
40	from doctools.util import log
41	from lazylex import html
42
43	#from typing import List, Tuple
44
# Matches a [Section] label, after optional leading whitespace.  Group 1 is
# the section name, which may contain ASCII letters, digits, spaces, and the
# characters '/', ':' and '-' (e.g. ysh:upgrade, byo-server-lib).  Section
# names are turned into anchors.
SECTION_RE = re.compile(
    r'''
\s*
\[
([a-zA-Z0-9 /:-]+) # allow ysh:upgrade, byo-server-lib, etc.
\]
''', re.VERBOSE)
54
# Complex heuristic to highlight topics.
#
# Groups:
#   1: optional 'X ' marker
#   2: the topic name (this is what gets linked)
#   3: optional trailer, or '()' for functions
#   4: the whitespace that terminates the topic
#
# The two alternations must be a bare '|'.  An escaped pipe would match a
# literal '|' character, and then neither 'func()' nor space-separated topics
# would match.
TOPIC_RE = re.compile(
    r'''
  (X[ ])?                  # optional deprecation symbol X, then a single space
  @?                       # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)
                           # topic names: osh-usage, _status, ysh:all, BASH_REMATCH
                           # List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*
                           # trailer like >> or (make)
  |
    \(\)                   # optional () for func()
  )?

  (                        # order of these 2 clauses matters
    [ ]*\n                 # spaces/newline
  |
    [ ]+                   # 1 or more spaces
  )
''', re.VERBOSE)
# No-op string literal.  NOTE(review): presumably kept so that editors which
# mis-parse the raw string above re-balance their quote highlighting.
"""
''', re.VERBOSE)
"""
80
81
82	def _StringToHref(s):
83	# lower case to match what doctools/cmark.py does
84	return s.lower().replace(' ', '-')
85
86
# Opening tag printed before an 'X' marker so that it renders in dark red.
# The code that emits it is responsible for printing the matching '</span>'.
X_LEFT_SPAN = '<span style="color: darkred">'
88
89
class TopicHtmlRenderer(object):
    """Insert links and highlighting into lines of a help topic index.

    Each Render() call that gets past the line-prefix check also appends a
    dict describing the line (section name, 'impl' flag, topics found) to the
    debug_out list given to the constructor.
    """

    # One or more spaces: the gap after a [Section], or a line's indentation.
    # Compiled once here instead of on every Render() call.
    _SPACE_1 = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        # type: (str, List, int) -> None
        """
        Args:
          chapter: chapter name; links point at 'chap-<chapter>.html'
          debug_out: list that receives one dict per rendered line
          linkify_stop_col: 1-based column; a topic whose name ends past this
            column stops linkification for the line.  -1 means no limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        # type: (Any, htm8.Output, Dict[str, Any]) -> None
        """Emit markup for one TOPIC_RE match and record it in line_info.

        Args:
          m: a TOPIC_RE match object
          out: the htm8.Output that the line is being written through
          line_info: the per-line debug dict; (topic, topic_impl) is appended
            to its 'topics' list
        """
        # The X marker, if present, is wrapped in a red span, and the topic is
        # recorded with topic_impl = False.
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        # type: (str) -> str
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged, and nothing is appended
          to debug_out for it.
        """
        f = cStringIO.StringIO()
        # NOTE(review): assumes htm8.Output copies spans of `line` to f via
        # PrintUntil() / PrintTheRest(), with Print() inserting literal text
        # in between -- confirm against data_lang/htm8.py.
        out = htm8.Output(line, f)

        pos = 0  # position within line

        section_impl = True

        # Every handled line has a 2-character prefix: either 'X ' or two
        # spaces.  The check must be 2 characters wide, because pos is
        # advanced by 2 in both cases.
        if line.startswith('X '):
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')
            pos = 2
            section_impl = False
        elif line.startswith('  '):
            pos = 2
        else:
            return line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name,
                                       preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACE_1.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        out.PrintTheRest()
        return f.getvalue()
204
205
# One card being accumulated by Splitter:
#   (tag, attrs, heading_parts, body_parts)
# The heading and the body are both lists of string fragments that are
# appended to as the parser runs, and joined later by the consumer.
CurGroup = Tuple[AnyStr, List[Tuple[AnyStr, AnyStr]], List[AnyStr], List[Any]]
207
208
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream into groups, one per heading tag.

    For *-help.html.

    Every start tag listed in heading_tags begins a new group
    (tag, attrs, heading_parts, body_parts).  Text inside the heading goes to
    heading_parts; text after it, up to the next heading, goes to body_parts.
    Text seen before the first heading is dropped.  Finished groups are
    appended to the `out` list.

    Start and end tags are counted, and end() raises if they don't balance,
    so the input should be the contents of <body> (see ExtractBody()).

    TODO: Rewrite this with lazylex!

    Ideas for later:
    - match <pre><code> blocks and re-indent
    - links <a href="">
    - inline <code></code>, bold, emphasis
    - <p> needs word wrapping, or at least space between paragraphs
    """

    def __init__(self, heading_tags, out):
        # type: (List[str], List[CurGroup]) -> None
        HTMLParser.HTMLParser.__init__(self)
        self.heading_tags = heading_tags
        self.out = out

        self.in_heading = False
        self.cur_group = None  # type: CurGroup

        # Nesting depth: +1 per start tag, -1 per end tag
        self.indent = 0

    def log(self, msg, *args):
        # type: (str, *Any) -> None
        """Debug logging, indented by nesting depth.  Disabled by default."""
        if 0:
            log(self.indent * ' ' + msg, *args)

    def _AppendText(self, s):
        # type: (AnyStr) -> None
        """Route a text fragment to the current heading or body."""
        if self.in_heading:
            self.cur_group[2].append(s)
        elif self.cur_group:
            self.cur_group[3].append(s)

    def handle_starttag(self, tag, attrs):
        # type: (AnyStr, List[Tuple[AnyStr, AnyStr]]) -> None
        if tag in self.heading_tags:
            self.in_heading = True
            # A new heading closes the group that was in progress
            if self.cur_group:
                self.out.append(self.cur_group)
            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        # type: (AnyStr) -> None
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        # type: (AnyStr) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        self._AppendText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        # type: (AnyStr) -> None
        self.log('data %r', data)
        self._AppendText(data)

    def end(self):
        # type: () -> None
        """Flush the last group, then verify that the tags were balanced.

        Raises:
          RuntimeError: if start and end tags did not balance
        """
        if self.cur_group:
            self.out.append(self.cur_group)

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
300
301
def ExtractBody(s):
    # type: (str) -> str
    """Return what lies between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.
    If no <body> start tag is found, the result is whatever was written to
    the output buffer, which this function never writes to in that case.
    """
    buf = cStringIO.StringIO()
    out = htm8.Output(s, buf)
    tag_lexer = htm8.TagLexer(s)

    tok_start = 0
    tokens = html.ValidTokens(s)
    # The same iterator is handed to ReadUntilEndTag() below, which advances
    # it; that is fine because we break right after.
    for tok_id, tok_end in tokens:
        found_body = False
        if tok_id == h8_id.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            found_body = tag_lexer.GetTagName() == 'body'

        if found_body:
            out.SkipTo(tok_end)  # right after <body>
            body_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'body')
            out.PrintUntil(body_end_left)
            break

        tok_start = tok_end

    return buf.getvalue()
335
336
def SplitIntoCards(heading_tags, contents):
    # type: (List[str], str) -> Iterator[Tuple[str, Any, str, str]]
    """Split an HTML document into cards, one per heading.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML document; only its <body> is examined

    Yields:
      (tag, attrs, heading, text) tuples.  `heading` has surrounding
      whitespace stripped.  `text` has leading and trailing newlines removed
      and then exactly one trailing newline added.

    Raises:
      RuntimeError: from Splitter.end(), if the body's tags are unbalanced
    """
    contents = ExtractBody(contents)

    groups = []  # type: List[CurGroup]
    sp = Splitter(heading_tags, groups)
    sp.feed(contents)
    sp.end()

    for tag, attrs, heading_parts, parts in groups:
        heading = ''.join(heading_parts).strip()

        # Only newlines are trimmed, so leading spaces on the first line (and
        # trailing spaces on the last) survive.
        text = ''.join(parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
358
359
def HelpTopics(s):
    # type: (str) -> Iterator[Tuple[str, str, str]]
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    For each <h2 id="..."> the section name is the heading's content up to
    the first '<', and the block of text comes from the next <code> element,
    whose class must start with 'language-chapter-links-'.
    """
    tag_lexer = htm8.TagLexer(s)

    prev_end = 0
    tokens = html.ValidTokens(s)
    # `tokens` is also advanced by the ReadUntil*() helpers below.
    for tok_id, tok_end in tokens:
        tok_begin, prev_end = prev_end, tok_end

        if tok_id != h8_id.StartTag:
            continue

        tag_lexer.Reset(tok_begin, tok_end)

        # Capture <h2 id="foo"> first
        if tag_lexer.GetTagName() != 'h2':
            continue

        h2_start_right = tok_end

        section_id = tag_lexer.GetAttrRaw('id')
        assert section_id, 'Expected id= in %r' % tag_lexer.WholeTagString()

        h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

        # Keep only the text before the first nested tag (removes HTML link)
        anchor_html = s[h2_start_right:h2_end_left]
        lt_pos = anchor_html.find('<')
        if lt_pos != -1:
            section_name = anchor_html[:lt_pos].strip()
        else:
            section_name = anchor_html

        # Now find the <code></code> span
        _, code_start_right = html.ReadUntilStartTag(tokens, tag_lexer,
                                                     'code')
        css_class = tag_lexer.GetAttrRaw('class')
        assert css_class is not None
        assert (css_class.startswith('language-chapter-links-')
                ), tag_lexer.WholeTagString()

        code_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'code')

        text = html.ToText(s, code_start_right, code_end_left)
        yield section_id, section_name, text
414
415
class DocNode(object):
    """A node in a tree that mirrors the document structure, for inspection."""

    def __init__(self, name, attrs=None, text=None):
        # type: (str, Optional[List[Tuple[str, str]]], Optional[str]) -> None
        """
        Args:
          name: label for this node
          attrs: tag attributes, for h2 and h3 links
          text: body text of the node, if any
        """
        self.name, self.attrs, self.text = name, attrs, text
        # Each node gets its own, initially empty, child list
        self.children = []
425
426
def CardsFromIndex(sh, out_prefix):
    # type: (str, str) -> None
    """Write one file per section of the index HTML read from stdin.

    Args:
      sh: prefix for the file names, e.g. osh or ysh
      out_prefix: directory that the files are written into

    Each file is named '<sh>-<section_id>' (e.g. ysh-overview) and contains
    the section name, a blank line, and then the section's text.
    """
    index_html = sys.stdin.read()

    num_sections = 0
    for section_id, section_name, text in HelpTopics(index_html):
        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

        with open(os.path.join(out_prefix, topic), 'w') as f:
            # section_id is printed dynamically
            f.write('%s\n\n' % section_name)
            f.write(text)

        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
450
451
def CardsFromChapters(
        out_dir,  # type: Optional[str]
        tag_level,  # type: str
        paths,  # type: List[str]
):
    # type: (...) -> Tuple[Dict[str, Optional[str]], DocNode]
    """Split chapters into cards, and build a tree of their structure.

    Args:
      out_dir: directory to write embedded cards into, or None to write
        nothing
      tag_level: the heading tag that defines a card, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)
      topic_to_chap maps topic ID to chapter name, or to None when the card
      is embedded (its heading has the attribute oils-embed="1").
      root_node is a DocNode tree: root -> page -> h2 -> h3.

    Raises:
      RuntimeError: if an <h3> appears before any <h2>, or from
        SplitIntoCards() on unbalanced HTML
    """
    topic_to_chap = {}  # type: Dict[str, Optional[str]]

    root_node = DocNode('/')
    cur_h2_node = None  # type: Optional[DocNode]

    for path in paths:
        with open(path) as f:
            contents = f.read()

        filename = os.path.basename(path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-')
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # Use the explicit id= if there is exactly one, else derive an ID
            # from the heading text
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    # Without this check, the append below fails with an
                    # AttributeError on None that doesn't say which file is
                    # at fault.
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (path, topic_id))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                # Distinct names, so the chapter's `path` and `f` above are
                # not clobbered.
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as card_f:
                    card_f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
526
527
class StrPool(object):
    """Assigns a C++ global variable name to each distinct string.

    Attributes:
      var_names: dict from string to its variable name (gStr1, gStr2, ...)
      global_strs: GLOBAL_STR(...) declarations, in insertion order
      unique_id: the number that the next new string will get
    """

    def __init__(self):
        # type: () -> None
        self.unique_id = 1
        self.global_strs = []
        self.var_names = {}

    def Add(self, s):
        # type: (str) -> None
        """Register s, unless it was already registered."""
        import json

        if s not in self.var_names:
            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = name
550
551
def WriteTopicDict(topic_dict, header_f, cc_f):
    # type: (Dict[str, Optional[str]], IO[bytes], IO[bytes]) -> None
    """Write the topic dict as a C++ header and source file.

    Args:
      topic_dict: topic ID -> chapter name, or None
      header_f: receives the declaration of help_meta::TopicMetadata()
      cc_f: receives the string globals, the gTopics dict, and the definition
        of TopicMetadata()

    A value of None is written as nullptr.
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # Strings are pooled in the order key, value, key, value, ... so that the
    # variable numbering is a function of dict order alone.
    for k, v in topic_dict.iteritems():
        pool.Add(k)
        key_names.append(pool.var_names[k])

        if v is None:
            val_names.append('nullptr')
        else:
            pool.Add(v)
            val_names.append(pool.var_names[v])

    str_decls = '\n'.join(pool.global_strs)
    keys_cpp = ' COMMA '.join(key_names)
    vals_cpp = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % (str_decls, len(topic_dict), keys_cpp, vals_cpp))
597
598
def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    Usage:
      help_gen.py cards-from-index SH OUT_PREFIX        (HTML on stdin)
      help_gen.py cards-from-chapters OUT_DIR PY_OUT CC_PREFIX CHAP_HTML...
      help_gen.py ref-check (toc-*.md | chap-*.html)...

    Raises:
      RuntimeError: if the action is missing or invalid, or if a ref-check
        path is neither toc-*.md nor chap-*.html
    """
    if len(argv) < 2:
        # Without this check, argv[1] raises an IndexError, which the
        # __main__ handler does not catch.
        raise RuntimeError(
            'Expected an action: cards-from-index, cards-from-chapters, '
            'or ref-check')
    action = argv[1]

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':

        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # This is Python source, so the function body must be indented.
            f.write('''

from typing import Dict

def TopicMetadata():
    # type: () -> Dict[str, str]
    return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML.  The local is not named `html`, so
                # that it doesn't shadow the imported html module.
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*

                box_nodes = []
                toc_html = oils_doc.HighlightCode(toc_html,
                                                  None,
                                                  debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        topics, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
685
686
# Script entry point.  A RuntimeError raised by main() is reported on stderr
# as a one-line FATAL message and the process exits with status 1; any other
# exception propagates with a traceback.
if __name__ == '__main__':
    try:
        main(sys.argv)
    except RuntimeError as e:
        print('FATAL: %s' % e, file=sys.stderr)
        sys.exit(1)