doctools/help

OILS / doctools / help_gen.py View on Github | oilshell.org

659 lines, 351 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3	"""help_gen.py
4
5	Ideas for HTML -> ANSI converter:
6
7	- `ls` -> <code>ls</code> -> is reverse video?
8	- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
9	- could also be bright blue
10	- <pre> is also indented 4 spaces, like the markdown
11	- red X <span class="X">X</span>
12
13	- comments in code examples could be green?
14
15	What about:
16
17	- headings h1, h2, h3, h4
18	- Right now cards use reverse video. Centering didn't look great.
19
20	- <ul> - you could use a Unicode bullet here
21	- <ol>
22
23	Word wrapping? troff/groff doesn't do it, but they do this weird right-justify
24	thing.
25	"""
26
27	import cStringIO
28	import HTMLParser
29	import os
30	import pprint
31	import re
32	import sys
33
34	from doctools import html_lib
35	from doctools.util import log
36	from lazylex import html
37
38	#from typing import List, Tuple
39
# Matches a bracketed [Section Name], with optional leading whitespace.
# Group 1 is the name inside the brackets: alphanumeric characters, spaces,
# '/' for I/O, and ':' and '-'.  Section names are turned into anchors.
SECTION_RE = re.compile(
    r'''
\s*
\[
([a-zA-Z0-9 /:-]+) # allow ysh:upgrade, byo-server-lib, etc.
\]
''', re.VERBOSE)
49
# Complex heuristic to highlight topics.
#
# Groups:
#   1. optional 'X ' deprecation marker
#   2. the topic name, which is the part that gets linked
#   3. optional trailer that follows the name
#   4. the whitespace that terminates the topic
#
# The '|' characters below must NOT be backslash-escaped: an escaped pipe
# matches a literal '|' instead of separating the alternatives.
TOPIC_RE = re.compile(
    r'''
  (X[ ])?                       # optional deprecation symbol X, then a single space
  @?                            # optional @array, e.g. @BASH_SOURCE

  ([a-zA-Z_][a-zA-Z0-9/:_-]+)   # topic names: osh-usage, _status, ysh:all,
                                # BASH_REMATCH, List/append, cmd/append

  ( [ ] [^a-zA-Z0-9 ] \S*       # trailer like >> or (make)
  |
    \(\)                        # optional () for func()
  )?

  (                             # order of these 2 clauses matters
    [ ]*\n                      # spaces/newline
  |
    [ ]+                        # 1 or more spaces
  )
  ''', re.VERBOSE)
75
76
77	def _StringToHref(s):
78	# lower case to match what doctools/cmark.py does
79	return s.lower().replace(' ', '-')
80
81
# Opening tag printed before an 'X' deprecation marker to color it dark red.
# The matching '</span>' is printed separately, after the marker itself.
X_LEFT_SPAN = '<span style="color: darkred">'
83
84
class TopicHtmlRenderer(object):
    """Insert links and deprecation markup into the lines of a topic index.

    Each Render() call handles one line.  A line may begin with an 'X '
    marker or two spaces, then an optional [Section Name], then topics
    separated by spaces.  Section names and topics become links into
    chap-<chapter>.html, and 'X ' markers are wrapped in a dark red <span>.
    """

    # Whitespace after a section, or the leading whitespace of a line.
    # Compiled once here rather than on every Render() call.
    _SPACE_RE = re.compile(r'[ ]+')

    def __init__(self, chapter, debug_out, linkify_stop_col):
        """
        Args:
          chapter: chapter name; links point at chap-<chapter>.html
          debug_out: list that receives one dict of structured data per
            rendered line
          linkify_stop_col: 1-based column limit.  A topic whose name ends
            past this column is not linked, and neither is anything after
            it.  -1 means no limit.
        """
        self.chapter = chapter
        self.debug_out = debug_out
        self.linkify_stop_col = linkify_stop_col

        self.html_page = 'chap-%s.html' % chapter

    def _PrintTopic(self, m, out, line_info):
        """Emit markup for one TOPIC_RE match and record it in line_info.

        Args:
          m: match object from TOPIC_RE
          out: the html.Output for the line being rendered
          line_info: dict for this line; (topic, topic_impl) is appended to
            line_info['topics'], where topic_impl is False when the topic
            carries the X marker
        """
        # The X
        topic_impl = True
        if m.group(1):
            out.PrintUntil(m.start(1))
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(m.end(1))
            out.Print('</span>')
            topic_impl = False

        # The topic name to link
        topic = m.group(2)
        line_info['topics'].append((topic, topic_impl))

        out.PrintUntil(m.start(2))
        out.Print('<a href="%s#%s">' % (self.html_page, topic))
        out.PrintUntil(m.end(2))
        out.Print('</a>')

    def Render(self, line):
        """Convert a line of text to HTML.

        Topics are highlighted and X made red.

        Unless the line is returned unchanged, a dict with the keys
        'section', 'impl' and 'topics' describing the line is appended to
        self.debug_out.

        Args:
          line: RAW SPAN of HTML that is already escaped.

        Returns:
          The HTML with some tags inserted.  A line that starts with neither
          'X ' nor two spaces is returned unchanged.
        """
        # Both recognized prefixes are exactly 2 characters wide, which is
        # why parsing always resumes at position 2 below.  Checking this
        # before allocating the output buffer keeps the common "not an
        # index line" case cheap.
        if line.startswith('X '):
            section_impl = False
        elif line.startswith('  '):
            section_impl = True
        else:
            return line

        f = cStringIO.StringIO()
        out = html.Output(line, f)

        if not section_impl:
            out.Print(X_LEFT_SPAN)
            out.PrintUntil(2)
            out.Print('</span>')

        pos = 2  # position within line

        # Highlight [Section] at the start of a line.
        m = SECTION_RE.match(line, pos)
        if m:
            section_name = m.group(1)
            href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

            out.PrintUntil(m.start(1))
            out.Print('<a href="%s#%s" class="level2">' %
                      (self.html_page, href))
            out.PrintUntil(m.end(1))  # anchor
            out.Print('</a>')

            pos = m.end(0)  # ADVANCE
        else:
            section_name = None

        line_info = {
            'section': section_name,
            'impl': section_impl,
            'topics': []
        }
        self.debug_out.append(line_info)

        # Whitespace after section, or leading whitespace
        m = self._SPACE_RE.match(line, pos)
        assert m, 'Expected whitespace %r' % line

        pos = m.end()

        # Keep matching topics until it doesn't match.
        while True:
            m = TOPIC_RE.match(line, pos)
            if not m:
                break

            pos = m.end()

            # The 1-based column number of the end of this topic
            col = m.end(2) + 1
            if self.linkify_stop_col != -1 and col > self.linkify_stop_col:
                break

            self._PrintTopic(m, out, line_info)

        # Whatever was not consumed above is copied through as-is.
        out.PrintTheRest()
        return f.getvalue()
196
197
class Splitter(HTMLParser.HTMLParser):
    """Split an HTML stream starting at each of the heading tags.

    For *-help.html.

    Each group is a tuple (tag, attrs, heading parts, body parts).  Finished
    groups are appended to the 'out' list; call end() after feed() to flush
    the last one.

    TODO: Rewrite this with lazylex!

    Algorithm:
    - ExtractBody() first, then match balanced tags
    - SPLIT by h2, h3, h4
    - Match <pre><code> blocks and re-indent
    - Later:
      - links <a href="">
      - `` is turned into inline <code></code>
      - ** ** for bold
      - * * for emphasis
      - <p> needs word wrapping!  Oops.
        - actually cmark seems to preserve this?  OK maybe not.
        - we just need space between <p>
    """

    def __init__(self, heading_tags, out):
        HTMLParser.HTMLParser.__init__(self)
        self.out = out
        self.heading_tags = heading_tags

        # The group being accumulated; None until the first heading is seen.
        self.cur_group = None  # type-not-checked: List[Tuple[str, str, List, List]]
        self.in_heading = False

        # Tag nesting depth, checked for balance in end()
        self.indent = 0

    def log(self, msg, *args):
        """Debug logging, indented by nesting depth.  Disabled with 'if 0'."""
        ind = self.indent * ' '
        if 0:
            log(ind + msg, *args)

    def _Flush(self):
        """Append the current group, if there is one, to the output list."""
        if self.cur_group:
            self.out.append(self.cur_group)

    def _AddText(self, text):
        """Route text to the heading or the body of the current group.

        Text seen before the first heading is dropped.
        """
        if self.in_heading:
            self.cur_group[2].append(text)
        elif self.cur_group:
            self.cur_group[3].append(text)

    def handle_starttag(self, tag, attrs):
        if tag in self.heading_tags:
            # A heading closes the previous group and opens a new one.
            self.in_heading = True
            self._Flush()
            self.cur_group = (tag, attrs, [], [])

        self.log('[%d] <> %s %s', self.indent, tag, attrs)
        self.indent += 1

    def handle_endtag(self, tag):
        if tag in self.heading_tags:
            self.in_heading = False

        self.log('[%d] </> %s', self.indent, tag)
        self.indent -= 1

    def handle_entityref(self, name):
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # Looked up before anything else, so an unknown name is a KeyError
        # even when the text would have been dropped.
        self._AddText(html.CHAR_ENTITY[name])

    def handle_data(self, data):
        self.log('data %r', data)
        self._AddText(data)

    def end(self):
        """Flush the last group, then check that tags were balanced."""
        self._Flush()

        # Maybe detect nesting?
        if self.indent == 0:
            return
        raise RuntimeError('Unbalanced HTML tags: indent=%d, cur_group=%s' %
                           (self.indent, self.cur_group))
282
283
def ExtractBody(s):
    """Return the part of s that lies between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    If no <body> start tag is found, nothing is printed and the result is
    the empty string.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tokens = html.ValidTokens(s)
    tok_start = 0  # where the current token begins
    while True:
        tok = next(tokens, None)
        if tok is None:  # tokens exhausted
            break
        tok_id, end_pos = tok

        if tok_id == html.StartTag:
            tag_lexer.Reset(tok_start, end_pos)
            if tag_lexer.TagName() == 'body':
                # Output starts right after <body> ...
                out.SkipTo(end_pos)
                # ... and stops at the left edge of </body>.  The same token
                # iterator is handed over so the search resumes from here.
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = end_pos

    return buf.getvalue()
316
317
def SplitIntoCards(heading_tags, contents):
    """Yield (tag, attrs, heading, text) for each heading-delimited group.

    Args:
      heading_tags: tag names that start a new card, e.g. ['h2', 'h3', 'h4']
      contents: a whole HTML page; only what is inside <body> is split

    heading has surrounding whitespace stripped.  text has leading and
    trailing newlines removed and ends with exactly one newline.
    """
    body = ExtractBody(contents)

    groups = []
    splitter = Splitter(heading_tags, groups)
    splitter.feed(body)
    splitter.end()

    for group in groups:
        tag, attrs, heading_parts, body_parts = group

        heading = ''.join(heading_parts).strip()

        # Only newlines are stripped, so leading spaces are kept.
        text = ''.join(body_parts).strip('\n') + '\n'

        yield tag, attrs, heading, text
338
339
def HelpTopics(s):
    """
    Given a rendered toc-{osh,ysh}.html

    yield groups (section_id, section_name, block of text)

    Each group comes from an <h2 id="..."> heading and the next <code> span
    after it, whose class must start with 'language-chapter-links-'.
    """
    tag_lexer = html.TagLexer(s)
    tokens = html.ValidTokens(s)

    tok_start = 0
    while True:
        tok = next(tokens, None)
        if tok is None:  # tokens exhausted
            break
        tok_id, end_pos = tok

        if tok_id == html.StartTag:
            tag_lexer.Reset(tok_start, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                section_id = tag_lexer.GetAttr('id')
                assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

                h2_end_left, _ = html.ReadUntilEndTag(tokens, tag_lexer, 'h2')

                # Heading contents run from right after <h2 ...> to </h2>.
                heading_html = s[end_pos:h2_end_left]

                # Keep only what precedes the first tag (removes HTML link).
                tag_pos = heading_html.find('<')
                if tag_pos != -1:
                    section_name = heading_html[:tag_pos].strip()
                else:
                    section_name = heading_html

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    tokens, tag_lexer, 'code')
                css_class = tag_lexer.GetAttr('class')
                assert css_class is not None
                assert css_class.startswith(
                    'language-chapter-links-'), tag_lexer.TagString()

                code_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield section_id, section_name, text

        tok_start = end_pos
392
393
class DocNode(object):
    """A tree node, used to visualize doc structure.

    Attributes:
      name: label of the node
      attrs: tag attributes, for h2 and h3 links
      text: body text, if any
      children: list of child nodes, initially empty
    """

    def __init__(self, name, attrs=None, text=None):
        self.children = []
        self.text = text
        self.attrs = attrs  # for h2 and h3 links
        self.name = name
402
403
def CardsFromIndex(sh, out_prefix):
    """Write one card file per section of the index page read from stdin.

    Args:
      sh: prefix for the file names, e.g. osh or ysh
      out_prefix: directory the files are written to

    Each file is named <sh>-<section_id> and holds the section name, a blank
    line, and then the section's text.
    """
    index_html = sys.stdin.read()

    num_sections = 0
    for section_id, section_name, text in HelpTopics(index_html):
        if 0:
            log('section_id = %r', section_id)
            log('section_name = %r', section_name)
            log('')

        topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview
        out_path = os.path.join(out_prefix, topic)

        with open(out_path, 'w') as f:
            # section_id is printed dynamically
            f.write('%s\n\n' % section_name)
            f.write(text)

        log(' Wrote %s', out_path)
        num_sections += 1

    log(' (doctools/make_help) -> %d sections -> %s', num_sections,
        out_prefix)
426
427
def CardsFromChapters(out_dir, tag_level, paths):
    """Split chapter pages into cards and build a tree of their headings.

    Args:
      out_dir: directory to write embedded cards to, or None to write nothing
      tag_level: the heading tag whose cards become topics, e.g. 'h3'
      paths: list of chap-*.html to read

    Returns:
      (topic_to_chap, root_node)

      topic_to_chap maps each topic ID to its chapter name, or to None when
      the heading has the attribute oils-embed="1".

      root_node is a DocNode named '/' with one child per page.  Each page
      node has the page's <h2> nodes as children, and each <h2> node has the
      <h3> nodes that follow it as children.

    Raises:
      RuntimeError: if an <h3> is seen before any <h2>, since there is no
        section to attach it to.
    """
    topic_to_chap = {}

    root_node = DocNode('/')
    cur_h2_node = None

    for chap_path in paths:
        with open(chap_path) as f:
            contents = f.read()

        filename = os.path.basename(chap_path)

        tmp, _ = os.path.splitext(filename)
        assert tmp.startswith('chap-')
        chapter_name = tmp[len('chap-'):]

        page_node = DocNode(filename)

        cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

        for tag, attrs, heading, text in cards:
            # Prefer an explicit id= attribute; otherwise derive the ID from
            # the heading text.
            values = [v for k, v in attrs if k == 'id']
            id_value = values[0] if len(values) == 1 else None

            topic_id = (id_value if id_value else html_lib.PrettyHref(
                heading, preserve_anchor_case=True))

            if tag == 'h2':
                h2 = DocNode(topic_id, attrs=attrs)
                page_node.children.append(h2)
                cur_h2_node = h2
            elif tag == 'h3':
                if cur_h2_node is None:
                    # Fail with a message that names the culprit, instead of
                    # an AttributeError on None.
                    raise RuntimeError(
                        '%s: <h3> %r appears before any <h2>' %
                        (chap_path, heading))
                # attach text so we can see which topics have empty bodies
                h3 = DocNode(topic_id, attrs=attrs, text=text)
                cur_h2_node.children.append(h3)

            if tag != tag_level:
                continue  # we only care about h3 now

            if 0:
                log('tag = %r', tag)
                log('topic_id = %r', topic_id)
                log('heading = %r', heading)
                log('text = %r', text[:20])

            embed = ('oils-embed', '1') in attrs

            if out_dir is not None and embed:
                # indices start with _
                card_path = os.path.join(out_dir, topic_id)
                with open(card_path, 'w') as f:
                    f.write(text)

            # help builtin will show URL if there's a chapter name
            topic_to_chap[topic_id] = None if embed else chapter_name

        root_node.children.append(page_node)

    num_sections = sum(len(child.children) for child in root_node.children)

    log(
        '%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
        len(paths), len(topic_to_chap), num_sections, out_dir)

    return topic_to_chap, root_node
497
498
class StrPool(object):
    """Assigns a global variable name to each distinct string.

    Attributes:
      var_names: dict of string -> variable name (gStr1, gStr2, ...)
      global_strs: list of GLOBAL_STR(...) declarations, in insertion order
      unique_id: the number that the next new string will get
    """

    def __init__(self):
        self.var_names = {}
        self.global_strs = []
        self.unique_id = 1

    def Add(self, s):
        """Register s, unless it was already added."""
        if s not in self.var_names:
            import json

            name = 'gStr%d' % self.unique_id
            self.unique_id += 1

            # Use JSON as approximation for C++ string
            decl = 'GLOBAL_STR(%s, %s)' % (name, json.dumps(s))
            self.global_strs.append(decl)

            self.var_names[s] = name
519
520
def WriteTopicDict(topic_dict, header_f, cc_f):
    """Write topic_dict as a C++ header and source file.

    Args:
      topic_dict: dict of topic -> chapter name, where the value may be None
      header_f: file to write the declaration of TopicMetadata() to
      cc_f: file to write the string constants, the dict, and the definition
        of TopicMetadata() to

    A value of None is written as nullptr.
    """
    header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr, BigStr>* TopicMetadata();
}
''')

    pool = StrPool()
    key_names = []
    val_names = []

    # Strings are added in the order key, value, key, value, ... which fixes
    # the numbering of the generated variable names.
    for topic, chapter in topic_dict.iteritems():
        pool.Add(topic)
        key_names.append(pool.var_names[topic])

        if chapter is None:
            val_names.append('nullptr')
        else:
            pool.Add(chapter)
            val_names.append(pool.var_names[chapter])

    str_decls = '\n'.join(pool.global_strs)
    keys = ' COMMA '.join(key_names)
    vals = ' COMMA '.join(val_names)

    cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr, BigStr, %d, {%s}, {%s});

Dict<BigStr, BigStr>* TopicMetadata() {
return gTopics;
}
}
''' % (str_decls, len(topic_dict), keys, vals))
565
566
def main(argv):
    """Run the action named by argv[1].

    Actions:
      cards-from-index SH OUT_PREFIX
        Read HTML from stdin and write one card file per section.
      cards-from-chapters OUT_DIR PY_OUT CC_PREFIX PAGE...
        Write embedded cards to OUT_DIR, and write the topic dict as Python
        (PY_OUT) and as C++ (CC_PREFIX.h and CC_PREFIX.cc).
      ref-check PATH...
        Compare toc-*.md files against chap-*.html files.

    Raises:
      RuntimeError: if the action is missing or unknown, or if ref-check is
        given a file that is neither .md nor .html.
    """
    try:
        action = argv[1]
    except IndexError:
        # Reported like every other usage error, not as a traceback.
        raise RuntimeError('Action required')

    if action == 'cards-from-index':
        sh = argv[2]  # osh or ysh
        out_prefix = argv[3]

        # Read HTML from stdin
        # TODO: could pass a list of files to speed it up
        CardsFromIndex(sh, out_prefix)

    elif action == 'cards-from-chapters':

        out_dir = argv[2]
        py_out = argv[3]
        cc_prefix = argv[4]
        pages = argv[5:]

        topic_to_chap, _ = CardsFromChapters(out_dir, 'h3', pages)

        # Write topic dict as Python and C++

        with open(py_out, 'w') as f:
            f.write('TOPICS = %s\n' % pprint.pformat(topic_to_chap))

            # The function body below must stay indented: this text is
            # written out as Python source.
            f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

        h_path = cc_prefix + '.h'
        cc_path = cc_prefix + '.cc'

        with open(h_path, 'w') as header_f:
            with open(cc_path, 'w') as cc_f:
                WriteTopicDict(topic_to_chap, header_f, cc_f)

    elif action == 'ref-check':
        from doctools import cmark
        from doctools import oils_doc
        from doctools import ref_check

        chapters = []
        all_toc_nodes = []

        for path in argv[2:]:
            filename = os.path.basename(path)

            if filename.endswith('.md'):
                assert filename.startswith('toc-'), path

                # First convert to HTML
                with open(path) as in_file:
                    toc_html = cmark.md2html(in_file.read())

                # Now highlight code, which gives debug output for the
                # language-chapter-links-*

                box_nodes = []
                # Only the debug output is used; the returned HTML is not.
                oils_doc.HighlightCode(toc_html, None, debug_out=box_nodes)
                all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

            elif filename.endswith('.html'):
                assert filename.startswith('chap-'), path
                chapters.append(path)

            else:
                raise RuntimeError('Expected toc-* or chap-*, got %r' %
                                   filename)

        _, chap_tree = CardsFromChapters(None, 'h3', chapters)

        log('')

        # Compare TOC vs. chapters
        ref_check.Check(all_toc_nodes, chap_tree)

    else:
        raise RuntimeError('Invalid action %r' % action)
652
653
if __name__ == '__main__':
    # A RuntimeError becomes a one-line message on stderr and exit status 1.
    # Any other exception propagates as usual.
    status = 0
    try:
        main(sys.argv)
    except RuntimeError as e:
        print('FATAL: %s' % e, file=sys.stderr)
        status = 1

    if status != 0:
        sys.exit(status)