doctools/cmark.py

OILS / doctools / cmark.py View on Github | oils.pub

538 lines, 306 significant

1	#!/usr/bin/env python2
2	"""Convert Markdown to HTML, with our enhancements
3
4	- Parse the HTML
5	- insert a TOC
6	- <pstrip> hack - this is obsolete with ul-table?
7	- Expand $xref links
8	- Highlight code blocks
9
10	I started from cmark-0.28.3/wrappers/wrapper.py.
11	"""
12	from __future__ import print_function
13
14	try:
15	from HTMLParser import HTMLParser
16	except ImportError:
17	# python3
18	from html.parser import HTMLParser # type: ignore
19	import json
20	import optparse
21	import os
22	import pprint
23	import subprocess
24	import sys
25
26	from doctools import html_lib
27	from doctools import doc_html # templates
28	from doctools import oils_doc
29	from doctools import ul_table
30	from data_lang import htm8
31
32	if sys.version_info.major == 2:
33	from typing import Any, List, Dict, Tuple, Union, Optional, IO
34
35
def log(msg, *args):
    # type: (str, Any) -> None
    """Format a debug message; the actual print to stderr is switched off.

    Args:
      msg: The message, or a %-style format string when args are given.
      *args: Values interpolated into msg with the % operator.
    """
    text = msg % args if args else msg

    # Flip this constant to 1 to get the debug trace on stderr.
    if 0:
        print(text, file=sys.stderr)
43
44
# Directory of the script being run.  This comes from sys.argv[0], not from
# this module's own location.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
# Install prefixes that may contain bin/cmark.  cmark_bin() tries the NEW
# location first and falls back to the OLD one.
NEW_CMARK_WEDGE_DIR = os.path.join(this_dir, '../../oils.DEPS/wedge/cmark/0.29.0')

OLD_CMARK_WEDGE_DIR = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0'
49
50
def cmark_bin(md):
    # type: (str) -> str
    """Convert Markdown to HTML by piping it through the cmark binary.

    Args:
      md: Markdown source; it is written to cmark's stdin.

    Returns:
      Whatever cmark wrote to stdout (the rendered HTML).

    Raises:
      AssertionError: If bin/cmark exists in neither wedge directory.
      RuntimeError: If cmark exits with a non-zero status.
    """
    # Prefer the new wedge location, then fall back to the old one.
    candidates = [
        os.path.join(NEW_CMARK_WEDGE_DIR, 'bin/cmark'),
        os.path.join(OLD_CMARK_WEDGE_DIR, 'bin/cmark'),
    ]
    for cmark_path in candidates:
        if os.path.exists(cmark_path):
            break
    else:
        raise AssertionError('bin/cmark not found')

    # Need to render raw HTML
    p = subprocess.Popen([cmark_path, '--unsafe'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    stdout, _ = p.communicate(input=md)

    # Don't hand truncated or empty HTML to the rest of the pipeline when the
    # conversion itself failed.
    if p.returncode != 0:
        raise RuntimeError('%s exited with status %d' %
                           (cmark_path, p.returncode))
    return stdout
69
70
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']
        # Only used to indent the debug log.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True between an opening heading tag and its closing tag.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts).
        # html_parts is like innerHTML.  There can be <code> annotations and
        # so forth.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Record TOC placeholder positions and start capturing headings."""
        if tag == 'div':
            # The placeholder must have exactly one attribute, the id.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        # This runs BEFORE the heading check below, so the heading's own
        # start tag is never part of the captured HTML.
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing at the end of a heading; otherwise keep the tag."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        # Capturing is switched off BEFORE the append below, so the heading's
        # own end tag is never part of the captured HTML.
        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
        This method is called to process a named character reference of the
        form &name; (e.g. &gt;), where name is a general entity reference
        (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """Keep numeric character references (&#39;, &#x2014;) in headings.

        The base class hook does nothing, so without this override such
        references vanish from the TOC entry.  Like handle_entityref(), this
        only adds to the heading's HTML, not to its plain text.

        Args:
          name: The reference without '&#' and ';', e.g. '39' or 'x2014'.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        """Add text inside a heading to both its HTML and its plain text."""
        # Debug print
        if self.indent > 0:
            # Same string as before; the backslash is now escaped explicitly.
            log('%s\\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
177
178
# CSS class used for a TOC entry, keyed by the heading tag it links to.
TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}

# We could just add <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
# The %s is the anchor name; the result is one complete line of HTML.
ANCHOR_FMT = '<a name="%s"></a>\n'
185
186
187	def _MakeTocInsertions(
188	opts, # type: Any
189	toc_tags, # type: Union[List[str], Tuple[str, str]]
190	headings, # type: List[Tuple[int, str, None, List[str], List[str]]]
191	toc_pos, # type: int
192	preserve_anchor_case, # type: bool
193	):
194	# type: (...) -> List[Tuple[int, str]]
195	"""Given extract headings list and TOC position, return a list of insertions.
196
197	The insertions <div> for the TOC itself, and <a name=""> for the targets.
198
199	Args:
200	toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
201	all of them.
202	"""
203	# Example:
204	# <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
205	#
206	# Yeah it's just a flat list, and then indentation is done with CSS. Hm
207	# that's easy.
208
209	toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
210	insertions = []
211
212	i = 0
213	for line_num, tag, css_id, html_parts, text_parts in headings:
214	css_class = TAG_TO_CSS[tag]
215
216	# Add BOTH href, for stability.
217	numeric_href = 'toc_%d' % i
218
219	# If there was an explicit CSS ID written by the user, use that as the href.
220	# I used this in the blog a few times.
221
222	pretty_href = html_lib.PrettyHref(
223	''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
224
225	if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
226	toc_href = css_id
227	else:
228	# Always use the pretty version now. The old numeric version is still a
229	# target, but not in the TOC.
230	toc_href = pretty_href
231
232	line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
233	css_class, toc_href, ''.join(html_parts))
234	if tag in toc_tags:
235	toc_lines.append(line)
236
237	targets = []
238	if opts.toc_pretty_href: # NEW WAY
239	targets.append(ANCHOR_FMT % pretty_href)
240	elif css_id: # Old blog explicit
241	targets.append(ANCHOR_FMT % css_id)
242	targets.append(ANCHOR_FMT % numeric_href)
243	else: # Old blog implicit
244	targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
245	targets.append(ANCHOR_FMT % numeric_href)
246
247	insertions.append((line_num, ''.join(targets)))
248
249	i += 1
250
251	# +1 to insert AFTER the <div>
252	toc_insert = (toc_pos + 1, ''.join(toc_lines))
253	insertions.insert(0, toc_insert) # The first insertion is TOC
254
255	return insertions
256
257
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """For the dense-toc style with columns, used by doc/ref

    The plain TOC style is simpler: it outputs a div for every line:

      <div id="toctitle">Table of Contents</div>

      <div class="toclevel1"><a ...> Level 1 </a></div>
      <div class="toclevel2"><a ...> 1.A </a></div>
      <div class="toclevel2"><a ...> 1.B </a></div>
      <div class="toclevel1"><a ...> Level 2 </a></div>
      ...

    Here we want one group per h2, with its h3 children inside:

      <div id="dense-toc-title">In This Chapter</div>

      <div id="dense-toc-cols">
        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>

          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>

        </div>  # NO BREAKING within this div

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>
      </div>

    Args:
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples.
      toc_pos: Line number of the <div id="dense-toc"> tag.
      preserve_anchor_case: Passed through to html_lib.PrettyHref().

    Returns:
      List of (line_num, html); the TOC itself comes first, then one
      <a name=""> target per heading.
    """
    # Two level tree: [(h2_html, h2_href, [(h3_html, h3_href), ...]), ...]
    groups = []
    open_group = None

    anchors = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise the TOC links to
        # the pretty version.
        toc_href = css_id or pretty_href

        entry_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (entry_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((entry_html, toc_href))
        # Other heading tags get a target below, but no TOC entry.

        # Insert the target <a name="">
        anchors.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_html = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_html.append('<div class="dense-toc-group">\n')
        toc_html.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_html.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children)
        toc_html.append('</div>\n')
    toc_html.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_html))
    log('')

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    return [(toc_pos + 1, ''.join(toc_html))] + anchors
354
355
356	def _ApplyInsertions(lines, insertions, out_file):
357	# type: (List[str], List[Tuple[int, str]], IO[str]) -> None
358	assert insertions, "Should be at least one insertion"
359	j = 0
360	n = len(insertions)
361
362	for i, line in enumerate(lines):
363	current_line = i + 1 # 1-based
364
365	if j < n:
366	line_num, s = insertions[j]
367	if current_line == line_num:
368	out_file.write(s)
369	j += 1
370
371	out_file.write(line)
372
373
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Convert Markdown from in_file to HTML on out_file, inserting a TOC.

    Args:
      opts: Parsed options; reads code_block_output, toc_tags, and (via
        _MakeTocInsertions) toc_pretty_href.
      meta: Document metadata; reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: Markdown source; read in full.
      out_file: Receives the final HTML.
      use_fastlex: If false, skip the oils_doc / ul_table processing.
      debug_out: Passed to oils_doc.HighlightCode(); a fresh list is used when
        None.

    Raises:
      htm8.ParseError: Re-raised from ul_table.ReplaceTables() after a message
        is printed to stderr.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = cmark_bin(in_file.read())
    #print(html, file=sys.stderr)

    # Now process HTML with oils_doc
    # NOTE(review): everything up to HighlightCode() is assumed to be nested
    # under use_fastlex -- confirm against the original indentation.
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as f:
                f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                        opts.code_block_output)
                # The return value is not used afterwards.
                text = oils_doc.ExtractCode(html, f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark seems
        # to require?
        html = html.replace('<p><pstrip>', '')
        html = html.replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except htm8.ParseError as e:
            # %r of in_file prints the file object's repr.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    # So by default only h3 and h4 are shown in the TOC.
    if opts.toc_tags:
        toc_tags = opts.toc_tags
    else:
        toc_tags = ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    # Any truthy metadata value turns this on.
    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    # <div id="toc"> takes precedence over <div id="dense-toc">.
    if parser.toc_begin_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        parser.toc_begin_line,
                                        preserve_anchor_case)
    elif parser.dense_toc_begin_line != -1:
        insertions = _MakeTocInsertionsDense(parser.headings,
                                             parser.dense_toc_begin_line,
                                             preserve_anchor_case)
    else:  # No TOC placeholder found
        out_file.write(html)  # Pass through
        return

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(lines, insertions, out_file)
459
460
def Options():
    # type: () -> Any
    """Build the optparse parser for the cmark.py command line.

    Returns:
      An optparse.OptionParser; call parse_args() on it.
    """
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, keyword arguments for add_option), in --help order.  The table is
    # rebuilt on every call, so each parser gets its own default list.
    flag_specs = [
        ('--common-mark',
         dict(action='store_true',
              default=False,
              help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(action='store_true',
              default=False,
              help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(dest='disable_fastlex',
              action='store_true',
              default=False,
              help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]
    for flag, settings in flag_specs:
        parser.add_option(flag, **settings)

    return parser
492
493
# width 40 by default
# main() copies this dict and then layers the JSON metadata on top of it.
DEFAULT_META = {'body_css_class': 'width40'}
496
497
def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    There are three modes:
    - --common-mark: plain CommonMark conversion from stdin to stdout.
    - Exactly two args (JSON metadata, content file): an Oils doc, wrapped in
      the doc header and footer.
    - Otherwise: a stdin -> stdout filter with optional JSON metadata, used
      for the blog and for benchmarks.

    Args:
      argv: Full argument list, including the program name at index 0.
    """
    opts, argv = Options().parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(cmark_bin(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter usage for blog and for benchmarks.

        # Metadata is optional here
        try:
            with open(argv[1]) as f:
                meta.update(json.load(f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # Oils docs take 2 args: JSON and content HTML
    meta_path, content_path = argv[1], argv[2]
    with open(meta_path) as f:
        meta.update(json.load(f))

    # Docs have a special header and footer.
    with open(content_path) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)
536
# Run as a script: main() receives the full argv, program name included.
if __name__ == '__main__':
    main(sys.argv)