doctools/cmark.py

OILS / doctools / cmark.py View on Github | oils.pub

532 lines, 303 significant

1	#!/usr/bin/env python2
2	"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3	and insert anchors.
4
5	I started from cmark-0.28.3/wrappers/wrapper.py.
6	"""
7	from __future__ import print_function
8
9	import ctypes
10	try:
11	from HTMLParser import HTMLParser
12	except ImportError:
13	from html.parser import HTMLParser # python3
14	import json
15	import optparse
16	import os
17	import pprint
18	import sys
19
20	from doctools import html_lib
21	from doctools import doc_html # templates
22	from doctools import oils_doc
23	from doctools import ul_table
24	from lazylex import html as lazylex_html
25
26	if sys.version_info.major == 2:
27	from typing import Any
28
29	# Geez find_library returns the filename and not the path? Just hardcode it as
30	# a workaround.
31	# https://bugs.python.org/issue21042
32
33	#from ctypes.util import find_library
34	#libname = find_library("cmark")
35	#assert libname, "cmark not found"
36
37	# There's some ongoing discussion about how to deal with the same in Nix.
38	# I think normally you'd just patch/substitute this path during the Nix build.
39	# See note in shell.nix
# Directory of the running script (sys.argv[0]); used to find a copy of
# libcmark.so at a path relative to it.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype, from cmark.h:
#
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
#
# The argument types must match it: 'len' is a size_t and 'options' is an int,
# not two longs.
#
# NOTE(review): the cmark docs say the caller owns the returned buffer.  With
# restype = c_char_p, ctypes copies the result into a Python string and the
# pointer is dropped, so the buffer is never freed -- confirm this is
# acceptable for a short-lived process.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
60
61
def log(msg, *args):
    # type: (str, *Any) -> None
    """Debug logger that writes to stderr; output is currently switched off.

    The message is still %-formatted with args when args are given, so a bad
    format string raises even though nothing is printed.
    """
    formatted = (msg % args) if args else msg

    # Flip to 1 to see debug output.
    if 0:
        print(formatted, file=sys.stderr)
69
70
71	# Version 0.29.0 disallowed raw HTML by default!
72	CMARK_OPT_UNSAFE = (1 << 17)
73
74
def md2html(md):
    """Convert a Markdown string to HTML with libcmark.

    Args:
      md: Markdown text.  On Python 2 this is passed to cmark as-is; on
        Python 3 it is a str that is encoded to UTF-8 first.

    Returns:
      The HTML produced by cmark_markdown_to_html(), with raw HTML allowed
      (CMARK_OPT_UNSAFE).  A byte string on Python 2, a str on Python 3.
    """
    if sys.version_info.major == 2:
        md_bytes = md
    else:
        md_bytes = md.encode('utf-8')

    # cmark wants the length of the buffer in BYTES.  len(md) counts
    # characters on Python 3, which is too short for non-ASCII text and made
    # cmark drop the end of the document.
    md_len = len(md_bytes)
    html = markdown(md_bytes, md_len, CMARK_OPT_UNSAFE)

    if sys.version_info.major == 2:
        return html
    else:
        return html.decode('utf-8')
88
89
def demo():
    """Smoke test: render the Markdown text 'hi' and write it to stdout."""
    rendered = md2html('hi')
    sys.stdout.write(rendered)
92
93
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        # Python 3.5+ defaults to convert_charrefs=True.  In that mode
        # handle_entityref() and handle_charref() are never called; instead
        # &lt; and friends arrive already decoded in handle_data(), and would
        # be copied into the TOC as raw '<' characters.  Turn it off so both
        # Python versions keep references escaped.
        try:
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Python 2's HTMLParser doesn't take this keyword.
            HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']
        self.indent = 0  # only used for debug logging

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we're between <hN> and </hN>
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # html_parts is like innerHTML.  There can be <code> annotations and
        # so forth.  text_parts is only the character data.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        """Record TOC marker divs and headings; copy tags inside a heading."""
        if tag == 'div':
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        """Stop capturing at </hN>; copy other end tags inside a heading."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
        This method is called to process a named character reference of the
        form &name; (e.g. &gt;), where name is a general entity reference
        (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """Called for numeric references like &#39; or &#x27;.

        name is the part between '&#' and ';'.  Like handle_entityref(), keep
        the reference escaped in the heading's HTML; it isn't added to the
        heading's text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        """Append character data to the current heading's HTML and text."""
        # Debug print
        if self.indent > 0:
            log('%s\\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
193
194
195	TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}
196
197	# We could just add <h2 id="foo"> attribute! I didn't know those are valid
198	# anchors.
# But it's easier to insert an entire line, rather than part of a line.
200	ANCHOR_FMT = '<a name="%s"></a>\n'
201
202
203	def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
204	preserve_anchor_case):
205	"""Given extract headings list and TOC position, return a list of insertions.
206
207	The insertions <div> for the TOC itself, and <a name=""> for the targets.
208
209	Args:
210	toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
211	all of them.
212	"""
213	# Example:
214	# <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
215	#
216	# Yeah it's just a flat list, and then indentation is done with CSS. Hm
217	# that's easy.
218
219	toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
220	insertions = []
221
222	i = 0
223	for line_num, tag, css_id, html_parts, text_parts in headings:
224	css_class = TAG_TO_CSS[tag]
225
226	# Add BOTH href, for stability.
227	numeric_href = 'toc_%d' % i
228
229	# If there was an explicit CSS ID written by the user, use that as the href.
230	# I used this in the blog a few times.
231
232	pretty_href = html_lib.PrettyHref(
233	''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
234
235	if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
236	toc_href = css_id
237	else:
238	# Always use the pretty version now. The old numeric version is still a
239	# target, but not in the TOC.
240	toc_href = pretty_href
241
242	line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
243	css_class, toc_href, ''.join(html_parts))
244	if tag in toc_tags:
245	toc_lines.append(line)
246
247	targets = []
248	if opts.toc_pretty_href: # NEW WAY
249	targets.append(ANCHOR_FMT % pretty_href)
250	elif css_id: # Old blog explicit
251	targets.append(ANCHOR_FMT % css_id)
252	targets.append(ANCHOR_FMT % numeric_href)
253	else: # Old blog implicit
254	targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
255	targets.append(ANCHOR_FMT % numeric_href)
256
257	insertions.append((line_num, ''.join(targets)))
258
259	i += 1
260
261	# +1 to insert AFTER the <div>
262	toc_insert = (toc_pos + 1, ''.join(toc_lines))
263	insertions.insert(0, toc_insert) # The first insertion is TOC
264
265	return insertions
266
267
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """Build the insertions for the dense, multi-column TOC used by doc/ref.

    Instead of one <div> per heading, each h2 and the h3s under it are grouped
    so that a group is never broken across columns:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

      <div class="dense-toc-group">
        <a ...> Level 1 </a> <br/>
        <a class="dense-toc-h3" ...> 1.A </a> <br/>
        <a class="dense-toc-h3" ...> 1.B </a> <br/>
      </div>

      <div class="dense-toc-group">
        <a ...> Level 2 </a> <br/>
      </div>

      </div>

    Only h2 and h3 appear in the TOC, but every heading gets an <a name="">
    target.

    Args:
      headings: list of (line_num, tag, css_id, html_parts, text_parts)
      toc_pos: line number of the <div id="dense-toc"> marker
      preserve_anchor_case: passed through to html_lib.PrettyHref

    Returns:
      List of (line_num, html) insertions; the TOC itself comes first.
    """
    # Two-level tree: [(h2_html, h2_href, [(h3_html, h3_href), ...]), ...]
    groups = []
    open_group = None
    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty name.
        toc_href = css_id or pretty_href
        anchor_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (anchor_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((anchor_html, toc_href))

        # The target <a name=""> always uses the pretty name.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    if 1:
        log('Heading Tree:')
        log(pprint.pformat(groups))
        log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend([
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children
        ])
        toc_lines.append('</div>\n')
    toc_lines.append('</div>\n')

    if 1:
        log('TOC lines')
        log(pprint.pformat(toc_lines))
        log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    return [(toc_pos + 1, ''.join(toc_lines))] + anchor_insertions
359
360
361	def _ApplyInsertions(lines, insertions, out_file):
362	assert insertions, "Should be at least one insertion"
363	j = 0
364	n = len(insertions)
365
366	for i, line in enumerate(lines):
367	current_line = i + 1 # 1-based
368
369	if j < n:
370	line_num, s = insertions[j]
371	if current_line == line_num:
372	out_file.write(s)
373	j += 1
374
375	out_file.write(line)
376
377
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert Markdown from in_file to HTML and write it to out_file.

    After the CommonMark conversion, the HTML is optionally post-processed
    (use_fastlex), then scanned for headings.  If it contains <div id="toc">
    or <div id="dense-toc">, a table of contents and <a name=""> targets are
    inserted; otherwise the HTML is written unchanged.

    Args:
      opts: parsed command line options.  Reads code_block_output and
        toc_tags here; the whole object is passed to _MakeTocInsertions.
      meta: dict of document metadata.  Reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: readable file object containing Markdown
      out_file: writable file object that receives the HTML
      use_fastlex: if true, run the oils_doc and ul_table passes over the HTML
      debug_out: optional list passed to oils_doc.HighlightCode; a fresh list
        is used when None
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())
    #print(html, file=sys.stderr)

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as f:
                f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                        opts.code_block_output)
                # NOTE(review): the return value is never used
                text = oils_doc.ExtractCode(html, f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark seems
        # to require?
        html = html.replace('<p><pstrip>', '')
        html = html.replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError as e:
            # Say which input failed, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    if opts.toc_tags:
        toc_tags = opts.toc_tags
    else:
        toc_tags = ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    # Any truthy metadata value turns this on.
    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    # <div id="toc"> takes priority over <div id="dense-toc">.
    if parser.toc_begin_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        parser.toc_begin_line,
                                        preserve_anchor_case)
    elif parser.dense_toc_begin_line != -1:
        insertions = _MakeTocInsertionsDense(parser.headings,
                                             parser.dense_toc_begin_line,
                                             preserve_anchor_case)
    else:  # Neither TOC marker was found
        out_file.write(html)  # Pass through
        return

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(lines, insertions, out_file)
455
456
def Options():
    """Return the optparse.OptionParser for the cmark.py command line."""
    # (flag, keyword arguments for add_option), in the order they should
    # appear in --help.  Built on every call so each parser gets its own
    # default list for --toc-tag.
    flag_specs = [
        ('--common-mark',
         dict(action='store_true',
              default=False,
              help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(action='store_true',
              default=False,
              help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(dest='disable_fastlex',
              action='store_true',
              default=False,
              help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]

    parser = optparse.OptionParser('cmark.py [options]')
    for flag, kwargs in flag_specs:
        parser.add_option(flag, **kwargs)
    return parser
487
488
489	# width 40 by default
490	DEFAULT_META = {'body_css_class': 'width40'}
491
492
def main(argv):
    """Command line entry point.

    Three modes, chosen from the flags and the number of arguments:

    - --common-mark: convert stdin with CommonMark only and print the result.
    - cmark.py META_JSON CONTENT_MD: Oils documentation.  The page is written
      to stdout between doc_html.Header() and doc_html.Footer().
    - otherwise: a stdin-to-stdout filter (blog, benchmarks), with an optional
      META_JSON argument.

    Args:
      argv: full argument list, including the program name
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    # Start from the defaults; a metadata file may override them.
    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter for blog and for benchmarks.  Metadata is optional here, so
        # a missing argv[1] is fine.
        try:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation
    meta_path, content_path = argv[1], argv[2]
    with open(meta_path) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(content_path) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)
529
530
531	if __name__ == '__main__':
532	main(sys.argv)