doctools/cmark.py

OILS / doctools / cmark.py View on Github | oilshell.org

516 lines, 291 significant

1	#!/usr/bin/env python2
2	"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3	and insert anchors.
4
5	I started from cmark-0.28.3/wrappers/wrapper.py.
6	"""
7	from __future__ import print_function
8
9	import ctypes
10	import HTMLParser
11	import json
12	import optparse
13	import os
14	import pprint
15	import sys
16
17	from doctools import html_lib
18	from doctools import doc_html # templates
19	from doctools import oils_doc
20	from doctools import ul_table
21	from lazylex import html as lazylex_html
22
23	# Geez find_library returns the filename and not the path? Just hardcode it as
24	# a workaround.
25	# https://bugs.python.org/issue21042
26
27	#from ctypes.util import find_library
28	#libname = find_library("cmark")
29	#assert libname, "cmark not found"
30
31	# There's some ongoing discussion about how to deal with the same in Nix.
32	# I think normally you'd just patch/substitute this path during the Nix build.
33	# See note in shell.nix
# Directory of the script being run (sys.argv[0]); the repo-relative candidate
# path below is resolved against it.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype (cmark.h):
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# The argtypes mirror it exactly: size_t for the length and int for the option
# bits, rather than c_long for both, whose width differs from size_t / int on
# some platforms.
#
# NOTE: with restype = c_char_p, ctypes copies the result into a Python string
# and discards the pointer, so the buffer allocated by cmark is never freed.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
54
55
def log(msg, *args):
    """Debug logger that writes to stderr; currently switched off.

    When args are given, msg is %-formatted with them even though the output
    is disabled, so a bad format string still raises.  Flip the `if 0` below
    to `if 1` to see the messages.
    """
    formatted = msg % args if args else msg

    if 0:
        print(formatted, file=sys.stderr)
62
63
# cmark 0.29.0 disallowed raw HTML by default; this option bit turns it back
# on when passed to cmark_markdown_to_html().
CMARK_OPT_UNSAFE = 1 << 17
66
67
def md2html(text):
    """Convert Markdown `text` to HTML with libcmark.

    CMARK_OPT_UNSAFE is always passed so that raw HTML in the input is kept.

    NOTE(review): len(text) is passed as the length argument, which assumes
    `text` is a byte string (Python 2 str) -- confirm if unicode is ever used.
    """
    return markdown(text, len(text), CMARK_OPT_UNSAFE)
72
73
def demo():
    """Smoke test: render the Markdown string 'hi' and write it to stdout."""
    rendered = md2html('hi')
    sys.stdout.write(rendered)
76
77
class TocExtractor(HTMLParser.HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth, used only to indent the debug log() output.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between <hN> and </hN>.
        self.capturing = False

        # Flat list of (line_num, tag, css_id, html_parts, text_parts).
        # - css_id is the heading's id="" attribute, or None if absent.
        # - html_parts is like innerHTML, in pieces.  There can be <code>
        #   annotations and so forth.
        # - text_parts holds only the character data, in pieces.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        """Record TOC placeholders and heading starts; capture heading HTML."""
        if tag == 'div':
            # Only a div whose sole attribute is the id counts as a placeholder.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            # The first id="" attribute wins, if there is one.
            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        """Stop capturing at </hN>; otherwise record the end tag if capturing."""
        # Debug print
        # NOTE: this decrements for every </div>, while handle_starttag only
        # increments for the two TOC divs.  It only affects log indentation.
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
        This method is called to process a named character reference of the form
        &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """Keep numeric character references inside headings.

        Called for &#NNN; and &#xNNN;, where name is e.g. '62' or 'x3E'.  The
        base class ignores these, which would drop the character from the TOC
        entry.  Like handle_entityref, this only adds to the heading's HTML,
        not to its text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        """Add character data to both the HTML and the text of the heading."""
        # Debug print
        if self.indent > 0:
            log('%s\\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
177
178
# CSS class used for the TOC entry of each heading tag.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# We could just add an <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
185
186
def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
                       preserve_anchor_case):
    """Build the list of (line_num, html) insertions for the simple TOC style.

    The TOC is a flat list of <div class="toclevelN"> entries; indentation is
    left to CSS.  Example entry:

      <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>

    Args:
      opts: parsed options; only opts.toc_pretty_href is read.
      toc_tags: HTML tags like ['h2', 'h3'] to SHOW in the TOC.  Anchor
        targets are generated for ALL headings regardless.
      headings: list of (line_num, tag, css_id, html_parts, text_parts).
      toc_pos: line number of the <div id="toc"> placeholder.
      preserve_anchor_case: passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html).  The first item is the TOC itself, placed
      on the line after toc_pos; the rest are the <a name=""> targets, one
      item per heading, in heading order.
    """
    toc_html = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for index, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading
        css_class = TAG_TO_CSS[tag]

        # Both kinds of href are generated, for stability of old links.
        numeric_href = 'toc_%d' % index
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # An explicit id="" written by the user (a few old blog posts) wins.
        # Otherwise the TOC always links to the pretty version; the numeric
        # one may still exist as a target, but the TOC doesn't use it.
        toc_href = css_id if css_id else pretty_href

        entry = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
            css_class, toc_href, ''.join(html_parts))
        if tag in toc_tags:
            toc_html.append(entry)

        if opts.toc_pretty_href:  # NEW WAY
            anchor_names = [pretty_href]
        elif css_id:  # Old blog explicit
            anchor_names = [css_id, numeric_href]
        else:  # Old blog implicit; include the NEW WAY too
            anchor_names = [pretty_href, numeric_href]

        anchors = ''.join(ANCHOR_FMT % name for name in anchor_names)
        anchor_insertions.append((line_num, anchors))

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    return [(toc_pos + 1, ''.join(toc_html))] + anchor_insertions
250
251
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """Build insertions for the dense-toc style with columns, used by doc/ref.

    The simple style emits one div per heading.  Here the headings are grouped
    into a two-level tree instead, one group per h2, holding its h3 children:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

      <div class="dense-toc-group">
        <a ...> Level 1 </a> <br/>
        <a class="dense-toc-h3" ...> 1.A </a> <br/>
        <a class="dense-toc-h3" ...> 1.B </a> <br/>
      </div>  # NO BREAKING within this div

      <div class="dense-toc-group">
        <a ...> Level 2 </a> <br/>
      </div>

      </div>

    Headings other than h2 and h3 get an anchor target but no TOC entry.

    Args:
      headings: list of (line_num, tag, css_id, html_parts, text_parts).
      toc_pos: line number of the <div id="dense-toc"> placeholder.
      preserve_anchor_case: passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html).  The first item is the TOC, placed on the
      line after toc_pos; the rest are <a name=""> targets in heading order.

    Raises:
      AssertionError: if an h3 appears before any h2.
    """
    # Each group is (h2_html, h2_href, [(h3_html, h3_href), ...])
    groups = []
    open_group = None
    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id or pretty_href
        label_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (label_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((label_html, toc_href))

        # The target <a name=""> always uses the pretty href.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_html = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, h3_entries in groups:
        toc_html.append('<div class="dense-toc-group">\n')
        toc_html.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_html.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in h3_entries)
        toc_html.append('</div>\n')
    toc_html.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_html))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    return [(toc_pos + 1, ''.join(toc_html))] + anchor_insertions
343
344
def _ApplyInsertions(lines, insertions, out_file):
    """Write `lines` to out_file, adding each insertion before its line.

    Args:
      lines: list of lines, each normally ending in a newline.
      insertions: non-empty list of (line_num, text), where line_num is
        1-based and text is written immediately BEFORE that line.  The list
        does not need to be ordered by line number, and several insertions
        may name the same line; those are written in the order given.
      out_file: object with a write() method.

    Raises:
      AssertionError: if insertions is empty.
    """
    assert insertions, "Should be at least one insertion"

    # Callers put the TOC first, whatever its line number, so the list is not
    # necessarily in line order.  A single forward scan over an unsorted list
    # would get stuck on the first out-of-order item and silently drop
    # everything after it.  sorted() is stable, so insertions that share a
    # line keep their relative order (TOC before anchors).
    pending = sorted(insertions, key=lambda ins: ins[0])
    n = len(pending)
    j = 0

    for i, line in enumerate(lines):
        current_line = i + 1  # 1-based

        # A loop rather than a single check: more than one insertion may be
        # due at this line.
        while j < n and pending[j][0] <= current_line:
            out_file.write(pending[j][1])
            j += 1

        out_file.write(line)

    # Anything positioned past the last line goes at the very end, e.g. the
    # TOC when its placeholder <div> is on the final line.
    while j < n:
        out_file.write(pending[j][1])
        j += 1
360
361
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert Markdown from in_file to HTML with a TOC, written to out_file.

    Steps:
      1. Markdown -> HTML with md2html().
      2. If use_fastlex: extract code blocks (when opts.code_block_output is
         set), remove comments, strip <pstrip> wrappers, replace tables,
         expand links, and highlight code.
      3. Find headings and the TOC placeholder, then insert the TOC and the
         <a name=""> targets.  If there is no <div id="toc"> or
         <div id="dense-toc">, the HTML is written through unchanged.

    Args:
      opts: parsed options; reads code_block_output and toc_tags here, and is
        passed on to _MakeTocInsertions().
      meta: dict; reads 'default_highlighter' and 'preserve_anchor_case'.
      in_file: readable file object with the Markdown source.
      out_file: file object that receives the HTML.
      use_fastlex: if False, skip step 2 entirely.
      debug_out: list passed to oils_doc.HighlightCode(); a fresh list is
        used when None.

    Raises:
      lazylex_html.ParseError: re-raised from ul_table.ReplaceTables(), after
        a message naming in_file is printed to stderr.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as f:
                f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                        opts.code_block_output)
                oils_doc.ExtractCode(html, f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        for marker in ('<p><pstrip>', '</pstrip></p>'):
            html = html.replace(marker, '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    toc_line = extractor.toc_begin_line
    dense_toc_line = extractor.dense_toc_begin_line

    if toc_line == -1 and dense_toc_line == -1:
        # No TOC placeholder found: pass through
        out_file.write(html)
        return

    # The simple style takes precedence when both placeholders are present.
    if toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    # True means keep the newlines
    _ApplyInsertions(html.splitlines(True), insertions, out_file)
439
440
def Options():
    """Return a new optparse.OptionParser for the cmark.py command line.

    Flags: --common-mark, --toc-pretty-href, --toc-tag (repeatable; stored in
    opts.toc_tags), --disable-fastlex, and --code-block-output.
    """
    parser = optparse.OptionParser('cmark.py [options]')
    add = parser.add_option

    add('--common-mark',
        action='store_true',
        default=False,
        help='Only do CommonMark conversion')

    # TOC behavior
    add('--toc-pretty-href',
        action='store_true',
        default=False,
        help='Generate textual hrefs #like-this rather than like #toc10')
    add('--toc-tag',
        dest='toc_tags',
        action='append',
        default=[],
        help='h tags to include in the TOC, e.g. h2 h3')

    add('--disable-fastlex',
        dest='disable_fastlex',
        action='store_true',
        default=False,
        help='Hack for old blog posts')

    add('--code-block-output',
        dest='code_block_output',
        default=None,
        help='Extract and print code blocks to this file')

    return parser
471
472
# Metadata used when no JSON file overrides it: width 40 by default.
DEFAULT_META = dict(body_css_class='width40')
475
476
def main(argv):
    """Command line entry point.

    Args:
      argv: the full argument list, including the program name at argv[0].

    Modes, based on what is left in argv after option parsing:
      --common-mark      Convert stdin to HTML and print it; nothing else.
      PROG META CONTENT  Oils documentation: load metadata from the JSON file
                         META, then write header, rendered CONTENT, and
                         footer to stdout.
      otherwise          Filter for blog and benchmarks: render stdin to
                         stdout.  The META JSON file is optional.
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter for blog and for benchmarks.

        # Metadata is optional here
        try:
            with open(argv[1]) as f:
                meta.update(json.load(f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation
    with open(argv[1]) as f:
        meta.update(json.load(f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)
513
514
# Script entry point: pass the full argv, including the program name, to main().
if __name__ == '__main__':
    main(sys.argv)