doctools/cmark.py

OILS / doctools / cmark.py View on Github | oilshell.org

519 lines, 292 significant

1	#!/usr/bin/env python2
2	"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3	and insert anchors.
4
5	I started from cmark-0.28.3/wrappers/wrapper.py.
6	"""
7	from __future__ import print_function
8
9	import ctypes
10	import HTMLParser
11	import json
12	import optparse
13	import os
14	import pprint
15	import sys
16
17	from doctools import html_lib
18	from doctools import doc_html # templates
19	from doctools import oils_doc
20	from doctools import ul_table
21	from lazylex import html as lazylex_html
22
23	from typing import Any
24
25	# Geez find_library returns the filename and not the path? Just hardcode it as
26	# a workaround.
27	# https://bugs.python.org/issue21042
28
29	#from ctypes.util import find_library
30	#libname = find_library("cmark")
31	#assert libname, "cmark not found"
32
33	# There's some ongoing discussion about how to deal with the same in Nix.
34	# I think normally you'd just patch/substitute this path during the Nix build.
35	# See note in shell.nix
# Directory of the running script; used to find a repo-relative libcmark.so.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
# cmark1 comes from the environment and is None when the variable is unset.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype, from cmark.h:
#
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
#
# The argument types below mirror that prototype.  Declaring them as c_long
# only happens to work on platforms where long, size_t and int are passed the
# same way.
#
# NOTE(review): cmark documents that the caller must free the returned
# buffer.  With restype = c_char_p, ctypes hands back a copy of the bytes and
# the raw pointer is not kept, so the C buffer is presumably never freed --
# confirm whether that matters for this short-lived process.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
56
57
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger that is switched off in the source.

    The message is %-formatted with args when any are given (so a bad format
    string still raises), but nothing is printed unless the constant guard
    below is changed to a true value.
    """
    text = msg % args if args else msg

    if 0:  # change to 1 to see debug output on stderr
        print(text, file=sys.stderr)
65
66
# cmark 0.29.0 started disallowing raw HTML by default; passing this option
# bit to the converter allows it again.
CMARK_OPT_UNSAFE = 1 << 17
69
70
def md2html(text):
    """Convert a Markdown string to HTML using libcmark.

    The text and its length are handed to cmark_markdown_to_html together
    with CMARK_OPT_UNSAFE, so raw HTML in the input is kept.
    """
    return markdown(text, len(text), CMARK_OPT_UNSAFE)
75
76
def demo():
    """Smoke test: render the Markdown text 'hi' and write it to stdout."""
    rendered = md2html('hi')
    sys.stdout.write(rendered)
79
80
class TocExtractor(HTMLParser.HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">

    Results are read from these attributes after feed():
      toc_begin_line:       line of <div id="toc">, or -1 if not seen
      dense_toc_begin_line: line of <div id="dense-toc">, or -1 if not seen
      headings:             list of (line_num, tag, id, html_parts, text_parts)
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Only used to indent the debug log.
        self.indent = 0

        # The TOC will be inserted after this.
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between <hN> and </hN>.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts).
        # html_parts joined is like innerHTML.  There can be <code>
        # annotations and so forth.  text_parts is the plain character data.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            # Only an exact, single-attribute match marks a TOC container.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            # First id="" attribute wins, if there is one.
            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
        This method is called to process a named character reference of the
        form &name; (e.g. &gt;), where name is a general entity reference
        (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """Re-emit a numeric character reference found inside a heading.

        The parser calls this for &#NNN; and &#xNNN; with name set to 'NNN'
        or 'xNNN'.  Without this handler such references would vanish from
        the heading HTML shown in the TOC.  Like handle_entityref, it only
        adds to the HTML parts, not the text parts.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # Debug print
        if self.indent > 0:
            log('%s\\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
180
181
# CSS class given to a heading's entry in the flat TOC, keyed by heading tag.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# We could just add an <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
188
189
190	def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
191	preserve_anchor_case):
192	"""Given extract headings list and TOC position, return a list of insertions.
193
194	The insertions <div> for the TOC itself, and <a name=""> for the targets.
195
196	Args:
197	toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
198	all of them.
199	"""
200	# Example:
201	# <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
202	#
203	# Yeah it's just a flat list, and then indentation is done with CSS. Hm
204	# that's easy.
205
206	toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
207	insertions = []
208
209	i = 0
210	for line_num, tag, css_id, html_parts, text_parts in headings:
211	css_class = TAG_TO_CSS[tag]
212
213	# Add BOTH href, for stability.
214	numeric_href = 'toc_%d' % i
215
216	# If there was an explicit CSS ID written by the user, use that as the href.
217	# I used this in the blog a few times.
218
219	pretty_href = html_lib.PrettyHref(
220	''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
221
222	if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
223	toc_href = css_id
224	else:
225	# Always use the pretty version now. The old numeric version is still a
226	# target, but not in the TOC.
227	toc_href = pretty_href
228
229	line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
230	css_class, toc_href, ''.join(html_parts))
231	if tag in toc_tags:
232	toc_lines.append(line)
233
234	targets = []
235	if opts.toc_pretty_href: # NEW WAY
236	targets.append(ANCHOR_FMT % pretty_href)
237	elif css_id: # Old blog explicit
238	targets.append(ANCHOR_FMT % css_id)
239	targets.append(ANCHOR_FMT % numeric_href)
240	else: # Old blog implicit
241	targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
242	targets.append(ANCHOR_FMT % numeric_href)
243
244	insertions.append((line_num, ''.join(targets)))
245
246	i += 1
247
248	# +1 to insert AFTER the <div>
249	toc_insert = (toc_pos + 1, ''.join(toc_lines))
250	insertions.insert(0, toc_insert) # The first insertion is TOC
251
252	return insertions
253
254
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """Build the insertions for the dense-toc style with columns (doc/ref).

    The flat style emits one <div> per heading.  This style groups each h2
    with its h3 children so that a group is never broken across columns:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

      </div>

    Args:
      headings: list of (line_num, tag, css_id, html_parts, text_parts).
      toc_pos: line number of the dense TOC <div>.
      preserve_anchor_case: passed through to html_lib.PrettyHref.

    Returns:
      A list of (line_num, text) pairs.  The first one is the TOC, placed on
      the line after toc_pos; the rest are <a name=""> targets, one per
      heading.  Headings other than h2 and h3 get a target but no TOC entry.

    Raises:
      AssertionError: if an h3 appears before any h2.
    """
    # Two-level tree: list of (h2_html, h2_href, [(h3_html, h3_href), ...])
    tree = []
    open_h2 = None

    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id or pretty_href

        label_html = ''.join(html_parts)

        if tag == 'h2':
            open_h2 = (label_html, toc_href, [])
            tree.append(open_h2)
        elif tag == 'h3':
            assert open_h2 is not None, "h3 shouldn't come before any h2"
            open_h2[2].append((label_html, toc_href))

        # The target <a name=""> always uses the pretty href.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(tree))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]

    for h2_html, h2_href, children in tree:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children)
        toc_lines.append('</div>\n')

    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insertion = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insertion] + anchor_insertions
346
347
348	def _ApplyInsertions(lines, insertions, out_file):
349	assert insertions, "Should be at least one insertion"
350	j = 0
351	n = len(insertions)
352
353	for i, line in enumerate(lines):
354	current_line = i + 1 # 1-based
355
356	if j < n:
357	line_num, s = insertions[j]
358	if current_line == line_num:
359	out_file.write(s)
360	j += 1
361
362	out_file.write(line)
363
364
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert Markdown from in_file to HTML and write it to out_file.

    Steps:
      1. Markdown -> HTML with md2html.
      2. If use_fastlex: optionally extract code blocks to
         opts.code_block_output, then remove comments, replace tables, expand
         links and highlight code.
      3. Find headings and the TOC <div>; insert the TOC and anchor targets.
         If there is no TOC <div>, the HTML is written out unchanged.

    Args:
      opts: parsed options (code_block_output, toc_tags, toc_pretty_href).
      meta: dict of document metadata ('default_highlighter',
        'preserve_anchor_case').
      in_file: readable file object with Markdown.
      out_file: writable file object for HTML.
      use_fastlex: whether to run the oils_doc / ul_table processing.
      debug_out: list passed to oils_doc.HighlightCode; a fresh list is used
        when omitted.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as f:
                f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                        code_path)
                oils_doc.ExtractCode(html, f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    has_flat_toc = parser.toc_begin_line != -1
    has_dense_toc = parser.dense_toc_begin_line != -1

    if not (has_flat_toc or has_dense_toc):
        # No TOC <div> found: pass through
        out_file.write(html)
        return

    # The flat TOC takes priority when both kinds of <div> are present.
    if has_flat_toc:
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        parser.toc_begin_line,
                                        preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(parser.headings,
                                             parser.dense_toc_begin_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
442
443
def Options():
    """Return the optparse.OptionParser for the cmark.py command line."""
    flag = dict(action='store_true', default=False)

    # (option string, add_option keyword arguments), in help-output order
    specs = [
        ('--common-mark',
         dict(flag, help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(flag,
              help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(flag, dest='disable_fastlex', help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]

    parser = optparse.OptionParser('cmark.py [options]')
    for opt_str, kwargs in specs:
        parser.add_option(opt_str, **kwargs)
    return parser
474
475
# Metadata used when the document supplies none: width 40 by default.
DEFAULT_META = dict(body_css_class='width40')
478
479
def main(argv):
    """Command line entry point.

    argv includes the program name.  Three modes:

      --common-mark           stdin Markdown -> stdout HTML, nothing else
      cmark.py META CONTENT   Oils documentation: JSON metadata file plus
                              Markdown file, wrapped in a header and footer
      cmark.py [META]         filter for blog and benchmarks: stdin -> stdout,
                              with optional JSON metadata
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) == 3:  # It's Oils documentation
        meta_path = argv[1]
        content_path = argv[2]

        with open(meta_path) as f:
            meta.update(json.load(f))

        # Docs have a special header and footer.
        with open(content_path) as content_f:
            doc_html.Header(meta, sys.stdout, draft_warning=True)
            Render(opts, meta, content_f, sys.stdout)
            doc_html.Footer(meta, sys.stdout)
        return

    # Filter for blog and for benchmarks.

    # Metadata is optional here
    try:
        with open(argv[1]) as f:
            meta.update(json.load(f))
    except IndexError:
        pass

    # Old style for blog: it's a filter
    fastlex_enabled = not opts.disable_fastlex
    Render(opts, meta, sys.stdin, sys.stdout, use_fastlex=fastlex_enabled)


if __name__ == '__main__':
    main(sys.argv)