doctools/cmark.py

OILS / doctools / cmark.py View on Github | oils.pub

568 lines, 325 significant

1	#!/usr/bin/env python2
2	"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3	and insert anchors.
4
5	I started from cmark-0.28.3/wrappers/wrapper.py.
6	"""
7	from __future__ import print_function
8
9	import ctypes
10	from typing import List
11	from typing import Tuple
12	from typing import Union
13	from typing import Optional
14	from typing import IO
15	from typing import Dict
16	try:
17	from HTMLParser import HTMLParser
18	except ImportError:
19	# python3
20	from html.parser import HTMLParser # type: ignore
21	import json
22	import optparse
23	import os
24	import pprint
25	import sys
26
27	from doctools import html_lib
28	from doctools import doc_html # templates
29	from doctools import oils_doc
30	from doctools import ul_table
31	from lazylex import html as lazylex_html
32
33	if sys.version_info.major == 2:
34	from typing import Any
35
# ctypes.util.find_library() returns the library's file name and not its
# path, so the candidate locations are hardcoded below as a workaround.
# https://bugs.python.org/issue21042

#from ctypes.util import find_library
#libname = find_library("cmark")
#assert libname, "cmark not found"

# There's some ongoing discussion about how to deal with the same in Nix.
# I think normally you'd just patch/substitute this path during the Nix build.
# See note in shell.nix
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

# cmark1 comes from the environment and may be unset, hence the None check
# before handing it to os.path.exists().
if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype:
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# Declare the argument types to match it exactly (size_t / int) rather than
# relying on 'long' happening to have a compatible width on this platform.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
67
68
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger that is currently switched off.

    When extra args are given, msg is still %-formatted with them, so a bad
    format string fails loudly.  The write to stderr sits behind a constant
    false condition; flip it to 1 to see the debug output.
    """
    text = (msg % args) if args else msg

    if 0:
        print(text, file=sys.stderr)
76
77
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = (1 << 17)


def md2html(md):
    # type: (str) -> str
    """Convert markdown text to HTML with libcmark, allowing raw HTML.

    Args:
      md: Markdown source.  Under Python 3 it is encoded as UTF-8 before being
        handed to the C library; under Python 2 it is passed through as-is.

    Returns:
      The HTML returned by cmark_markdown_to_html(): a byte string under
      Python 2, and text decoded from UTF-8 under Python 3.
    """
    is_py2 = (sys.version_info.major == 2)
    md_bytes = md if is_py2 else md.encode('utf-8')

    # The C function takes the length of the buffer it is given, i.e. a BYTE
    # count.  Measuring the unencoded text would under-count whenever the
    # input contains non-ASCII characters and truncate the document.
    html = markdown(md_bytes, len(md_bytes), CMARK_OPT_UNSAFE)

    return html if is_py2 else html.decode('utf-8')
96
97
def demo():
    """Render the markdown string 'hi' and write the resulting HTML to stdout."""
    rendered = md2html('hi')
    sys.stdout.write(rendered)
100
101
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # Python 3's HTMLParser defaults to convert_charrefs=True: references
        # like &lt; are decoded and delivered through handle_data(), and
        # handle_entityref() is never called.  That would put a raw '<' into
        # the heading HTML we copy into the TOC.  Turning it off makes both
        # Python versions report references through handle_entityref() /
        # handle_charref().  Python 2's parser doesn't read this attribute.
        self.convert_charrefs = False

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']
        self.indent = 0  # only used to indent debug log output

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between an opening and closing heading tag.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # html_parts is like innerHTML.  There can be <code> annotations and
        # so forth.  text_parts holds only the character data.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Note the TOC marker divs, and start a new heading on h_tags."""
        if tag == 'div':
            # The marker must have exactly this one attribute.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing when a heading closes; otherwise copy the end tag."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        # This check comes first so that the heading's own closing tag is not
        # appended to its captured HTML.
        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the form
          &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """Keep numeric character references (&#39; or &#x27;) in heading HTML.

        Without this the base class ignores them, and the character would
        vanish from the TOC entry.  Like handle_entityref(), this only adds to
        the HTML parts, not the text parts.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        """Accumulate character data into the current heading, if any."""
        # Debug print
        if self.indent > 0:
            # Raw string: same characters as before, without relying on an
            # invalid '\|' escape sequence.
            log(r'%s\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
208
209
210	TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}
211
212	# We could just add <h2 id="foo"> attribute! I didn't know those are valid
213	# anchors.
214	# But it's easier to insert an entire line, rather than part of a line.
215	ANCHOR_FMT = '<a name="%s"></a>\n'
216
217
def _MakeTocInsertions(
        opts,  # type: Any
        toc_tags,  # type: Union[List[str], Tuple[str, str]]
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """Turn the extracted headings and the TOC position into text insertions.

    Each insertion is a (line number, text) pair.  The first one is the TOC
    itself; the rest are the <a name=""> targets, one entry per heading.

    The TOC is a flat list of divs like

        <div class="toclevel2"><a href="#introduction">Introduction</a></div>

    and the indentation is left to CSS.

    Args:
      opts: parsed command line options; only toc_pretty_href is read.
      toc_tags: HTML tags, e.g. ['h2', 'h3'], to SHOW in the TOC.  Targets are
        generated for ALL headings regardless.
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples.
      toc_pos: line number of the TOC marker div.
      preserve_anchor_case: passed through to html_lib.PrettyHref().
    """
    toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for index, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading
        css_class = TAG_TO_CSS[tag]

        # Two kinds of href exist, for stability of old links.
        numeric_href = 'toc_%d' % index
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # A few old blog posts wrote an explicit CSS ID; the TOC links to that
        # when present.  Otherwise it always links to the pretty version; the
        # numeric one may still exist as a target, but is never linked.
        toc_href = css_id if css_id else pretty_href

        if tag in toc_tags:
            toc_lines.append(
                ' <div class="%s"><a href="#%s">%s</a></div>\n' %
                (css_class, toc_href, ''.join(html_parts)))

        if opts.toc_pretty_href:  # NEW WAY
            target_names = [pretty_href]
        elif css_id:  # Old blog explicit
            target_names = [css_id, numeric_href]
        else:  # Old blog implicit; include the NEW WAY too
            target_names = [pretty_href, numeric_href]

        anchors = ''.join([ANCHOR_FMT % name for name in target_names])
        anchor_insertions.append((line_num, anchors))

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insertion = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insertion] + anchor_insertions
287
288
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """Build insertions for the dense-toc style with columns, used by doc/ref

    The regular TOC emits one <div class="toclevelN"> per heading.  Here the
    headings are grouped under their h2 instead, so that a group can be kept
    together by CSS:

        <div id="dense-toc-title">In This Chapter</div>
        <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

        </div>

    Only h2 and h3 appear in the TOC, but every heading gets an <a name="">
    target.

    Returns:
      A list of (line number, text) insertions; the TOC is the first one.
    """
    # Two level tree: list of (h2 html, h2 href, [(h3 html, h3 href), ...])
    groups = []
    open_group = None

    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id or pretty_href

        inner_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (inner_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((inner_html, toc_href))

        # The target <a name=""> always uses the pretty href.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]

    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend([
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children
        ])
        toc_lines.append('</div>\n')

    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insertion = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insertion] + anchor_insertions
385
386
387	def _ApplyInsertions(lines, insertions, out_file):
388	# type: (List[str], List[Tuple[int, str]], IO[str]) -> None
389	assert insertions, "Should be at least one insertion"
390	j = 0
391	n = len(insertions)
392
393	for i, line in enumerate(lines):
394	current_line = i + 1 # 1-based
395
396	if j < n:
397	line_num, s = insertions[j]
398	if current_line == line_num:
399	out_file.write(s)
400	j += 1
401
402	out_file.write(line)
403
404
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Convert markdown from in_file to HTML with a TOC, written to out_file.

    Steps:
    1. markdown -> HTML via md2html()
    2. if use_fastlex: optionally extract code blocks to a file, then strip
       comments, replace tables, expand links and highlight code
    3. find the headings and the TOC marker div, and insert the TOC and the
       heading anchors.  Without a marker div the HTML is written unchanged.

    Args:
      opts: parsed options; code_block_output, toc_tags and toc_pretty_href
        are read (the last one by _MakeTocInsertions).
      meta: document metadata; 'default_highlighter' and
        'preserve_anchor_case' are read.
      use_fastlex: False skips step 2 entirely.
      debug_out: passed to oils_doc.HighlightCode(); a fresh list by default.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    code_path)
                oils_doc.ExtractCode(html, code_f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark seems
        # to require?
        html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            # Say which input failed, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title. h1 is unused.
    toc_tags = opts.toc_tags if opts.toc_tags else ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    toc_line = extractor.toc_begin_line
    dense_toc_line = extractor.dense_toc_begin_line

    if toc_line == -1 and dense_toc_line == -1:
        # No TOC marker found: pass the HTML through unchanged.
        out_file.write(html)
        return

    # The regular TOC wins if both markers are present.
    if toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
490
491
def Options():
    # type: () -> Any
    """Return the optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, keyword arguments for add_option), in --help order.
    flag_specs = [
        ('--common-mark', {
            'action': 'store_true',
            'default': False,
            'help': 'Only do CommonMark conversion',
        }),
        ('--toc-pretty-href', {
            'action': 'store_true',
            'default': False,
            'help': 'Generate textual hrefs #like-this rather than like #toc10',
        }),
        ('--toc-tag', {
            'dest': 'toc_tags',
            'action': 'append',
            'default': [],
            'help': 'h tags to include in the TOC, e.g. h2 h3',
        }),
        ('--disable-fastlex', {
            'dest': 'disable_fastlex',
            'action': 'store_true',
            'default': False,
            'help': 'Hack for old blog posts',
        }),
        ('--code-block-output', {
            'dest': 'code_block_output',
            'default': None,
            'help': 'Extract and print code blocks to this file',
        }),
    ]
    for flag, kwargs in flag_specs:
        parser.add_option(flag, **kwargs)

    return parser
523
524
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}


def main(argv):
    """Command line entry point.

    Modes:
    - --common-mark: convert stdin with md2html() and print it, nothing else.
    - exactly two positional args (META_JSON CONTENT_MD): Oils documentation,
      written to stdout between doc_html.Header() and doc_html.Footer().
    - otherwise: a stdin -> stdout filter for the blog and for benchmarks,
      with an optional META_JSON argument.

    Args:
      argv: full argument vector, including the program name.
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    # Copy, so the module-level defaults are never mutated.
    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter for blog and for benchmarks.

        # Metadata is optional here
        try:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation
    with open(argv[1]) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)