doctools/cmark.py

OILS / doctools / cmark.py View on Github | oils.pub

573 lines, 325 significant

1	#!/usr/bin/env python2
2	"""Convert Markdown to HTML, with our enhancements
3
4	- Parse the HTML
5	- insert a TOC
6	- <pstrip> hack - this is obsolete with ul-table?
7	- Expand $xref links
8	- Highlight code blocks
9
10	I started from cmark-0.28.3/wrappers/wrapper.py.
11	"""
12	from __future__ import print_function
13
14	import ctypes
15	from typing import List
16	from typing import Tuple
17	from typing import Union
18	from typing import Optional
19	from typing import IO
20	from typing import Dict
21	try:
22	from HTMLParser import HTMLParser
23	except ImportError:
24	# python3
25	from html.parser import HTMLParser # type: ignore
26	import json
27	import optparse
28	import os
29	import pprint
30	import sys
31
32	from doctools import html_lib
33	from doctools import doc_html # templates
34	from doctools import oils_doc
35	from doctools import ul_table
36	from lazylex import html as lazylex_html
37
38	if sys.version_info.major == 2:
39	from typing import Any
40
41	# Geez find_library returns the filename and not the path? Just hardcode it as
42	# a workaround.
43	# https://bugs.python.org/issue21042
44
45	#from ctypes.util import find_library
46	#libname = find_library("cmark")
47	#assert libname, "cmark not found"
48
49	# There's some ongoing discussion about how to deal with the same in Nix.
50	# I think normally you'd just patch/substitute this path during the Nix build.
51	# See note in shell.nix
# Directory of the script being run (sys.argv[0]); used to look for a
# repo-relative copy of the library.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype:
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# Declare the argument types to match it exactly, rather than relying on
# 'long' happening to have the same width as size_t / int on this platform.
#
# NOTE(review): with restype c_char_p, ctypes copies the result into a Python
# string and the returned pointer is lost, so the buffer cmark allocated is
# presumably never freed.  Probably fine for a short-lived script -- confirm.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
72
73
def log(msg, *args):
    # type: (str, *Any) -> None
    """Debug logger; printing is disabled unless the toggle below is flipped.

    When args are given, msg is %-formatted with them.  The formatting is done
    even while printing is disabled, so a bad format string still raises.
    """
    if args:
        msg = msg % args

    # Change to 1 to see debug output on stderr.
    if 0:
        print(msg, file=sys.stderr)
81
82
# cmark 0.29.0 started dropping raw HTML by default; this option bit is passed
# to cmark_markdown_to_html() to allow it again.
CMARK_OPT_UNSAFE = 1 << 17
85
86
def md2html(md):
    # type: (str) -> str
    """Convert a Markdown string to HTML with libcmark, allowing raw HTML.

    On Python 2 the str is passed through as bytes.  On Python 3 it is encoded
    to UTF-8 on the way in and the result is decoded from UTF-8 on the way out.
    """
    is_py2 = sys.version_info.major == 2
    md_bytes = md if is_py2 else md.encode('utf-8')

    # cmark wants the length in BYTES.  On Python 3, len(md) counts characters,
    # which is smaller than the encoded size for non-ASCII text and would make
    # cmark stop reading before the end of the document.
    html = markdown(md_bytes, len(md_bytes), CMARK_OPT_UNSAFE)

    return html if is_py2 else html.decode('utf-8')
101
102
def demo():
    # type: () -> None
    """Smoke test: render the Markdown string 'hi' and write it to stdout."""
    rendered = md2html('hi')
    sys.stdout.write(rendered)
105
106
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4), append to self.headings, recording the
    line number.  While inside a heading, the markup and text we see are
    accumulated on that heading's entry.

    We also record the line of <div id="toc"> / <div id="dense-toc">.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Only used to indent debug log output.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we're between <hN> and </hN>.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # - html_parts joined is like innerHTML.  There can be <code>
        #   annotations and so forth.
        # - text_parts is only the character data.
        # - id is optional -- it can be used for generating headings.
        self.headings = [
        ]  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Record TOC div positions, heading starts, and tags inside headings."""
        if tag == 'div':
            # Exact match: the div must have the id and no other attributes.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            # First explicit id="" wins, if any.
            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break

            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing at the end of a heading; otherwise echo the end tag."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            # Turned off BEFORE the check below, so the heading's own end tag
            # isn't captured.
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').

        The reference is re-emitted into the heading's HTML only, not its text.
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """
        Called for numeric character references like &#39; or &#x27;, where
        name is '39' or 'x27'.

        Without this, such references inside a heading were silently dropped
        from the TOC entry.  Like handle_entityref(), the reference goes into
        the heading's HTML only, not its text.

        Note: per the Python 3 docs, neither this nor handle_entityref() is
        called when the parser runs with convert_charrefs=True (the default
        there); the decoded characters arrive via handle_data() instead.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        """Accumulate character data as both HTML and text of the heading."""
        # Debug print
        if self.indent > 0:
            log('%s\\| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
213
214
# CSS class given to each TOC entry, keyed by the heading tag it links to.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# We could just add an <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
221
222
223	def _MakeTocInsertions(
224	opts, # type: Any
225	toc_tags, # type: Union[List[str], Tuple[str, str]]
226	headings, # type: List[Tuple[int, str, None, List[str], List[str]]]
227	toc_pos, # type: int
228	preserve_anchor_case, # type: bool
229	):
230	# type: (...) -> List[Tuple[int, str]]
231	"""Given extract headings list and TOC position, return a list of insertions.
232
233	The insertions <div> for the TOC itself, and <a name=""> for the targets.
234
235	Args:
236	toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
237	all of them.
238	"""
239	# Example:
240	# <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
241	#
242	# Yeah it's just a flat list, and then indentation is done with CSS. Hm
243	# that's easy.
244
245	toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
246	insertions = []
247
248	i = 0
249	for line_num, tag, css_id, html_parts, text_parts in headings:
250	css_class = TAG_TO_CSS[tag]
251
252	# Add BOTH href, for stability.
253	numeric_href = 'toc_%d' % i
254
255	# If there was an explicit CSS ID written by the user, use that as the href.
256	# I used this in the blog a few times.
257
258	pretty_href = html_lib.PrettyHref(
259	''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
260
261	if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
262	toc_href = css_id
263	else:
264	# Always use the pretty version now. The old numeric version is still a
265	# target, but not in the TOC.
266	toc_href = pretty_href
267
268	line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
269	css_class, toc_href, ''.join(html_parts))
270	if tag in toc_tags:
271	toc_lines.append(line)
272
273	targets = []
274	if opts.toc_pretty_href: # NEW WAY
275	targets.append(ANCHOR_FMT % pretty_href)
276	elif css_id: # Old blog explicit
277	targets.append(ANCHOR_FMT % css_id)
278	targets.append(ANCHOR_FMT % numeric_href)
279	else: # Old blog implicit
280	targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
281	targets.append(ANCHOR_FMT % numeric_href)
282
283	insertions.append((line_num, ''.join(targets)))
284
285	i += 1
286
287	# +1 to insert AFTER the <div>
288	toc_insert = (toc_pos + 1, ''.join(toc_lines))
289	insertions.insert(0, toc_insert) # The first insertion is TOC
290
291	return insertions
292
293
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """For the dense-toc style with columns, used by doc/ref

    The simpler (non-dense) style outputs a div for every line:

    <div id="toctitle">Table of Contents</div>

    <div class="toclevel1"><a ...> Level 1 </a></div>
    <div class="toclevel2"><a ...> 1.A </a></div>
    <div class="toclevel2"><a ...> 1.B </a></div>
    <div class="toclevel1"><a ...> Level 2 </a></div>
    ...

    Here we output something like this:

    <div id="dense-toc-title">In This Chapter</div>
    <div id="dense-toc-cols">

      <div class="dense-toc-group">
        <a ...> Level 1 </a> <br/>

        <a class="dense-toc-h3" ...> 1.A </a> <br/>
        <a class="dense-toc-h3" ...> 1.B </a> <br/>

      </div>  # NO BREAKING within this div

      <div class="dense-toc-group">
        <a ...> Level 2 </a> <br/>
      </div>

    </div>

    Args:
      headings: (line_num, tag, id or None, html_parts, text_parts) tuples.
      toc_pos: line number of the dense TOC <div>.
      preserve_anchor_case: passed through to html_lib.PrettyHref().

    Returns:
      A list of (line_num, text) pairs.  The first one is the TOC, placed at
      toc_pos + 1; the rest are anchor targets, one per heading, in order.

    Raises:
      AssertionError: if an h3 appears before any h2.
    """
    # Two level tree: [(h2_html, h2_href, [(h3_html, h3_href), ...]), ...]
    heading_tree = []  # type: List[Tuple[str, str, List[Tuple[str, str]]]]
    current_h2 = None  # type: Optional[Tuple[str, str, List[Tuple[str, str]]]]

    insertions = []  # type: List[Tuple[int, str]]

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id if css_id else pretty_href

        anchor_html = ''.join(html_parts)

        # Only h2 and h3 are shown; other headings just get a target below.
        if tag == 'h2':
            current_h2 = (anchor_html, toc_href, [])
            heading_tree.append(current_h2)
        elif tag == 'h3':
            # Explicit raise rather than 'assert', so malformed input gets
            # this message even when Python runs with -O.
            if current_h2 is None:
                raise AssertionError("h3 shouldn't come before any h2")
            current_h2[2].append((anchor_html, toc_href))

        # Insert the target <a name="">
        insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(heading_tree))
    log('')

    toc_lines = ['<div id="dense-toc-title">In This Chapter</div>\n']
    toc_lines.append('<div id="dense-toc-cols">\n')

    for h2_html, h2_href, children in heading_tree:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        for h3_html, h3_href in children:
            toc_lines.append(
                ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
                (h3_href, h3_html))
        toc_lines.append('</div>\n')

    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>
    toc_insert = (toc_pos + 1, ''.join(toc_lines))
    insertions.insert(0, toc_insert)  # The first insertion is TOC

    return insertions
390
391
392	def _ApplyInsertions(lines, insertions, out_file):
393	# type: (List[str], List[Tuple[int, str]], IO[str]) -> None
394	assert insertions, "Should be at least one insertion"
395	j = 0
396	n = len(insertions)
397
398	for i, line in enumerate(lines):
399	current_line = i + 1 # 1-based
400
401	if j < n:
402	line_num, s = insertions[j]
403	if current_line == line_num:
404	out_file.write(s)
405	j += 1
406
407	out_file.write(line)
408
409
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Read Markdown from in_file and write enhanced HTML to out_file.

    Steps:
    - Convert Markdown to HTML with md2html().
    - If use_fastlex: optionally extract code blocks to
      opts.code_block_output, then remove comments, strip the <pstrip> hack,
      replace tables, expand links, and highlight code.
    - Find headings and the TOC <div>.  If there's a TOC <div>, write the HTML
      with the TOC and heading anchors inserted; otherwise write it as is.

    Args:
      opts: parsed options; reads code_block_output, toc_tags, and (via
        _MakeTocInsertions) toc_pretty_href.
      meta: document metadata; reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: Markdown input.
      out_file: HTML output.
      use_fastlex: if False, skip the HTML processing steps.
      debug_out: passed to oils_doc.HighlightCode(); a new list if None.
    """
    if debug_out is None:
        debug_out = []

    # Markdown -> HTML comes first; everything below works on HTML.
    html = md2html(in_file.read())

    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    code_path)
                oils_doc.ExtractCode(html, code_f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            # Say which input failed, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks, including class=language-oil-help-topics
        highlighter = meta.get('default_highlighter')
        html = oils_doc.HighlightCode(html, highlighter, debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    toc_line = extractor.toc_begin_line
    dense_toc_line = extractor.dense_toc_begin_line

    if toc_line == -1 and dense_toc_line == -1:
        # No TOC <div> of either kind: pass through.
        out_file.write(html)
        return

    # The regular TOC wins if both kinds are present.
    if toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
495
496
def Options():
    # type: () -> Any
    """Return a new optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, add_option() keyword arguments), in the order they're registered.
    option_specs = [
        ('--common-mark',
         dict(action='store_true',
              default=False,
              help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(action='store_true',
              default=False,
              help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(dest='disable_fastlex',
              action='store_true',
              default=False,
              help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]
    for flag, kwargs in option_specs:
        parser.add_option(flag, **kwargs)

    return parser
528
529
# Metadata used when the document's JSON doesn't override it: width 40 by
# default.
DEFAULT_META = dict(body_css_class='width40')
532
533
def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    Usage, after options are removed (argv[0] is the program name):

      cmark.py --common-mark        Markdown on stdin, plain cmark HTML on
                                    stdout; nothing else is done.
      cmark.py META_JSON CONTENT    Oils documentation: render CONTENT to
                                    stdout with the doc header and footer.
      cmark.py [META_JSON]          Filter for blog and benchmarks: stdin to
                                    stdout, with optional metadata.

    Metadata read from META_JSON is layered over DEFAULT_META.
    """
    o = Options()
    opts, argv = o.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)  # copy, so the module-level default isn't mutated

    if len(argv) == 3:  # It's Oils documentation
        with open(argv[1]) as f:
            meta.update(json.load(f))

        # Docs have a special header and footer.
        with open(argv[2]) as content_f:
            doc_html.Header(meta, sys.stdout, draft_warning=True)
            Render(opts, meta, content_f, sys.stdout)
            doc_html.Footer(meta, sys.stdout)
    else:
        # Filter for blog and for benchmarks.

        # Metadata is optional here.  Test for the argument directly, instead
        # of catching IndexError around the whole read, which would also hide
        # an IndexError raised while loading the JSON.
        if len(argv) > 1:
            with open(argv[1]) as f:
                meta.update(json.load(f))

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)


if __name__ == '__main__':
    main(sys.argv)