OILS / doctools / cmark.py View on Github | oilshell.org

519 lines, 292 significant
1#!/usr/bin/env python2
2"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3and insert anchors.
4
5I started from cmark-0.28.3/wrappers/wrapper.py.
6"""
7from __future__ import print_function
8
9import ctypes
10import HTMLParser
11import json
12import optparse
13import os
14import pprint
15import sys
16
17from doctools import html_lib
18from doctools import doc_html # templates
19from doctools import oils_doc
20from doctools import ul_table
21from lazylex import html as lazylex_html
22
23from typing import Any
24
25# Geez find_library returns the filename and not the path? Just hardcode it as
26# a workaround.
27# https://bugs.python.org/issue21042
28
29#from ctypes.util import find_library
30#libname = find_library("cmark")
31#assert libname, "cmark not found"
32
33# There's some ongoing discussion about how to deal with the same in Nix.
34# I think normally you'd just patch/substitute this path during the Nix build.
35# See note in shell.nix
# Directory of the running script (sys.argv[0]); one candidate library path
# below is resolved relative to it.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations of libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# The C prototype is
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# so declare the length as size_t and the options as int.  (c_long only
# matches those on platforms where long happens to have the same width.)
#
# NOTE(review): with restype c_char_p, ctypes copies the result into a Python
# string and the C buffer returned by cmark is never freed.  Presumably
# acceptable for a short-lived command line tool -- confirm.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
56
57
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logging to stderr, currently switched off.

    The message is still %-formatted when args are given, so a format string
    that doesn't match its arguments raises even though nothing is printed.
    """
    formatted = (msg % args) if args else msg

    if 0:  # change to 1 to see debug output
        print(formatted, file=sys.stderr)
65
66
# Option bit passed as the 'options' argument of cmark_markdown_to_html().
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = (1 << 17)
69
70
def md2html(text):
    """Convert Markdown to HTML with libcmark.

    Args:
      text: Markdown source.  A byte string is handed to cmark unchanged.  Any
        other string (unicode) is encoded as UTF-8 first, because the length
        argument of cmark_markdown_to_html() is a BYTE count, and len() of a
        unicode string counts characters.

    Returns:
      The result of cmark_markdown_to_html(), called with CMARK_OPT_UNSAFE so
      that raw HTML in the input is kept.
    """
    if not isinstance(text, bytes):  # 'bytes' is an alias of 'str' in Python 2
        text = text.encode('utf-8')

    return markdown(text, len(text), CMARK_OPT_UNSAFE)
75
76
def demo():
    """Render a tiny Markdown snippet and write the HTML to stdout."""
    html = md2html('*hi*')
    sys.stdout.write(html)
79
80
class TocExtractor(HTMLParser.HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth.  Only used to indent the debug log output.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we're between an opening and closing h_tag.
        self.capturing = False

        # Flat list of (line_num, tag, css_id, html_parts, text_parts)
        # - css_id is the heading's id="" attribute, or None
        # - html_parts is like innerHTML.  There can be <code> annotations
        #   and so forth.
        # - text_parts is only the character data, without tags or entities
        self.headings = []

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            # Note: the attribute list must match exactly, i.e. the <div> has
            # no attributes other than id.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            # record the text inside <h2></h2> etc.  This is set AFTER the
            # check above, so the heading's own opening tag isn't captured.
            self.capturing = True

    def handle_endtag(self, tag):
        # Debug print.  Every </div> decrements, but only the TOC <div>s
        # increment, so this can go negative.  It only affects log output.
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            # Cleared BEFORE the check below, so the heading's own closing tag
            # isn't captured.
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
        This method is called to process a named character reference of the
        form &name; (e.g. &gt;), where name is a general entity reference
        (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """Keep numeric character references like &#39; or &#x27; in headings.

        The parser passes what's between '&#' and ';', e.g. '39' or 'x27'.
        Without this override they would be dropped from the TOC entry.  As
        in handle_entityref, only the HTML is extended, not the plain text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
180
181
# CSS class of a TOC entry, keyed by the heading tag it links to.
TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}

# We could just add <h2 id="foo"> attribute! I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
188
189
def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
                       preserve_anchor_case):
    """Compute the text insertions for the flat TOC style.

    The TOC is a flat list of <div>s like

      <div class="toclevel2"><a href="#introduction">Introduction</a></div>

    and the indentation is done with CSS.

    Args:
      opts: Parsed command line options.  Only toc_pretty_href is read.
      toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC.  But we make
        <a name=""> targets for all headings.
      headings: List of (line_num, tag, css_id, html_parts, text_parts).
      toc_pos: Line number of the TOC <div>.  The TOC goes on the next line.
      preserve_anchor_case: Passed to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html) pairs.  The first one is the TOC itself, and
      the rest are the <a name=""> targets, one pair per heading.
    """
    toc_html = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for index, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading
        css_class = TAG_TO_CSS[tag]

        # Each heading has up to three names: the numeric one, the pretty one
        # derived from its text, and an explicit id="" written by the user.
        numeric_href = 'toc_%d' % index
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # The TOC links to the explicit ID if there is one (A FEW OLD BLOG
        # POSTS USE it), and otherwise to the pretty name.  The numeric name
        # is still a target, but it's never linked from the TOC.
        toc_href = css_id if css_id else pretty_href

        entry = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
            css_class, toc_href, ''.join(html_parts))
        if tag in toc_tags:
            toc_html.append(entry)

        if opts.toc_pretty_href:  # NEW WAY
            names = [pretty_href]
        elif css_id:  # Old blog explicit
            names = [css_id, numeric_href]
        else:  # Old blog implicit.  Include the NEW WAY too
            names = [pretty_href, numeric_href]

        anchors = ''.join([ANCHOR_FMT % name for name in names])
        anchor_insertions.append((line_num, anchors))

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    toc_insert = (toc_pos + 1, ''.join(toc_html))
    return [toc_insert] + anchor_insertions
253
254
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """Compute the text insertions for the dense-toc style with columns.

    Used by doc/ref.  Instead of one <div> per heading, h2 headings and the h3
    headings below them are grouped together:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

      <div class="dense-toc-group">
        <a ...> Level 1 </a> <br/>
        <a class="dense-toc-h3" ...> 1.A </a> <br/>
        <a class="dense-toc-h3" ...> 1.B </a> <br/>
      </div>   # NO BREAKING within this div

      <div class="dense-toc-group">
        <a ...> Level 2 </a> <br/>
      </div>

      </div>

    Args:
      headings: List of (line_num, tag, css_id, html_parts, text_parts).
      toc_pos: Line number of the TOC <div>.  The TOC goes on the next line.
      preserve_anchor_case: Passed to html_lib.PrettyHref().

    Returns:
      A list of (line_num, html) pairs.  The first one is the TOC itself, and
      the rest are <a name=""> targets, one per heading.

    Raises:
      AssertionError: if an h3 appears before the first h2.
    """
    # Two level tree: list of (h2_html, h2_href, [(h3_html, h3_href), ...])
    groups = []
    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>.  Otherwise link to the
        # pretty name.
        toc_href = css_id or pretty_href
        link_html = ''.join(html_parts)

        if tag == 'h2':
            groups.append((link_html, toc_href, []))
        elif tag == 'h3':
            assert groups, "h3 shouldn't come before any h2"
            _, _, siblings = groups[-1]
            siblings.append((link_html, toc_href))

        # Every heading gets a target <a name="">, including the ones that
        # aren't shown in the tree.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_html = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_html.append('<div class="dense-toc-group">\n')
        toc_html.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_html.extend([
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children
        ])
        toc_html.append('</div>\n')
    toc_html.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_html))
    log('')

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    toc_insert = (toc_pos + 1, ''.join(toc_html))
    return [toc_insert] + anchor_insertions
346
347
348def _ApplyInsertions(lines, insertions, out_file):
349 assert insertions, "Should be at least one insertion"
350 j = 0
351 n = len(insertions)
352
353 for i, line in enumerate(lines):
354 current_line = i + 1 # 1-based
355
356 if j < n:
357 line_num, s = insertions[j]
358 if current_line == line_num:
359 out_file.write(s)
360 j += 1
361
362 out_file.write(line)
363
364
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert a Markdown document to HTML with a TOC, and write it out.

    Args:
      opts: Parsed command line options.  Reads code_block_output, toc_tags
        and (through _MakeTocInsertions) toc_pretty_href.
      meta: Dict of document metadata.  Reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: File object with the Markdown source.
      out_file: File object that receives the HTML.
      use_fastlex: If true, run the oils_doc / ul_table passes over the HTML
        before looking for headings.
      debug_out: Optional list, passed on to oils_doc.HighlightCode().

    If the HTML has neither <div id="toc"> nor <div id="dense-toc">, it is
    written out without any insertions.
    """
    if debug_out is None:
        debug_out = []

    # Markdown -> HTML
    html = md2html(in_file.read())

    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    opts.code_block_output)
                oils_doc.ExtractCode(html, code_f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            # Say which file it was, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks, including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    has_toc = parser.toc_begin_line != -1
    has_dense_toc = parser.dense_toc_begin_line != -1

    if not has_toc and not has_dense_toc:
        out_file.write(html)  # No TOC placeholder found: pass through
        return

    if has_toc:  # takes priority if both are present
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        parser.toc_begin_line,
                                        preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(parser.headings,
                                             parser.dense_toc_begin_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
442
443
def Options():
    """Return the optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    parser.add_option(
        '--common-mark',
        action='store_true',
        default=False,
        help='Only do CommonMark conversion')

    parser.add_option(
        '--toc-pretty-href',
        action='store_true',
        default=False,
        help='Generate textual hrefs #like-this rather than like #toc10')

    # Can be passed more than once.  The values accumulate in a list.
    parser.add_option(
        '--toc-tag',
        dest='toc_tags',
        action='append',
        default=[],
        help='h tags to include in the TOC, e.g. h2 h3')

    parser.add_option(
        '--disable-fastlex',
        dest='disable_fastlex',
        action='store_true',
        default=False,
        help='Hack for old blog posts')

    parser.add_option(
        '--code-block-output',
        dest='code_block_output',
        default=None,
        help='Extract and print code blocks to this file')

    return parser
474
475
# Metadata that main() starts from, before values loaded from the JSON
# metadata file are merged on top.
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}
478
479
def main(argv):
    """Command line entry point.

    There are three modes:

      cmark.py --common-mark          stdin -> stdout, CommonMark only
      cmark.py META_JSON CONTENT_MD   Oils docs, with header and footer
      cmark.py [META_JSON]            filter for blog and benchmarks

    Args:
      argv: The full argument list, including the program name.
    """
    opts, argv = Options().parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Old style for blog, and for benchmarks: it's a filter.
        # Metadata is optional here.
        try:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))
        except IndexError:
            pass

        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation.
    meta_path, content_path = argv[1], argv[2]

    with open(meta_path) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(content_path) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)