OILS / doctools / cmark.py View on Github | oils.pub

532 lines, 303 significant
1#!/usr/bin/env python2
2"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3and insert anchors.
4
5I started from cmark-0.28.3/wrappers/wrapper.py.
6"""
7from __future__ import print_function
8
9import ctypes
10try:
11 from HTMLParser import HTMLParser
12except ImportError:
13 from html.parser import HTMLParser # python3
14import json
15import optparse
16import os
17import pprint
18import sys
19
20from doctools import html_lib
21from doctools import doc_html # templates
22from doctools import oils_doc
23from doctools import ul_table
24from lazylex import html as lazylex_html
25
26if sys.version_info.major == 2:
27 from typing import Any
28
# Geez find_library returns the filename and not the path?  Just hardcode it as
# a workaround.
# https://bugs.python.org/issue21042

#from ctypes.util import find_library
#libname = find_library("cmark")
#assert libname, "cmark not found"

# There's some ongoing discussion about how to deal with the same in Nix.
# I think normally you'd just patch/substitute this path during the Nix build.
# See note in shell.nix
#
# NOTE: this is the directory of the *script being run* (sys.argv[0]), not
# necessarily the directory of this module.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# The C declaration is:
#
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
#
# so the length is a size_t and the options are an int.  Declaring both as
# c_long only happens to work where long, size_t and the argument registers
# all line up.
#
# NOTE(review): with restype c_char_p, ctypes copies the result into a Python
# string and the C buffer is never released -- presumably acceptable for a
# short-lived process; confirm before using this in a long-running one.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
60
61
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger that is switched off by default.

    The message is still %-formatted with args (when any are given), so a bad
    format string fails even though nothing is printed.  Flip the constant
    below to see the output on stderr.
    """
    text = (msg % args) if args else msg

    if 0:
        print(text, file=sys.stderr)
69
70
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = (1 << 17)


def md2html(md):
    """Convert a Markdown string to an HTML string with libcmark.

    Raw HTML in the input is passed through (CMARK_OPT_UNSAFE).

    Args:
      md: Markdown source.  Under Python 2 it is handed to cmark as-is; under
          Python 3 it is a str that gets encoded as UTF-8.

    Returns:
      The HTML produced by cmark: the raw result under Python 2, a str decoded
      from UTF-8 under Python 3.
    """
    if sys.version_info.major == 2:
        md_bytes = md
    else:
        md_bytes = md.encode('utf-8')

    # cmark takes the length of the buffer in BYTES.  Under Python 3, len(md)
    # counts characters, which is too small whenever the document has
    # non-ASCII text, and the tail of the document would be cut off.
    md_len = len(md_bytes)
    html = markdown(md_bytes, md_len, CMARK_OPT_UNSAFE)

    if sys.version_info.major == 2:
        return html
    else:
        return html.decode('utf-8')
88
89
def demo():
    """Smoke test: render a tiny Markdown snippet and write the HTML to stdout."""
    rendered = md2html('*hi*')
    sys.stdout.write(rendered)
92
93
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">

    Each entry of self.headings is a tuple

      (line_num, tag, css_id, html_parts, text_parts)

    where html_parts accumulates the heading's inner HTML (entities are kept
    escaped) and text_parts accumulates only its plain text data.
    """

    def __init__(self):
        # Python 3's HTMLParser defaults to convert_charrefs=True.  In that mode
        # &lt; / &quot; etc. are decoded and delivered through handle_data(), so
        # handle_entityref() is never called and the decoded characters would be
        # copied into the heading HTML unescaped.  Turn it off so both Python
        # versions take the same path.  (Python 2's __init__ has no such flag.)
        if sys.version_info.major == 2:
            HTMLParser.__init__(self)
        else:
            HTMLParser.__init__(self, convert_charrefs=False)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth, used only for debug logging.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we're between <hN> and </hN>
        self.capturing = False

        # Flat list of (line_num, tag, id, HTML parts, text parts)
        # HTML is like innerHTML.  There can be <code> annotations and so forth.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        """Record TOC placeholder divs and headings; capture nested tags."""
        if tag == 'div':
            # Only a div whose sole attribute is the id counts as a placeholder.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        # This comes AFTER the capture above, so the <hN> tag itself is not part
        # of the captured HTML.
        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        """Stop capturing at </hN>; otherwise capture the closing tag."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        # This comes BEFORE the capture below, so </hN> itself is not captured.
        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
          This method is called to process a named character reference of the form
          &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        # The reference goes into the HTML only, not into the plain text.
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """Called for numeric character references like &#39; or &#x27;.

        name is the part between '&#' and ';'.  Without this handler such
        references would silently disappear from the captured heading HTML.
        Like handle_entityref(), it adds to the HTML only, not the plain text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        """Accumulate text inside a heading, as both HTML and plain text."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
193
194
# CSS class of the TOC entry generated for each heading tag.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# We could just add an <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
201
202
def _MakeTocInsertions(opts, toc_tags, headings, toc_pos,
                       preserve_anchor_case):
    """Build the insertions for the plain (one <div> per entry) TOC style.

    The result is a list of (line_num, html) pairs: first the TOC itself, then
    one group of <a name=""> targets per heading.

    Example of a TOC entry:

      <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>

    It's a flat list; the indentation is done with CSS.

    Args:
      opts: parsed options; only opts.toc_pretty_href is read
      toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC.  But we LINK to
                all of them.
      headings: list of (line_num, tag, css_id, html_parts, text_parts)
      toc_pos: line number of the TOC <div>; the TOC goes on the line after it
      preserve_anchor_case: passed through to html_lib.PrettyHref()
    """
    toc_html = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for index, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading
        css_class = TAG_TO_CSS[tag]

        # Every heading has a numeric name and a pretty one derived from its
        # text.
        numeric_href = 'toc_%d' % index
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # A FEW OLD BLOG POSTS USE an explicit CSS ID, which wins.  Otherwise
        # the TOC always links to the pretty name; the numeric one is only a
        # target.
        toc_href = css_id if css_id else pretty_href

        if tag in toc_tags:
            toc_html.append(' <div class="%s"><a href="#%s">%s</a></div>\n' %
                            (css_class, toc_href, ''.join(html_parts)))

        if opts.toc_pretty_href:  # NEW WAY
            anchors = [pretty_href]
        elif css_id:  # Old blog explicit
            anchors = [css_id, numeric_href]
        else:  # Old blog implicit -- include the NEW WAY too
            anchors = [pretty_href, numeric_href]

        anchor_html = ''.join(ANCHOR_FMT % href for href in anchors)
        anchor_insertions.append((line_num, anchor_html))

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    toc_insertion = (toc_pos + 1, ''.join(toc_html))
    return [toc_insertion] + anchor_insertions
266
267
def _MakeTocInsertionsDense(headings, toc_pos, preserve_anchor_case):
    """For the dense-toc style with columns, used by doc/ref

    Unlike the plain style, which emits one <div> per heading, this groups
    each h2 with the h3's under it:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>  # NO BREAKING within this div

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

      </div>

    Args:
      headings: list of (line_num, tag, css_id, html_parts, text_parts)
      toc_pos: line number of the TOC <div>; the TOC goes on the line after it
      preserve_anchor_case: passed through to html_lib.PrettyHref()

    Returns:
      A list of (line_num, html) pairs: the TOC first, then one <a name="">
      target per heading.  Headings other than h2/h3 get a target but no TOC
      entry.
    """
    # Two level tree: [(h2_html, h2_href, [(h3_html, h3_href), ...]), ...]
    groups = []
    open_group = None

    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the pretty
        # name.
        toc_href = css_id or pretty_href

        inner_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (inner_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((inner_html, toc_href))

        # The target <a name=""> always uses the pretty name.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]

    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children)
        toc_lines.append('</div>\n')

    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    toc_insertion = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insertion] + anchor_insertions
359
360
def _ApplyInsertions(lines, insertions, out_file):
    """Write lines to out_file, with each insertion placed before its line.

    Args:
      lines: list of lines, each including its trailing newline
      insertions: non-empty list of (line_num, text).  line_num is 1-based, and
                  text is written immediately BEFORE that line.
      out_file: file-like object with write()

    The insertions don't have to be sorted, and several may target the same
    line; those are written in list order.  (A single forward-moving cursor
    would get stuck on the first insertion whose line had already gone by --
    e.g. a heading that appears before the TOC <div> -- and silently drop
    every insertion after it.)

    Insertions whose line_num is outside 1..len(lines) are ignored.
    """
    assert insertions, "Should be at least one insertion"

    # line_num -> [text, ...], preserving list order within a line
    by_line = {}
    for line_num, text in insertions:
        by_line.setdefault(line_num, []).append(text)

    for i, line in enumerate(lines):
        current_line = i + 1  # 1-based

        for text in by_line.get(current_line, ()):
            out_file.write(text)

        out_file.write(line)
376
377
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
    """Convert Markdown from in_file to HTML, add the TOC, write to out_file.

    Args:
      opts: parsed options; reads code_block_output, toc_tags, and (through
            _MakeTocInsertions) toc_pretty_href
      meta: dict of document metadata; reads 'default_highlighter' and
            'preserve_anchor_case'
      in_file: file-like object with the Markdown source
      out_file: file-like object that receives the HTML
      use_fastlex: if false, skip all the oils_doc / ul_table processing and
                   only do the Markdown conversion and the TOC
      debug_out: list passed to oils_doc.HighlightCode(); a fresh list is used
                 when omitted

    If the HTML has neither <div id="toc"> nor <div id="dense-toc">, it is
    written out unchanged.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    opts.code_block_output)
                oils_doc.ExtractCode(html, code_f)

        html = oils_doc.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark seems
        # to require?
        html = html.replace('<p><pstrip>', '')
        html = html.replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            # Say which file it was, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        default_highlighter = meta.get('default_highlighter')
        html = oils_doc.HighlightCode(html,
                                      default_highlighter,
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags if opts.toc_tags else ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    # The plain TOC wins if the document somehow has both placeholders.
    if extractor.toc_begin_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        extractor.toc_begin_line,
                                        preserve_anchor_case)
    elif extractor.dense_toc_begin_line != -1:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             extractor.dense_toc_begin_line,
                                             preserve_anchor_case)
    else:
        # No TOC placeholder found: pass through
        out_file.write(html)
        return

    log('')
    log('*** Text Insertions:')
    for insertion in insertions:
        log(insertion)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
455
456
def Options():
    """Return the optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, keyword arguments for add_option), in the order they show up in
    # --help
    flag_specs = [
        ('--common-mark',
         dict(action='store_true',
              default=False,
              help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(action='store_true',
              default=False,
              help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(dest='disable_fastlex',
              action='store_true',
              default=False,
              help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]
    for flag, kwargs in flag_specs:
        parser.add_option(flag, **kwargs)

    return parser
487
488
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}


def main(argv):
    """Command line entry point.

    argv is the full argument vector, including the program name at argv[0].
    After the flags are removed, what remains selects the mode:

      cmark.py META_JSON CONTENT_MD   Oils documentation: read the file, and
                                      wrap the result in the doc header/footer
      cmark.py [META_JSON]            filter for blog and benchmarks: stdin to
                                      stdout, metadata optional

    With --common-mark, stdin is converted to HTML and nothing else is done.
    """
    opts, argv = Options().parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Old style for blog: it's a filter.  Metadata is optional here.
        if len(argv) > 1:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))

        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation
    with open(argv[1]) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)