OILS / doctools / cmark.py View on Github | oils.pub

573 lines, 325 significant
1#!/usr/bin/env python2
2"""Convert Markdown to HTML, with our enhancements
3
4- Parse the HTML
5- insert a TOC
6- <pstrip> hack - this is obsolete with ul-table?
7- Expand $xref links
8- Highlight code blocks
9
10I started from cmark-0.28.3/wrappers/wrapper.py.
11"""
12from __future__ import print_function
13
14import ctypes
15from typing import List
16from typing import Tuple
17from typing import Union
18from typing import Optional
19from typing import IO
20from typing import Dict
21try:
22 from HTMLParser import HTMLParser
23except ImportError:
24 # python3
25 from html.parser import HTMLParser # type: ignore
26import json
27import optparse
28import os
29import pprint
30import sys
31
32from doctools import html_lib
33from doctools import doc_html # templates
34from doctools import oils_doc
35from doctools import ul_table
36from lazylex import html as lazylex_html
37
38if sys.version_info.major == 2:
39 from typing import Any
40
41# Geez find_library returns the filename and not the path? Just hardcode it as
42# a workaround.
43# https://bugs.python.org/issue21042
44
45#from ctypes.util import find_library
46#libname = find_library("cmark")
47#assert libname, "cmark not found"
48
49# There's some ongoing discussion about how to deal with the same in Nix.
50# I think normally you'd just patch/substitute this path during the Nix build.
51# See note in shell.nix
# Candidate paths are resolved relative to the directory of the script being
# run (sys.argv[0]), not relative to this module's own location.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Places to look for libcmark.so, in priority order:
#   1. a path exported by the Nix shell (may be unset)
#   2. the in-tree dependency directory
#   3. the "wedge" install location
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype:
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# Declare the argument types to match it exactly, rather than relying on
# 'long' happening to be wide enough for size_t / compatible with int.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
72
73
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger that is switched off by default.

    The message is still %-formatted when args are given, so a bad format
    string raises here even though nothing is printed.  Flip the `if 0` below
    to get output on stderr.
    """
    text = msg % args if args else msg

    if 0:
        print(text, file=sys.stderr)
81
82
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = (1 << 17)


def md2html(md):
    # type: (str) -> str
    """Convert Markdown text to HTML with libcmark.

    Args:
      md: Markdown source.  On Python 2 it is handed to cmark unchanged; on
          Python 3 it is encoded as UTF-8 first.

    Returns:
      The HTML produced by cmark_markdown_to_html().  On Python 3 the result
      is decoded from UTF-8.
    """
    if sys.version_info.major == 2:
        md_bytes = md
    else:
        md_bytes = md.encode('utf-8')

    # cmark takes the length of the buffer in BYTES.  Measuring the unencoded
    # string would be too short for any non-ASCII input on Python 3, which
    # truncates the end of the document.
    html = markdown(md_bytes, len(md_bytes), CMARK_OPT_UNSAFE)

    if sys.version_info.major == 2:
        return html
    return html.decode('utf-8')
101
102
def demo():
    # type: () -> None
    """Smoke test: render a tiny Markdown snippet and write it to stdout."""
    rendered = md2html('*hi*')
    sys.stdout.write(rendered)
105
106
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">

    Results are read from these attributes after feed():
      toc_begin_line        line of <div id="toc">, or -1 if not seen
      dense_toc_begin_line  line of <div id="dense-toc">, or -1 if not seen
      headings              list of (line_num, tag, id, html_parts, text_parts)
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # Python 3's HTMLParser decodes character references by default and
        # hands the decoded text to handle_data().  That would put a raw '<'
        # into the captured heading HTML for '&lt;', and handle_entityref()
        # would never run.  Turn it off so both Python versions behave the
        # same.  On Python 2 this is just an unused attribute.
        self.convert_charrefs = False

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Only used to indent the debug log output.
        self.indent = 0

        # The TOC will be inserted after this.
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between <hN> and </hN>.
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts).
        # html_parts joined is like innerHTML.  There can be <code>
        # annotations and so forth.
        # id is optional -- it can be used for generating headings.
        self.headings = []  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Record TOC placeholder divs and the start of each heading."""
        if tag == 'div':
            # The placeholder must have exactly one attribute to match.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Tags nested in a heading are copied into its HTML.
        # Can't have nested <a> tags, because the TOC entry is itself an <a>.
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            # Use the first id="" attribute, if any.
            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing at </hN>; copy other end tags into the heading."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            # Cleared BEFORE the check below, so </hN> itself is not captured.
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        # Only the HTML is extended; the plain text used for hrefs is not.
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """Keep numeric character references like &#39; or &#x27; in headings.

        Without this, the base class silently drops them from the TOC entry.
        Like handle_entityref(), only the HTML is extended.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        """Append text inside a heading to both its HTML and its plain text."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
213
214
215TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}
216
217# We could just add <h2 id="foo"> attribute! I didn't know those are valid
218# anchors.
219# But it's easier to insert an entire line, rather than part ofa line.
220ANCHOR_FMT = '<a name="%s"></a>\n'
221
222
223def _MakeTocInsertions(
224 opts, # type: Any
225 toc_tags, # type: Union[List[str], Tuple[str, str]]
226 headings, # type: List[Tuple[int, str, None, List[str], List[str]]]
227 toc_pos, # type: int
228 preserve_anchor_case, # type: bool
229):
230 # type: (...) -> List[Tuple[int, str]]
231 """Given extract headings list and TOC position, return a list of insertions.
232
233 The insertions <div> for the TOC itself, and <a name=""> for the targets.
234
235 Args:
236 toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
237 all of them.
238 """
239 # Example:
240 # <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
241 #
242 # Yeah it's just a flat list, and then indentation is done with CSS. Hm
243 # that's easy.
244
245 toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
246 insertions = []
247
248 i = 0
249 for line_num, tag, css_id, html_parts, text_parts in headings:
250 css_class = TAG_TO_CSS[tag]
251
252 # Add BOTH href, for stability.
253 numeric_href = 'toc_%d' % i
254
255 # If there was an explicit CSS ID written by the user, use that as the href.
256 # I used this in the blog a few times.
257
258 pretty_href = html_lib.PrettyHref(
259 ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
260
261 if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
262 toc_href = css_id
263 else:
264 # Always use the pretty version now. The old numeric version is still a
265 # target, but not in the TOC.
266 toc_href = pretty_href
267
268 line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
269 css_class, toc_href, ''.join(html_parts))
270 if tag in toc_tags:
271 toc_lines.append(line)
272
273 targets = []
274 if opts.toc_pretty_href: # NEW WAY
275 targets.append(ANCHOR_FMT % pretty_href)
276 elif css_id: # Old blog explicit
277 targets.append(ANCHOR_FMT % css_id)
278 targets.append(ANCHOR_FMT % numeric_href)
279 else: # Old blog implicit
280 targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
281 targets.append(ANCHOR_FMT % numeric_href)
282
283 insertions.append((line_num, ''.join(targets)))
284
285 i += 1
286
287 # +1 to insert AFTER the <div>
288 toc_insert = (toc_pos + 1, ''.join(toc_lines))
289 insertions.insert(0, toc_insert) # The first insertion is TOC
290
291 return insertions
292
293
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """For the dense-toc style with columns, used by doc/ref

    Unlike the flat one-div-per-heading TOC, headings are grouped: each h2
    and the h3s that follow it go into one group <div>, so a group can be
    kept together by the stylesheet.  The emitted TOC looks like this:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">
      <div class="dense-toc-group">
       <a href="#...">  Level 1 </a> <br/>
       <a class="dense-toc-h3" href="#..."> 1.A </a> <br/>
       <a class="dense-toc-h3" href="#..."> 1.B </a> <br/>
      </div>
      <div class="dense-toc-group">
       <a href="#...">  Level 2 </a> <br/>
      </div>
      </div>

    Tags other than h2 and h3 don't appear in the TOC, but every heading
    still gets an <a name=""> target.

    Returns:
      (line_num, text) insertions.  The first one is the TOC, placed on the
      line after toc_pos; the rest are anchors, in heading order.
    """
    groups = []  # list of (h2_html, h2_href, [(h3_html, h3_href), ...])
    open_group = None
    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id or pretty_href

        entry_html = ''.join(html_parts)

        # Create a two level tree
        if tag == 'h2':
            open_group = (entry_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((entry_html, toc_href))

        # The target <a name=""> always uses the pretty href.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children)
        toc_lines.append('</div>\n')
    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    toc_insert = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insert] + anchor_insertions
390
391
392def _ApplyInsertions(lines, insertions, out_file):
393 # type: (List[str], List[Tuple[int, str]], IO[str]) -> None
394 assert insertions, "Should be at least one insertion"
395 j = 0
396 n = len(insertions)
397
398 for i, line in enumerate(lines):
399 current_line = i + 1 # 1-based
400
401 if j < n:
402 line_num, s = insertions[j]
403 if current_line == line_num:
404 out_file.write(s)
405 j += 1
406
407 out_file.write(line)
408
409
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Convert the Markdown in in_file to enhanced HTML written to out_file.

    Steps: CommonMark to HTML, the oils_doc / ul_table passes (when
    use_fastlex is set), then TOC and heading anchor insertion.  If the HTML
    has neither a <div id="toc"> nor a <div id="dense-toc">, it is written out
    without any insertions.

    Args:
      opts: Parsed flags; reads code_block_output and toc_tags here, and is
            passed on to _MakeTocInsertions().
      meta: Document metadata; reads 'default_highlighter' and
            'preserve_anchor_case'.
      in_file: Markdown source, read in full.
      out_file: Destination for the HTML.
      use_fastlex: Run the oils_doc / ul_table passes.
      debug_out: Passed to oils_doc.HighlightCode(); defaults to a new list.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    # NOTE(review): every step through HighlightCode() is treated as gated on
    # use_fastlex; confirm against the canonical indentation of this file.
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        if opts.code_block_output:
            with open(opts.code_block_output, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    opts.code_block_output)
                oils_doc.ExtractCode(html, code_f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            # Say which input failed, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    has_toc = extractor.toc_begin_line != -1
    has_dense_toc = extractor.dense_toc_begin_line != -1

    if not (has_toc or has_dense_toc):
        # No TOC placeholder found: pass through
        out_file.write(html)
        return

    # The regular TOC wins if both placeholders are present.
    if has_toc:
        edits = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                   extractor.toc_begin_line,
                                   preserve_anchor_case)
    else:
        edits = _MakeTocInsertionsDense(extractor.headings,
                                        extractor.dense_toc_begin_line,
                                        preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for edit in edits:
        log(edit)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, edits, out_file)
495
496
def Options():
    # type: () -> Any
    """Build the command line parser for cmark.py.

    Returns:
      An optparse.OptionParser.  Its parsed values have the attributes
      common_mark, toc_pretty_href, toc_tags, disable_fastlex and
      code_block_output.
    """
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, add_option keyword arguments), in the order they appear in --help
    option_specs = [
        ('--common-mark',
         dict(action='store_true',
              default=False,
              help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(action='store_true',
              default=False,
              help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(dest='disable_fastlex',
              action='store_true',
              default=False,
              help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]
    for flag, kwargs in option_specs:
        parser.add_option(flag, **kwargs)

    return parser
528
529
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}


def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    Three modes, chosen from the flags and the number of arguments left after
    option parsing (argv[0] is the program name):

    - --common-mark: plain CommonMark conversion from stdin to stdout.
    - exactly 2 positional args (metadata JSON, content file): an Oils doc,
      rendered with the doc header and footer.
    - anything else: a stdin -> stdout filter, with an optional metadata JSON
      file as the first positional arg.
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    # Copy, so the module-level defaults are never mutated.
    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter for blog and for benchmarks.

        # Metadata is optional here
        if len(argv) > 1:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation
    with open(argv[1]) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)