OILS / doctools / cmark.py View on Github | oils.pub

568 lines, 325 significant
1#!/usr/bin/env python2
2"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
3and insert anchors.
4
5I started from cmark-0.28.3/wrappers/wrapper.py.
6"""
7from __future__ import print_function
8
9import ctypes
10from typing import List
11from typing import Tuple
12from typing import Union
13from typing import Optional
14from typing import IO
15from typing import Dict
16try:
17 from HTMLParser import HTMLParser
18except ImportError:
19 # python3
20 from html.parser import HTMLParser # type: ignore
21import json
22import optparse
23import os
24import pprint
25import sys
26
27from doctools import html_lib
28from doctools import doc_html # templates
29from doctools import oils_doc
30from doctools import ul_table
31from lazylex import html as lazylex_html
32
33if sys.version_info.major == 2:
34 from typing import Any
35
# Locate and load libcmark, then bind cmark_markdown_to_html() through ctypes.
#
# ctypes.util.find_library() returns only the file name and not the path, so
# the candidate locations are hard-coded below as a workaround.
# https://bugs.python.org/issue21042

#from ctypes.util import find_library
#libname = find_library("cmark")
#assert libname, "cmark not found"

# There's some ongoing discussion about how to deal with the same in Nix.
# I think normally you'd just patch/substitute this path during the Nix build.
# See note in shell.nix
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidates, in priority order: environment override, path relative to the
# script being run, then a fixed install location.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
    libname = cmark1
elif os.path.exists(cmark2):
    libname = cmark2
elif os.path.exists(cmark3):
    libname = cmark3
else:
    raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C signature:
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# The argument types must mirror it: the length is a size_t and the options
# bit mask is an int (not two longs).
#
# NOTE(review): with restype c_char_p, ctypes copies the result into a Python
# string and the C buffer is never freed; cmark documents that the caller owns
# that buffer.  Acceptable for a short-lived doc tool -- confirm if reused in a
# long-running process.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
67
68
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger: %-format msg with args (if any) and print it to stderr.

    Printing is switched off by the hard-coded `if 0` below; flip it to 1 to
    see the output.  The formatting step still runs when args are given.
    """
    text = (msg % args) if args else msg

    if 0:
        print(text, file=sys.stderr)
76
77
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = (1 << 17)


def md2html(md):
    # type: (str) -> str
    """Convert a Markdown string to an HTML string using libcmark.

    The conversion is done with CMARK_OPT_UNSAFE set.

    Args:
      md: Markdown source.  On Python 3 it is encoded as UTF-8 before being
          handed to the C library; on Python 2 it is passed through as-is.

    Returns:
      The HTML produced by cmark (decoded from UTF-8 on Python 3).
    """
    if sys.version_info.major == 2:
        md_bytes = md
    else:
        md_bytes = md.encode('utf-8')

    # cmark takes the length of the buffer in BYTES.  Measure the encoded
    # buffer, not the text: with non-ASCII input the character count is
    # smaller than the byte count and the tail of the document would be cut
    # off.
    md_len = len(md_bytes)
    html = markdown(md_bytes, md_len, CMARK_OPT_UNSAFE)

    if sys.version_info.major == 2:
        return html
    else:
        return html.decode('utf-8')
96
97
def demo():
    """Smoke test: render a tiny Markdown snippet and write the HTML to stdout."""
    rendered = md2html('*hi*')
    sys.stdout.write(rendered)
100
101
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # Python 3's HTMLParser defaults to convert_charrefs=True, which
        # decodes &lt; / &quot; / &#39; into plain text and never calls
        # handle_entityref() / handle_charref().  That would put unescaped
        # characters into the captured heading HTML.  Turn it off so entities
        # are preserved the same way on Python 2 and 3.  (Python 2's parser
        # has no such option, so the attribute is simply unused there.)
        self.convert_charrefs = False

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth, used only for the debug log() output.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between <hN> and </hN>.
        self.capturing = False

        # Flat list of (line_num, tag, css_id, html_parts, text_parts).
        # - html_parts joined is like innerHTML.  There can be <code>
        #   annotations and so forth.
        # - text_parts holds only the character data.
        # - css_id is the heading's id="" attribute, or None if absent.
        self.headings = [
        ]  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Record TOC marker divs and heading starts; capture nested tags."""
        if tag == 'div':
            # Only a div whose sole attribute is the id counts as a marker.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            # Set AFTER the capture check above, so the <hN> tag itself is not
            # part of the captured HTML.
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing at </hN>; otherwise capture nested end tags."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            # Cleared BEFORE the check below, so </hN> is not captured.
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the form
          &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """Keep numeric character references (&#39; or &#x27;) in heading HTML.

        name is the part between '&#' and ';'.  Like handle_entityref(), this
        only contributes to the captured HTML, not to the plain text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        """Accumulate character data as both HTML and text while in a heading."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
208
209
210TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}
211
212# We could just add <h2 id="foo"> attribute! I didn't know those are valid
213# anchors.
214# But it's easier to insert an entire line, rather than part ofa line.
215ANCHOR_FMT = '<a name="%s"></a>\n'
216
217
218def _MakeTocInsertions(
219 opts, # type: Any
220 toc_tags, # type: Union[List[str], Tuple[str, str]]
221 headings, # type: List[Tuple[int, str, None, List[str], List[str]]]
222 toc_pos, # type: int
223 preserve_anchor_case, # type: bool
224):
225 # type: (...) -> List[Tuple[int, str]]
226 """Given extract headings list and TOC position, return a list of insertions.
227
228 The insertions <div> for the TOC itself, and <a name=""> for the targets.
229
230 Args:
231 toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
232 all of them.
233 """
234 # Example:
235 # <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
236 #
237 # Yeah it's just a flat list, and then indentation is done with CSS. Hm
238 # that's easy.
239
240 toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
241 insertions = []
242
243 i = 0
244 for line_num, tag, css_id, html_parts, text_parts in headings:
245 css_class = TAG_TO_CSS[tag]
246
247 # Add BOTH href, for stability.
248 numeric_href = 'toc_%d' % i
249
250 # If there was an explicit CSS ID written by the user, use that as the href.
251 # I used this in the blog a few times.
252
253 pretty_href = html_lib.PrettyHref(
254 ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
255
256 if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
257 toc_href = css_id
258 else:
259 # Always use the pretty version now. The old numeric version is still a
260 # target, but not in the TOC.
261 toc_href = pretty_href
262
263 line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
264 css_class, toc_href, ''.join(html_parts))
265 if tag in toc_tags:
266 toc_lines.append(line)
267
268 targets = []
269 if opts.toc_pretty_href: # NEW WAY
270 targets.append(ANCHOR_FMT % pretty_href)
271 elif css_id: # Old blog explicit
272 targets.append(ANCHOR_FMT % css_id)
273 targets.append(ANCHOR_FMT % numeric_href)
274 else: # Old blog implicit
275 targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
276 targets.append(ANCHOR_FMT % numeric_href)
277
278 insertions.append((line_num, ''.join(targets)))
279
280 i += 1
281
282 # +1 to insert AFTER the <div>
283 toc_insert = (toc_pos + 1, ''.join(toc_lines))
284 insertions.insert(0, toc_insert) # The first insertion is TOC
285
286 return insertions
287
288
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """Build insertions for the dense-toc style with columns, used by doc/ref

    The plain TOC style emits one div per heading:

        <div id="toctitle">Table of Contents</div>

        <div class="toclevel1"><a ...> Level 1 </a></div>
        <div class="toclevel2"><a ...> 1.A </a></div>
        <div class="toclevel2"><a ...> 1.B </a></div>
        <div class="toclevel1"><a ...> Level 2 </a></div>
        ...

    Here each h2 and its h3 children are grouped into one div instead:

        <div id="dense-toc-title">In This Chapter</div>
        <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>

          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>

        </div>  # NO BREAKING within this div

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

        </div>

    Args:
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples.
      toc_pos: line number of the dense TOC marker <div>.
      preserve_anchor_case: forwarded to html_lib.PrettyHref().

    Returns:
      A list of (line_num, text) pairs; the first pair is the TOC, the rest
      are <a name=""> targets, one per heading.
    """
    # Two level tree: a list of (h2_html, h2_href, [(h3_html, h3_href), ...])
    groups = []
    open_group = None

    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty version.
        toc_href = css_id or pretty_href

        inner_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (inner_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((inner_html, toc_href))
        # Other heading tags are not listed in the TOC, but still get a target.

        # Insert the target <a name="">
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (child_href, child_html) for child_html, child_href in children)
        toc_lines.append('</div>\n')
    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    toc_insertion = (toc_pos + 1, ''.join(toc_lines))
    return [toc_insertion] + anchor_insertions
385
386
387def _ApplyInsertions(lines, insertions, out_file):
388 # type: (List[str], List[Tuple[int, str]], IO[str]) -> None
389 assert insertions, "Should be at least one insertion"
390 j = 0
391 n = len(insertions)
392
393 for i, line in enumerate(lines):
394 current_line = i + 1 # 1-based
395
396 if j < n:
397 line_num, s = insertions[j]
398 if current_line == line_num:
399 out_file.write(s)
400 j += 1
401
402 out_file.write(line)
403
404
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Render Markdown from in_file as HTML (with TOC and anchors) to out_file.

    Steps: Markdown -> HTML via md2html(); optional oils_doc / ul_table
    processing; heading extraction with TocExtractor; then the TOC and the
    <a name=""> targets are spliced into the HTML.  If the HTML has neither a
    <div id="toc"> nor a <div id="dense-toc">, it is written out unchanged.

    Args:
      opts: parsed options; reads code_block_output and toc_tags here.
      meta: document metadata; reads 'default_highlighter' and
            'preserve_anchor_case'.
      in_file: Markdown input.
      out_file: HTML output.
      use_fastlex: when false, skip the oils_doc / ul_table processing.
      debug_out: passed through to oils_doc.HighlightCode(); a fresh list is
                 used when None.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = md2html(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    code_path)
                oils_doc.ExtractCode(html, code_f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark seems
        # to require?
        for marker in ('<p><pstrip>', '</pstrip></p>'):
            html = html.replace(marker, '')

        try:
            html = ul_table.ReplaceTables(html)
        except lazylex_html.ParseError:
            # Say which input failed, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    plain_toc_line = extractor.toc_begin_line
    dense_toc_line = extractor.dense_toc_begin_line

    if plain_toc_line == -1 and dense_toc_line == -1:
        # No TOC marker found: pass through
        out_file.write(html)
        return

    # The plain TOC marker takes precedence when both are present.
    if plain_toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        plain_toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
490
491
def Options():
    # type: () -> Any
    """Return a new optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, add_option keyword arguments), in the order they are registered.
    # Built inside the function so each parser gets its own default list.
    option_specs = [
        ('--common-mark', {
            'action': 'store_true',
            'default': False,
            'help': 'Only do CommonMark conversion',
        }),
        ('--toc-pretty-href', {
            'action': 'store_true',
            'default': False,
            'help': 'Generate textual hrefs #like-this rather than like #toc10',
        }),
        ('--toc-tag', {
            'dest': 'toc_tags',
            'action': 'append',
            'default': [],
            'help': 'h tags to include in the TOC, e.g. h2 h3',
        }),
        ('--disable-fastlex', {
            'dest': 'disable_fastlex',
            'action': 'store_true',
            'default': False,
            'help': 'Hack for old blog posts',
        }),
        ('--code-block-output', {
            'dest': 'code_block_output',
            'default': None,
            'help': 'Extract and print code blocks to this file',
        }),
    ]
    for flag, kwargs in option_specs:
        parser.add_option(flag, **kwargs)

    return parser
523
524
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}


def main(argv):
    # type: (List[str]) -> None
    """Command line entry point.

    argv is the full argument vector, including the program name at index 0.

    Modes:
    - --common-mark: convert stdin to HTML on stdout and nothing else.
    - Two positional args (metadata JSON, content file): Oils documentation,
      wrapped in the doc_html header and footer.
    - Otherwise: a stdin -> stdout filter, with an optional metadata JSON
      path as the first positional arg.
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(md2html(sys.stdin.read()))
        return

    # Start from a copy of the defaults; JSON metadata overrides it.
    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter for blog and for benchmarks.

        # Metadata is optional here
        try:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # It's Oils documentation
    with open(argv[1]) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)