OILS / doctools / cmark.py View on Github | oils.pub

538 lines, 306 significant
1#!/usr/bin/env python2
2"""Convert Markdown to HTML, with our enhancements
3
4- Parse the HTML
5- insert a TOC
6- <pstrip> hack - this is obsolete with ul-table?
7- Expand $xref links
8- Highlight code blocks
9
10I started from cmark-0.28.3/wrappers/wrapper.py.
11"""
12from __future__ import print_function
13
14try:
15 from HTMLParser import HTMLParser
16except ImportError:
17 # python3
18 from html.parser import HTMLParser # type: ignore
19import json
20import optparse
21import os
22import pprint
23import subprocess
24import sys
25
26from doctools import html_lib
27from doctools import doc_html # templates
28from doctools import oils_doc
29from doctools import ul_table
30from data_lang import htm8
31
32if sys.version_info.major == 2:
33 from typing import Any, List, Dict, Tuple, Union, Optional, IO
34
35
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger for this module; output is currently switched off.

    When extra args are given, msg is %-formatted with them first (so a bad
    format string still raises).  The print to stderr sits behind a constant
    that is false; flip it to 1 to see the trace.
    """
    text = (msg % args) if args else msg

    if 0:
        print(text, file=sys.stderr)
43
44
# Absolute directory of the script named by sys.argv[0].
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Two places a cmark 0.29.0 wedge may live.  cmark_bin() tries the NEW one
# (relative to this_dir) first, then falls back to the OLD absolute path.
NEW_CMARK_WEDGE_DIR = os.path.join(this_dir,
                                   '../../oils.DEPS/wedge/cmark/0.29.0')
OLD_CMARK_WEDGE_DIR = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0'
49
50
def cmark_bin(md):
    # type: (str) -> str
    """Convert Markdown to HTML by piping it through the cmark binary.

    bin/cmark is looked up under NEW_CMARK_WEDGE_DIR first, then under
    OLD_CMARK_WEDGE_DIR.

    Args:
      md: Markdown source; it is written to cmark's stdin.

    Returns:
      What cmark wrote to stdout.

    Raises:
      AssertionError: if bin/cmark exists in neither wedge directory.
      RuntimeError: if cmark exits with a non-zero status.
    """
    for wedge_dir in (NEW_CMARK_WEDGE_DIR, OLD_CMARK_WEDGE_DIR):
        cmark_path = os.path.join(wedge_dir, 'bin/cmark')
        if os.path.exists(cmark_path):
            break
    else:
        raise AssertionError('bin/cmark not found')

    # Need to render raw HTML
    p = subprocess.Popen([cmark_path, '--unsafe'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    stdout, _ = p.communicate(input=md)

    # Don't hand back empty or truncated HTML as if the conversion succeeded.
    if p.returncode != 0:
        raise RuntimeError('%s exited with status %d' %
                           (cmark_path, p.returncode))
    return stdout
69
70
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">

    While inside a heading, the markup is re-serialized into the heading's
    html_parts (everything except <a> tags, since the TOC entry is itself a
    link) and the plain text into its text_parts.
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # Python 3's HTMLParser decodes character references before calling
        # handle_data() unless this is off, and then never calls
        # handle_entityref() / handle_charref().  We need the escaped form
        # (&lt; etc.) in the captured heading HTML, so force the Python 2
        # behavior.  On Python 2 this attribute is simply unused.
        self.convert_charrefs = False

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Only used for debug logging.
        # NOTE: it is incremented only for the toc / dense-toc <div>, but
        # decremented for every </div>, so it can go negative.
        self.indent = 0

        # The TOC will be inserted after this.
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between <hN> and </hN>
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # html_parts is like innerHTML.  There can be <code> annotations and
        # so forth.
        # id is optional -- it can be used for generating headings.
        self.headings = [
        ]  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        if tag == 'div':
            # Exact match: the div must have the id and no other attributes
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """Called for numeric character references &#NNN; and &#xNNN;

        name is the part between '&#' and ';', e.g. '39' or 'x27'.
        """
        # Same idea as handle_entityref(): without this, a numeric reference
        # inside a heading would be missing from the TOC entry.
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
177
178
# CSS class given to the TOC entry for each heading tag.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# Link target written on its own line before a heading.
# An id attribute on the heading itself (<h2 id="foo">) is a valid anchor too,
# but it's easier to insert an entire line than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
185
186
187def _MakeTocInsertions(
188 opts, # type: Any
189 toc_tags, # type: Union[List[str], Tuple[str, str]]
190 headings, # type: List[Tuple[int, str, None, List[str], List[str]]]
191 toc_pos, # type: int
192 preserve_anchor_case, # type: bool
193):
194 # type: (...) -> List[Tuple[int, str]]
195 """Given extract headings list and TOC position, return a list of insertions.
196
197 The insertions <div> for the TOC itself, and <a name=""> for the targets.
198
199 Args:
200 toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
201 all of them.
202 """
203 # Example:
204 # <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
205 #
206 # Yeah it's just a flat list, and then indentation is done with CSS. Hm
207 # that's easy.
208
209 toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
210 insertions = []
211
212 i = 0
213 for line_num, tag, css_id, html_parts, text_parts in headings:
214 css_class = TAG_TO_CSS[tag]
215
216 # Add BOTH href, for stability.
217 numeric_href = 'toc_%d' % i
218
219 # If there was an explicit CSS ID written by the user, use that as the href.
220 # I used this in the blog a few times.
221
222 pretty_href = html_lib.PrettyHref(
223 ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
224
225 if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
226 toc_href = css_id
227 else:
228 # Always use the pretty version now. The old numeric version is still a
229 # target, but not in the TOC.
230 toc_href = pretty_href
231
232 line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
233 css_class, toc_href, ''.join(html_parts))
234 if tag in toc_tags:
235 toc_lines.append(line)
236
237 targets = []
238 if opts.toc_pretty_href: # NEW WAY
239 targets.append(ANCHOR_FMT % pretty_href)
240 elif css_id: # Old blog explicit
241 targets.append(ANCHOR_FMT % css_id)
242 targets.append(ANCHOR_FMT % numeric_href)
243 else: # Old blog implicit
244 targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
245 targets.append(ANCHOR_FMT % numeric_href)
246
247 insertions.append((line_num, ''.join(targets)))
248
249 i += 1
250
251 # +1 to insert AFTER the <div>
252 toc_insert = (toc_pos + 1, ''.join(toc_lines))
253 insertions.insert(0, toc_insert) # The first insertion is TOC
254
255 return insertions
256
257
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """For the dense-toc style with columns, used by doc/ref

    Unlike the flat style (one <div class="toclevelN"> per heading), each h2
    and the h3's that follow it are wrapped in one group:

        <div id="dense-toc-title">In This Chapter</div>
        <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

        </div>

    Headings other than h2 / h3 get no TOC entry, but every heading still gets
    an <a name=""> target.

    Args:
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples.
      toc_pos: line number of the TOC <div>; the TOC goes on the line after.
      preserve_anchor_case: passed through to html_lib.PrettyHref().

    Returns:
      A list whose first item is the TOC insertion, followed by one anchor
      insertion per heading.

    Raises:
      AssertionError: if an h3 appears before the first h2.
    """
    # Two level tree: [(h2 html, h2 href, [(h3 html, h3 href), ...]), ...]
    h2_groups = []
    open_group = None
    anchors = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty name.
        toc_href = css_id or pretty_href
        label_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (label_html, toc_href, [])
            h2_groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((label_html, toc_href))

        # The target <a name=""> always uses the pretty name
        anchors.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(h2_groups))
    log('')

    pieces = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, h3_items in h2_groups:
        pieces.append('<div class="dense-toc-group">\n')
        pieces.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        pieces.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in h3_items)
        pieces.append('</div>\n')
    pieces.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(pieces))
    log('')

    # +1 to insert AFTER the <div>.  The TOC is always the first insertion.
    return [(toc_pos + 1, ''.join(pieces))] + anchors
354
355
356def _ApplyInsertions(lines, insertions, out_file):
357 # type: (List[str], List[Tuple[int, str]], IO[str]) -> None
358 assert insertions, "Should be at least one insertion"
359 j = 0
360 n = len(insertions)
361
362 for i, line in enumerate(lines):
363 current_line = i + 1 # 1-based
364
365 if j < n:
366 line_num, s = insertions[j]
367 if current_line == line_num:
368 out_file.write(s)
369 j += 1
370
371 out_file.write(line)
372
373
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Convert Markdown from in_file to enhanced HTML on out_file.

    Steps: run cmark; unless use_fastlex is off, post-process the HTML
    (optional code block extraction, comment removal, <pstrip> hack, ul-table
    replacement, link expansion, code highlighting); then insert a table of
    contents if the HTML has a <div id="toc"> or <div id="dense-toc">.

    Args:
      opts: parsed flags; reads code_block_output, toc_tags, and (via
        _MakeTocInsertions) toc_pretty_href.
      meta: document metadata; reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: Markdown source.
      out_file: where the HTML is written.
      use_fastlex: False skips the oils_doc / ul_table processing.
      debug_out: passed to oils_doc.HighlightCode(); a fresh list is used
        when None.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = cmark_bin(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    code_path)
                oils_doc.ExtractCode(html, code_f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        for marker in ('<p><pstrip>', '</pstrip></p>'):
            html = html.replace(marker, '')

        try:
            html = ul_table.ReplaceTables(html)
        except htm8.ParseError:
            # Say which file it was, then let the error propagate
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    toc_line = extractor.toc_begin_line
    dense_toc_line = extractor.dense_toc_begin_line

    if toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        toc_line, preserve_anchor_case)
    elif dense_toc_line != -1:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             dense_toc_line,
                                             preserve_anchor_case)
    else:
        # No TOC marker of either kind: pass the HTML through unchanged
        out_file.write(html)
        return

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    # True: keep newlines
    _ApplyInsertions(html.splitlines(True), insertions, out_file)
459
460
def Options():
    # type: () -> Any
    """Build the optparse parser for the cmark.py command line.

    Returns:
      An optparse.OptionParser with --common-mark, --toc-pretty-href,
      --toc-tag (repeatable), --disable-fastlex and --code-block-output.
    """
    parser = optparse.OptionParser('cmark.py [options]')

    # Registered in this order, which is also the order shown by --help.
    option_specs = [
        ('--common-mark', {
            'action': 'store_true',
            'default': False,
            'help': 'Only do CommonMark conversion',
        }),
        ('--toc-pretty-href', {
            'action': 'store_true',
            'default': False,
            'help':
            'Generate textual hrefs #like-this rather than like #toc10',
        }),
        ('--toc-tag', {
            'dest': 'toc_tags',
            'action': 'append',
            'default': [],
            'help': 'h tags to include in the TOC, e.g. h2 h3',
        }),
        ('--disable-fastlex', {
            'dest': 'disable_fastlex',
            'action': 'store_true',
            'default': False,
            'help': 'Hack for old blog posts',
        }),
        ('--code-block-output', {
            'dest': 'code_block_output',
            'default': None,
            'help': 'Extract and print code blocks to this file',
        }),
    ]
    for flag, settings in option_specs:
        parser.add_option(flag, **settings)

    return parser
492
493
# Metadata every document starts with; main() copies this and then layers the
# JSON metadata file on top.  Width 40 by default.
DEFAULT_META = {
    'body_css_class': 'width40',
}
496
497
def main(argv):
    """Command line entry point.

    Three modes:
    - --common-mark: plain CommonMark conversion, stdin to stdout.
    - Two positional args (JSON metadata, content file): Oils docs, rendered
      to stdout between the doc header and footer.
    - Otherwise: filter from stdin to stdout, used for the blog and for
      benchmarks.  A JSON metadata file may be given as the first arg.

    Args:
      argv: full argument vector, including the program name at argv[0].
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(cmark_bin(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter usage for blog and for benchmarks.

        # Metadata is optional here
        try:
            with open(argv[1]) as f:
                meta.update(json.load(f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # Oils docs take 2 args: JSON and content HTML
    with open(argv[1]) as f:
        meta.update(json.load(f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)