OILS / doctools / ul_table.py View on Github | oils.pub

563 lines, 281 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4try:
5 from cStringIO import StringIO
6except ImportError:
7 from io import StringIO # type: ignore
8import re
9import sys
10
11from doctools.util import log
12from lazylex import html
13from typing import List
14from typing import Optional
15from typing import Tuple
16from typing import Union
17from typing import Any
18from typing import Dict
19
20
def RemoveComments(s):
    # type: (str) -> str
    """Remove <!-- comments --> from an HTML string.

    This is a required preprocessing step for ul-table.

    A comment whose text contains 'REPLACE' is left in place, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Args:
      s: the HTML text

    Returns:
      The text written to the output buffer, with the other comments
      skipped.
    """
    f = StringIO()
    out = html.Output(s, f)

    # Start of the current token, i.e. where the previous token ended
    start_pos = 0
    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == html.Comment and 'REPLACE' not in s[start_pos:end_pos]:
            # Emit everything before the comment, then jump past it
            out.PrintUntil(start_pos)
            out.SkipTo(end_pos)
        start_pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
45
46
# Zero or more whitespace characters.  Because of the '*', this pattern also
# matches the empty string, so match() on it never returns None.
_WHITESPACE_RE = re.compile(r'\s*')
48
49
class UlTableParser(object):
    """Parser for tables that are written as nested bulleted lists.

    Tokens come from an html.Lexer, and an html.TagLexer is used to inspect
    the name and attributes of the current tag.  FindUlTable() moves to the
    next table, and ParseTable() parses it.
    """

    def __init__(self, lexer, tag_lexer):
        # type: (html.Lexer, html.TagLexer) -> None
        """
        Args:
          lexer: yields the tokens of the document
          tag_lexer: must be over the same string as the lexer, because token
                     positions from the lexer are used to index tag_lexer.s
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # The current token, and the span it occupies in self.lexer.s
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        """Return the source text of the current token."""
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        Args:
          comment_ok: if False, an HTML comment token is an error

        Raises:
          html.ParseError: on a comment token, unless comment_ok is set
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still leave
        # some REPLACE comments
        if not comment_ok and self.tok_id == html.Comment:
            raise html.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it must match at the START of
        the text, but doesn't have to cover all of it.

        Raises:
          html.ParseError: if the token isn't RawData, or doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s' %
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (int, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the token ID or the tag name is different
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError(
                'Expected token %s, got %s' %
                (html.TokenName(expected_id), html.TokenName(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace

        Advance past the current token only if it's RawData that consists
        ENTIRELY of whitespace.
        """
        if self.tok_id != html.RawData:
            return

        # _WHITESPACE_RE can match the empty string, so the mere existence of
        # a match proves nothing.  Require the match to cover the whole token,
        # so that real text is never silently skipped.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only RawData tokens are skipped between the <table> and the <ul>.
        Comments are allowed while searching.

        Returns:
          The START position of the <ul>, which is then the current token.
          -1 if the end of the stream is reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Move to the first token after <table> that isn't text
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != html.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos
        return -1

    def _ListItem(self):
        # type: () -> Tuple[Optional[List[Tuple[str, str]]], Optional[str]]
        """Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html)

          td_attrs is None when the cell has no <cell-attrs />.
          The pair is (None, None) when the current token, after optional
          whitespace, isn't a start tag, i.e. there are no more items.

        Raises:
          html.ParseError: if the start tag isn't <li>, or if the stream ends
          before the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \\s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> by counting the
                               #   nesting depth
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(html.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # The only other way out of this loop is the matching </li>
                raise html.ParseError(
                    'Expected closing </li>, got end of stream')

            if self.tok_id == html.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    # If there's more than one, the last one wins
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        # This </li> closes the item we started with
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        # The loop above already verified that this is </li>
        #self._Eat(html.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[List[Tuple[str, str]]], str]]
        """
        Parse <li>thead and the nested <ul>.  The enclosing <ul> tag has
        already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html), one for each header cell

        Raises:
          html.ParseError: if the tokens don't match the grammar

        Grammar:

        THEAD =
          [StartTag 'ul']        # consumed by ParseTable()
          [RawData \\s*]?
          [StartTag 'li']
          [RawData thead\\s+]
            [StartTag 'ul']      # Indented bullet that starts -
              LIST_ITEM*         # at least one is intended; not enforced
            [RawData \\s*]?
            [EndTag 'ul']
          [RawData \\s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute extensions
        to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[List[Tuple[str, str]]], Optional[List[Tuple[Optional[List[Tuple[str, str]]], str]]]]
        """
        Parse <li>tr and the nested <ul>.

        Returns:
          A pair (tr_attrs, cells)

          tr_attrs is None when the row has no <row-attrs />.
          cells is a list of (td_attrs, inner_html).
          The pair is (None, None) when the current token, after optional
          whitespace, isn't a start tag, i.e. there are no more rows.

        Raises:
          html.ParseError: if the tokens don't match the grammar

        Grammar:

        TR =
          [RawData \\s*]?
          [StartTag 'li']
          [RawData tr\\s*]
          ( [StartEndTag 'row-attrs'] [RawData \\s*]? )?
            [StartTag 'ul']      # Indented bullet that starts -
              LIST_ITEM*         # Defined above.  At least one is intended;
                                 # not enforced
            [RawData \\s*]?
            [EndTag 'ul']
          [RawData \\s*]?
          [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse the table whose outer <ul> is the current token.

        Returns:
          A dict with these keys:

          'thead':    list of (td_attrs, inner_html) for the header cells, or
                      None if the list doesn't start with a thead item
          'tr':       list of (tr_attrs, cells), where cells is a list of
                      (td_attrs, inner_html).  The inner_html is raw HTML that
                      you surround with <td>
          'ul_start': start position of the outer <ul>
          'ul_end':   position after the outer </ul>, and after the
                      whitespace that follows it, if any

        Raises:
          html.ParseError: if the tokens don't match the grammar

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
            THEAD?   # this returns the number of cells, so it's NOT context
                     # free
            TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
435
436
def MergeAttrs(
        thead_td_attrs,  # type: Optional[List[Tuple[str, str]]]
        row_td_attrs,  # type: Optional[List[Tuple[str, str]]]
):
    # type: (...) -> List[Tuple[str, str]]
    """Combine the attributes of a column with the attributes of one cell.

    The column's attributes come first, in their original order.  When the
    cell has an attribute with the same name, its value is appended to the
    column's value, after a space.  The cell's remaining attributes follow,
    in their original order.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header cell, or None
      row_td_attrs: (name, raw value) pairs from the row's cell, or None

    Returns:
      A new list of (name, raw value) pairs
    """
    cell_values = {} if row_td_attrs is None else dict(row_td_attrs)

    # Names that were already folded into a column attribute
    shared_names = set()

    result = []
    for name, value in thead_td_attrs or []:
        extra = cell_values.get(name)
        if extra is not None:
            value = value + ' %s' % extra
            shared_names.add(name)
        result.append((name, value))

    result.extend((name, value)
                  for name, value in row_td_attrs or []
                  if name not in shared_names)
    return result
466
467
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Each <ul> found by UlTableParser.FindUlTable() is parsed, and replaced
    with <thead> / <tr> / <th> / <td> markup.  The rest of the text is copied
    through.

    Args:
      s: the HTML text
      debug_out: not used by this function

    Returns:
      The text written to the output buffer
    """
    if debug_out is None:
        debug_out = []

    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    # FindUlTable() returns -1 when there are no more tables
    for ul_start in iter(parser.FindUlTable, -1):
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Header cells may carry attributes that every cell of the column
        # inherits: column index -> td_attrs
        col_attrs = {}

        header_cells = table['thead']
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (th_attrs, th_html) in enumerate(header_cells):
                if th_attrs:
                    col_attrs[col] = th_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(th_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        for tr_attrs, cells in table['tr']:
            # Opening <tr>, with the row's own attributes
            out.Print('<tr')
            for attr_name, attr_value in tr_attrs or []:
                out.Print(' ')
                out.Print(attr_name)
                # No escaping because it's raw.  It can't contain quotes.
                out.Print('="%s"' % attr_value)
            out.Print('>\n')

            for col, (cell_attrs, cell_html) in enumerate(cells):
                # Attributes inherited from the header, plus the cell's own
                merged = MergeAttrs(col_attrs.get(col), cell_attrs)

                out.Print(' <td')
                for attr_name, attr_value in merged:
                    out.Print(' ')
                    out.Print(attr_name)
                    # No escaping because it's raw.  It can't contain quotes.
                    out.Print('="%s"' % attr_value)
                out.Print('>')

                out.Print(cell_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()
556
557
if __name__ == '__main__':
    # Simple CLI filter: read HTML from stdin, strip the comments, expand the
    # ul-tables, and write the result to stdout
    without_comments = RemoveComments(sys.stdin.read())
    sys.stdout.write(ReplaceTables(without_comments))