OILS / doctools / ul_table.py View on Github | oils.pub

566 lines, 282 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6try:
7 from cStringIO import StringIO
8except ImportError:
9 from io import StringIO # type: ignore
10import re
11import sys
12
13from doctools.util import log
14from lazylex import html
15from typing import List
16from typing import Optional
17from typing import Tuple
18from typing import Any
19from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Remove <!-- comments --> from an HTML string.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because some docs use
    them as placeholders (e.g. <!-- REPLACE_WITH_DATE -->).

    Args:
      s: The HTML text.

    Returns:
      The text written through html.Output, with the other comments skipped.
    """
    f = StringIO()
    out = html.Output(s, f)

    # Start of the current token, i.e. the end position of the previous one.
    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == h8_id.Comment:
            value = s[pos:end_pos]
            # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
            if 'REPLACE' not in value:
                # Emit everything before the comment, then jump past it.
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
46
47
# Matches a run of zero or more whitespace characters.  Because the pattern
# can match the empty string, .match() succeeds at every position; a caller
# that wants to know whether a span is ALL whitespace must also check where
# the match ends.
_WHITESPACE_RE = re.compile(r'\s*')

# Tag attributes as a list of (name, raw value) pairs.  In this file the
# values come from TagLexer.AllAttrsRaw().
TdAttrs = List[Tuple[str, str]]
51
52
class UlTableParser(object):
    """Parser that finds <table> <ul> ... </ul> and reads it as a table.

    It pulls tokens from an html.Lexer one at a time, and uses an
    html.TagLexer to look inside the tag that spans the current token.

    The current token is described by three fields:
      tok_id: the h8_id of the token
      start_pos, end_pos: its span in the input string
    """

    def __init__(self, lexer, tag_lexer):
        # type: (html.Lexer, html.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        """Return the text of the current token."""
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        Args:
          comment_ok: If False, an HTML comment token is an error.

        Raises:
          html.ParseError: on a comment token when comment_ok is False.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise html.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it only has to match a
        prefix of the token text.

        Raises:
          html.ParseError: if the token isn't RawData, or doesn't match.
        """
        if self.tok_id != h8_id.RawData:
            raise html.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the token kind or the tag name differs.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace

        Advance past the current token only if it is RawData that consists
        entirely of whitespace.  Any other token is left in place.
        """
        if self.tok_id != h8_id.RawData:
            return

        # \s* also matches the empty string, so a successful match proves
        # nothing by itself.  Limit the match to the token and require that
        # it reaches the end of the token; otherwise real text would be
        # skipped as if it were whitespace.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.
        Comments are tolerated while scanning.

        Return the START position of the <ul>, or -1 if the end of the stream
        is reached first.
        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == h8_id.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip the text between <table> and the next non-text token
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == h8_id.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html)
          It is (None, None) when the next token is not a start tag, i.e.
          there are no more list items.

        Raises:
          html.ParseError: if the start tag isn't <li>, or the input ends
            before the matching </li>.

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*               # NOT context-free:
                             # - we MATCH <li> and </li> with a balance
                             #   counter
                             # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check, an unclosed <li> would spin here
                # forever, assuming the lexer keeps returning EndOfStream
                # once the input is exhausted.
                raise html.ParseError(
                    'Expected </li>, got %s' % h8_id_str(self.tok_id))

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    # If there are several, the last one wins
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        # This </li> closes the <li> we started with
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        # Skip the closing </li>; the loop above already checked its name
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""
        Assume the outer <ul> has already been consumed.  Now we want to
        parse <li>thead and the nested <ul>

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Raises:
          html.ParseError: if the tokens don't follow the grammar.

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s+]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # at least one is expected, but not enforced
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""
        Assume we're inside the outer <ul>.  Now we want to parse
        <li>tr and the nested <ul>

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.
          It is (None, None) when the next token is not a start tag, i.e.
          there are no more rows.

        Raises:
          html.ParseError: if the tokens don't follow the grammar.

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # Defined above
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse the table, assuming the current token is the <ul> that
        FindUlTable() found.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': 10,  # position of the <ul>
            'ul_end': 99,    # position after the </ul> and any whitespace
          }

        'thead' is None when the list doesn't start with a thead item.
        Any of the attrs may be None.

        Raises:
          html.ParseError: if the tokens don't follow the grammar.

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD?  # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
438
439
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine the attributes a column inherits with a cell's own attributes.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header cell, or None
      row_td_attrs: (name, raw value) pairs from the row's cell, or None

    Returns:
      A new list of (name, raw value) pairs.  Header attributes come first, in
      their original order; when the cell has an attribute with the same name,
      its value is appended after a space.  The cell's remaining attributes
      follow, in their original order.
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # name -> value, for finding the cell's value of a header attribute
    cell_values = dict(cell_pairs)

    result = []
    consumed = set()  # names already folded into a header attribute
    for attr_name, header_value in header_pairs:
        extra = cell_values.get(attr_name)
        if extra is None:
            result.append((attr_name, header_value))
        else:
            consumed.add(attr_name)
            result.append((attr_name, header_value + ' %s' % extra))

    result.extend((attr_name, cell_value)
                  for attr_name, cell_value in cell_pairs
                  if attr_name not in consumed)
    return result
469
470
def _PrintAttrs(out, attrs):
    # type: (Any, TdAttrs) -> None
    """Print each (name, raw value) pair as:  name="value"  with a leading space."""
    for name, raw_value in attrs:
        out.Print(' ')
        out.Print(name)
        # The value is printed raw, without escaping; this relies on it not
        # containing quotes.
        out.Print('="%s"' % raw_value)


def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Every <ul> that directly follows a <table> start tag is parsed and
    replaced with <thead> / <tr> / <th> / <td> markup.  Everything else in
    `s` is copied through html.Output.

    Args:
      s: The HTML text.
      debug_out: Not used by this function.

    Returns:
      The converted text.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    # FindUlTable() returns -1 when there are no more tables
    for ul_start in iter(parser.FindUlTable, -1):
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Drop the original list, up to and including the closing </ul> of the
        # table; what comes after it is written later.
        out.SkipTo(table['ul_end'])

        # Header row.  Attributes on a header cell aren't printed on the <th>;
        # they are remembered by column index and merged into each <td>.
        header_cells = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(out, tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # The cell's own attrs, plus the ones inherited from the header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(out, merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
559
560
if __name__ == '__main__':
    # Simple CLI filter: read HTML from stdin, strip comments, convert the
    # ul-tables, and write the result to stdout.
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))