OILS / doctools / ul_table.py View on Github | oils.pub

568 lines, 283 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6try:
7 from cStringIO import StringIO
8except ImportError:
9 from io import StringIO # type: ignore
10import re
11import sys
12
13from doctools.util import log
14from data_lang import htm8
15from typing import List
16from typing import Optional
17from typing import Tuple
18from typing import Any
19from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Return `s` with <!-- comments --> stripped out.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, e.g.
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Raises:
      htm8.LexError: if the lexer reports an Invalid token.
    """
    buf = StringIO()
    out = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # tok_start / tok_end delimit the token most recently read.
    tok_start = 0
    tok_id, tok_end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError('RemoveComments() got invalid token', s,
                                tok_start)

        is_droppable = (tok_id == h8_id.Comment and
                        'REPLACE' not in s[tok_start:tok_end])
        if is_droppable:
            # Flush everything before the comment, then jump over it.
            out.PrintUntil(tok_start)
            out.SkipTo(tok_end)

        tok_start = tok_end
        tok_id, tok_end = lexer.Read()

    out.PrintTheRest()
    return buf.getvalue()
51
52
# Matches a (possibly empty) run of whitespace.  Because `\s*` also matches
# the empty string, a bare .match() never returns None; a caller that needs
# "this span is ALL whitespace" must compare the match end against the end of
# the span.
_WHITESPACE_RE = re.compile(r'\s*')

# Attributes as a list of (name, raw_value) pairs, in source order.
TdAttrs = List[Tuple[str, str]]
56
57
class UlTableParser(object):
    """Recursive-descent parser for a ul-table: a <ul> right after <table>.

    The parser pulls tokens from an htm8.Lexer one at a time.  The "current
    token" is described by self.tok_id, self.start_pos, self.end_pos and (for
    tags) self.tag_name.

    Typical use, as in ReplaceTables():

        p = UlTableParser(lexer)
        while True:
            ul_start = p.FindUlTable()
            if ul_start == -1:
                break
            table = p.ParseTable()
    """

    def __init__(self, lexer):
        # type: (htm8.Lexer) -> None
        self.lexer = lexer
        self.attr_lexer = htm8.AttrLexer(lexer.s)

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0
        # The tag name is only populated when we are "looking at"
        # h8_id.{StartTag,EndTag,StartEndTag}
        self.tag_name = None  # type: Optional[str]

    def _CurrentString(self):
        # type: () -> str
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        self.tag_name is set for StartTag / EndTag / StartEndTag, and reset
        to None for every other token.

        Raises:
          htm8.ParseError: if the new token is a Comment and comment_ok is
            false.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if self.tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
            self.tag_name = self.lexer.CanonicalTagName()
        else:
            self.tag_name = None

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

        if 0:  # flip to 1 to trace tokens while debugging
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is anchored at the start of the token only (re.match), so
        trailing text after the match is accepted.

        Raises:
          htm8.ParseError: if the current token isn't RawData, or its text
            doesn't match.
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the current token has a different ID or tag
            name.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        if expected_tag != self.tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, self.tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace

        Advance past the current token only if it is RawData consisting
        ENTIRELY of whitespace.  Any other token is left in place for the
        caller to examine.
        """
        if self.tok_id != h8_id.RawData:
            return

        # _WHITESPACE_RE is r'\s*', which matches the empty string, so the
        # match object alone proves nothing.  Bound the match to the token and
        # require that it reaches the token's end.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Scans forward for a <table> start tag that is followed, after
        optional text, by a <ul> start tag.  Comments are tolerated while
        scanning.

        Returns:
          The START position of the <ul>, with the parser left looking at
          that <ul>; or -1 if the end of the stream is reached first.
        """
        self._Next(comment_ok=True)
        while self.tok_id != h8_id.EndOfStream:
            looking_at_table = (self.tok_id == h8_id.StartTag and
                                self.tag_name == 'table')
            if not looking_at_table:
                self._Next(comment_ok=True)
                continue

            # Skip the text between <table> and whatever comes next.
            self._Next(comment_ok=True)
            while self.tok_id == h8_id.RawData:
                self._Next(comment_ok=True)

            if self.tok_id == h8_id.StartTag and self.tag_name == 'ul':
                return self.start_pos
            # Otherwise loop WITHOUT advancing: the token that ended the text
            # run may itself be another <table>, or EndOfStream, and we must
            # not read past the end of the stream.

        return -1

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        """Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current
          token (after optional whitespace) is not a start tag, i.e. there
          are no more items.

        Raises:
          htm8.ParseError: if the start tag isn't <li>, or the stream ends
            before the matching </li>.

        Grammar:

          LIST_ITEM =
            [RawData \\s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos  # first position after <li>

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check an unterminated <li> would keep reading
                # past the end of the stream.
                raise htm8.ParseError('Unexpected end of stream inside <li>')

            if self.tok_id == h8_id.StartEndTag:
                self.attr_lexer.Init(self.tok_id, self.lexer.TagNamePos(),
                                     self.end_pos)
                # TODO: remove td-attrs backward compat
                if self.tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = htm8.AllAttrsRaw(self.attr_lexer)
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                if self.tag_name == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                if self.tag_name == 'li':
                    balance -= 1
                    if balance < 0:
                        break  # this </li> closes the item we opened
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.lexer.s
        if td_attrs_span is not None:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # We are looking at the closing </li> (the loop only breaks there), so
        # just step over it.
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        """
        Parse the <li>thead item and its nested <ul> of header cells.

        The outer [StartTag 'ul'] has already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Grammar:

          THEAD =
            [RawData \\s*]?
            [StartTag 'li']
            [RawData thead\\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*
            [RawData \\s*]?
            [EndTag 'ul']
            [RawData \\s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead, e.g. something like

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        """
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs and tr_attrs comes from an optional
          <row-attrs />.  Returns (None, None) if the current token (after
          optional whitespace) is not a start tag, e.g. it could be the
          </ul> that ends the table.

        Raises:
          htm8.ParseError: on any deviation from the grammar, including a
            self-closing tag other than row-attrs after 'tr'.

        Grammar:

          TR =
            [RawData \\s*]?
            [StartTag 'li']
            [RawData tr\\s*]
            ( [StartEndTag row-attrs] [RawData \\s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above
            [RawData \\s*]?
            [EndTag 'ul']
            [RawData \\s*]?
            [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.attr_lexer.Init(self.tok_id, self.lexer.TagNamePos(),
                                 self.end_pos)
            if self.tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' %
                                      self.tag_name)
            tr_attrs = htm8.AllAttrsRaw(self.attr_lexer)
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse a whole ul-table.  The parser must be looking at the <ul>
        start tag, as it is after FindUlTable() returns a position.

        Returns a structure like this

          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None if there is no thead item
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': <start position of the <ul> tag>,
            'ul_end': <start position of the first token after </ul> and
                       optional whitespace>,
          }

        Raises:
          htm8.ParseError: if the input doesn't follow the grammar.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
            [RawData \\s*]?
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise htm8.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:  # flip to 1 to dump the parsed table while debugging
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
441
442
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's header attributes with one cell's own attributes.

    Header attributes come first, in header order.  When the cell supplies
    the same attribute name, its value is appended to the header value,
    separated by a space.  Cell attributes that were not merged this way
    follow, in cell order.

    Args:
      thead_td_attrs: (name, raw_value) pairs inherited from the header
        cell, or None
      row_td_attrs: (name, raw_value) pairs on the cell itself, or None

    Returns:
      A new list of (name, raw_value) pairs.
    """
    cell_pairs = [] if row_td_attrs is None else row_td_attrs
    cell_value_of = dict(cell_pairs)  # a later duplicate name wins

    result = []
    absorbed = set()  # cell attribute names folded into a header attribute

    for name, raw_value in thead_td_attrs or []:
        extra = cell_value_of.get(name)
        if extra is None:
            result.append((name, raw_value))
            continue
        absorbed.add(name)
        result.append((name, raw_value + ' %s' % extra))

    result.extend(
        [(name, raw_value)
         for name, raw_value in cell_pairs
         if name not in absorbed])
    return result
472
473
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is parsed and replaced
    by <thead> / <tr> / <td> markup; all other text is copied through.

    Args:
      s: HTML text; RemoveComments() is the expected preprocessing step
      debug_out: accepted for callers that pass it; not used here

    Returns:
      The rewritten HTML.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = htm8.Output(s, buf)
    parser = UlTableParser(htm8.Lexer(s))

    def emit_attrs(attrs):
        # type: (Optional[TdAttrs]) -> None
        """Print each pair as: space, name, ="raw_value"."""
        for attr_name, attr_value in attrs or []:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    ul_start = parser.FindUlTable()
    while ul_start != -1:
        # Copy everything before the <ul> verbatim.
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Header: remember each column's attributes so body cells inherit
        # them.
        header_cells = table['thead']
        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            emit_attrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Attributes inherited from the header, plus the cell's own
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                emit_attrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

        ul_start = parser.FindUlTable()

    out.PrintTheRest()

    return buf.getvalue()
561
562
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, strip comments, expand
    # ul-tables, and write the result to stdout.
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))