OILS / doctools / ul_table.py View on Github | oils.pub

571 lines, 287 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6try:
7 from cStringIO import StringIO
8except ImportError:
9 from io import StringIO # type: ignore
10import re
11import sys
12
13from doctools.util import log
14from data_lang import htm8
15from typing import List
16from typing import Optional
17from typing import Tuple
18from typing import Any
19from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Return s with <!-- comments --> stripped out.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Raises:
      htm8.LexError: if the lexer reports an Invalid token.
    """
    buf = StringIO()
    writer = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # tok_start is where the token we are looking at begins; the lexer only
    # hands back the end position of each token.
    tok_start = 0
    tok_id, tok_end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        if tok_id == h8_id.Comment and 'REPLACE' not in s[tok_start:tok_end]:
            # Copy everything up to the comment, then jump over it.
            writer.PrintUntil(tok_start)
            writer.SkipTo(tok_end)

        tok_start = tok_end
        tok_id, tok_end = lexer.Read()

    writer.PrintTheRest()
    return buf.getvalue()
51
52
# Attributes of one table cell, as (name, raw value) pairs in source order.
TdAttrs = List[Tuple[str, str]]

# NOTE: this pattern also matches the empty string, so a successful match by
# itself does not prove that any whitespace was present.
_WHITESPACE_RE = re.compile(r'\s*')
56
57
class UlTableParser(object):
    """Parser for tables written as nested <ul> lists inside <table>.

    Tokens are pulled one at a time from an htm8.Lexer; the current token is
    described by (tok_id, start_pos, end_pos).  The htm8.TagLexer is pointed
    at the current token whenever a tag name or its attributes are needed.
    Positions from the lexer are used to slice tag_lexer.s, so both lexers
    must be built over the same string.
    """

    def __init__(self, lexer, tag_lexer):
        # type: (htm8.Lexer, htm8.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # Current token.  Nothing has been read yet.
        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """Advance and set self.tok_id, self.start_pos, self.end_pos.

        Args:
          comment_ok: when False, landing on an HTML comment is an error.

        Raises:
          htm8.ParseError: on a Comment token when comment_ok is False.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments.
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

        # Flip to 1 to trace every token.
        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        The regex is applied with re.match(), i.e. it only has to match at
        the START of the token text.

        Raises:
          htm8.ParseError: if the current token isn't RawData, or its text
            doesn't match.
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """Assert that we got a start or end tag with the given name; advance.

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the token kind or the tag name differs.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.GetTagName()
        if expected_tag != tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """Skip the current token if it is RawData consisting only of whitespace.

        Any other token, including RawData with visible text, is left alone.
        """
        if self.tok_id != h8_id.RawData:
            return

        # _WHITESPACE_RE is r'\s*', which matches the empty string too, so
        # "did it match?" is always true.  Bound the match to the token and
        # require it to reach the token's end; otherwise real text would be
        # swallowed (and later dropped from the output).
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only text (RawData) may appear between the <table> start tag and the
        <ul> start tag.  On success the current token is that <ul>.

        Returns:
          The START position of the <ul>, or -1 if the input ends first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == h8_id.StartTag and
                    tag_lexer.GetTagName() == 'table'):
                # Skip the text between <table> and whatever comes next.
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                # Defensive: don't ask the lexer for more tokens once it has
                # reported the end of input.
                if self.tok_id == h8_id.EndOfStream:
                    return -1

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == h8_id.StartTag and
                        tag_lexer.GetTagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a start tag.

        Raises:
          htm8.ParseError: if the start tag is not <li>, or the input ends
            before the matching </li>.

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*               # NOT context-free:
                             # - we MATCH <li> and </li> with a counter
                             # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos  # first position after <li>

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check an unclosed <li> would never leave the
                # loop.
                raise htm8.ParseError('Unexpected end of input inside <li>')

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.GetTagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    # If there are several, the last one wins.
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()

            elif self.tok_id == h8_id.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.GetTagName() == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.GetTagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # The loop above only exits on </li>, so step over it directly.
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        The enclosing [StartTag 'ul'] has already been consumed by
        ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s+]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # zero cells are not rejected
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          (tr_attrs, cells), where cells is a list of (td_attrs, inner_html)
          pairs.  Returns (None, None) when the current token (after optional
          whitespace) is not a start tag, e.g. at the </ul> ending the table.

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # Defined above; zero cells are not rejected
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.GetTagName()
            if tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """Parse a whole ul-table; the current token must be its <ul>.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), ... ],  # or None if no thead
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': position of the opening <ul>,
            'ul_end': position just past the closing </ul> and any
                      whitespace-only text that follows it,
          }

        Raises:
          htm8.ParseError: if the list doesn't follow the grammar.

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD?  # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise htm8.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        # Everything in [ul_start, ul_end) gets replaced by the caller.
        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
443
444
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's header attributes with one cell's own attributes.

    Header attributes come first, in their original order.  When the cell
    has an attribute of the same name, its value is appended to the header
    value after a single space.  Cell attributes that were not combined this
    way follow, in their original order.

    Either argument may be None, which is treated like an empty list.
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # name -> raw value for this cell
    cell_values = dict(cell_pairs)

    result = []
    combined = set()  # names already emitted together with a header value
    for attr_name, header_value in header_pairs:
        extra = cell_values.get(attr_name)
        if extra is None:
            result.append((attr_name, header_value))
        else:
            combined.add(attr_name)
            result.append((attr_name, header_value + ' %s' % extra))

    result.extend(
        (attr_name, cell_value)
        for attr_name, cell_value in cell_pairs
        if attr_name not in combined)

    return result
474
475
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Every <ul> that directly follows a <table> start tag (only text in
    between) is parsed with UlTableParser and replaced by <thead>/<tr>/<td>
    markup.  All other input is copied through unchanged.

    Args:
      s: the HTML to transform
      debug_out: accepted for callers that pass it; not used in this function

    Returns:
      The transformed HTML.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = htm8.Output(s, buf)

    tag_lexer = htm8.TagLexer(s)
    lexer = htm8.Lexer(s)

    parser = UlTableParser(lexer, tag_lexer)

    def EmitAttrs(pairs):
        # type: (TdAttrs) -> None
        """Print each pair as: space, name, ="value"."""
        for attr_name, attr_raw in pairs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_raw)

    ul_start = parser.FindUlTable()
    while ul_start != -1:
        # Copy everything before the <ul> verbatim.
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Header row.  Remember per-column attributes so that the cells
        # below can inherit them.
        header_cells = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                EmitAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Attributes inherited from the header, plus the cell's own
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                EmitAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

        ul_start = parser.FindUlTable()

    out.PrintTheRest()

    return buf.getvalue()
564
565
if __name__ == '__main__':
    # Simple CLI filter: HTML on stdin, HTML with ul-tables expanded on
    # stdout.  Comments have to be removed before tables are replaced.
    html_in = sys.stdin.read()
    sys.stdout.write(ReplaceTables(RemoveComments(html_in)))