#!/usr/bin/env python2
"""ul_table.py: Markdown Tables Without New Syntax."""

from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # type: ignore
import re
import sys

from doctools.util import log
from lazylex import html
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
from typing import Any
from typing import Dict
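
# Sketch of the input this module rewrites (illustrative example, based on the
# grammar in the docstrings below).  In Markdown, a ul-table is a bulleted
# list wrapped in literal <table> tags:
#
#     <table>
#
#     - thead
#       - Name
#       - Age
#     - tr
#       - Alice
#       - 42
#
#     </table>
#
# CommonMark turns the list into a <ul> nested inside the <table>, and
# ReplaceTables() below rewrites that <ul> into <thead>, <tr>, and <td> tags.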


def RemoveComments(s):
    # type: (str) -> str
    """Remove <!-- comments -->

    This is a required preprocessing step for ul-table.
    """
    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == h8_id.Comment:
            value = s[pos:end_pos]
            # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
            if 'REPLACE' not in value:
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
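
# Illustrative example: RemoveComments('<p>hi</p> <!-- note -->') returns
# '<p>hi</p> ', while comments containing REPLACE, like
# <!-- REPLACE_WITH_DATE -->, are left in place.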


_WHITESPACE_RE = re.compile(r'\s*')


class UlTableParser(object):

    def __init__(self, lexer, tag_lexer):
        # type: (html.Lexer, html.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand. That can still leave
        # some REPLACE comments.
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise html.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance
        """
        if self.tok_id != h8_id.RawData:
            raise html.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace
        """
        if (self.tok_id == h8_id.RawData and
                _WHITESPACE_RE.match(self.lexer.s, self.start_pos)):
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Return the START position of the <ul>
        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == h8_id.StartTag and
                    tag_lexer.TagName() == 'table'):
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == h8_id.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos
        return -1

    def _ListItem(self):
        # type: () -> Tuple[Optional[List[Tuple[str, str]]], Optional[str]]
        """Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html)

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*              # NOT context-free:
                            # - we MATCH <li> and </li> with a stack
                            # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        #self._Eat(h8_id.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html
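
    # Illustrative example of _ListItem() above: for the list item
    #   <li><cell-attrs class=foo /> hi there </li>
    # it returns roughly ([('class', 'foo')], ' hi there '), i.e. the raw
    # attribute pairs plus the inner HTML with the <cell-attrs /> span removed.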

    def _ParseTHead(self):
        # type: () -> Union[List[Tuple[List[Tuple[str, str]], str]], List[Tuple[Optional[List[Tuple[str, str]]], str]]]
        """
        Assume we're looking at the first <ul> tag. Now we want to find
        <li>thead and the nested <ul>

        Grammar:

        THEAD =
          [StartTag 'ul']
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s*]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM+
          [RawData \s*]?
          [EndTag 'ul']
          [RawData thead\s+]
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute extensions
        to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace. I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[None, Union[List[Tuple[List[Tuple[str, str]], str]], List[Tuple[None, str]], None]]
        """
        Assume we're looking at the first <ul> tag. Now we want to find
        <li>tr and the nested <ul>

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          [StartTag 'ul']   # Indented bullet that starts -
          ( [StartEndTag row-attrs] [RawData \s*] )?
          LIST_ITEM+        # Defined above
          [RawData \s*]?
          [EndTag 'ul']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Returns a structure like this
        { 'thead': [ 'col1', 'col2' ],  # TODO: columns can have CSS attributes
          'tr': [                       # raw HTML that you surround with <td>
             [ 'cell1 html', 'cell2 html' ],
             [ 'cell1 html', 'cell2 html' ],
          ]
        }

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD   # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
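
    # For a hypothetical 2-column table, ParseTable() returns roughly:
    #
    #   {'thead': [(None, 'Name'), (None, 'Age')],
    #    'tr': [(None, [(None, 'Alice'), (None, '42')])],
    #    'ul_start': 123, 'ul_end': 456}
    #
    # where each cell is a (cell_attrs, inner_html) pair, each row is a
    # (tr_attrs, cells) pair, and 123 / 456 stand in for byte offsets.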


def MergeAttrs(
        thead_td_attrs,  # type: Optional[List[Tuple[str, str]]]
        row_td_attrs,  # type: Optional[List[Tuple[str, str]]]
):
    # type: (...) -> List[Tuple[str, str]]
    merged_attrs = []

    if row_td_attrs is None:
        row_lookup = {}
    else:
        row_lookup = {n: v for n, v in row_td_attrs}

    done_for_row = set()

    if thead_td_attrs:
        for name, raw_value in thead_td_attrs:
            more_values = row_lookup.get(name)
            if more_values is not None:
                raw_value += ' %s' % more_values
                done_for_row.add(name)
            merged_attrs.append((name, raw_value))

    if row_td_attrs:
        for name, raw_value in row_td_attrs:
            if name in done_for_row:
                continue
            merged_attrs.append((name, raw_value))

    return merged_attrs
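
# Illustrative example of MergeAttrs(): if a thead cell declared
# [('class', 'num')] and the row cell declares [('class', 'hi'), ('align', 'left')],
# the merged result is [('class', 'num hi'), ('align', 'left')]: shared names
# have their values joined with a space, and row-only attributes are appended.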


def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted lists
    """
    if debug_out is None:
        debug_out = []

    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)

    p = UlTableParser(lexer, tag_lexer)

    while True:
        ul_start = p.FindUlTable()
        if ul_start == -1:
            break

        #log('UL START %d', ul_start)
        out.PrintUntil(ul_start)

        table = p.ParseTable()
        #log('UL END %d', ul_end)

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header
        thead = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if thead:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            i = 0
            for td_attrs, raw_html in thead:
                if td_attrs:
                    col_attrs[i] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')
                i += 1

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                for name, raw_value in tr_attrs:
                    out.Print(' ')
                    out.Print(name)
                    # No escaping because it's raw. It can't contain quotes.
                    out.Print('="%s"' % raw_value)
            out.Print('>\n')

            # Print cells
            i = 0
            for row_td_attrs, raw_html in row:
                # Inherited from header
                thead_td_attrs = col_attrs.get(i)
                merged_attrs = MergeAttrs(thead_td_attrs, row_td_attrs)

                out.Print(' <td')
                for name, raw_value in merged_attrs:
                    out.Print(' ')
                    out.Print(name)
                    # No escaping because it's raw. It can't contain quotes.
                    out.Print('="%s"' % raw_value)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
                i += 1
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()
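
# Roughly, for the example table sketched near the top of this file, the inner
# <ul> is replaced with output like:
#
#   <thead>
#   <tr>
#    <th>Name</th>
#    <th>Age</th>
#   </tr>
#   </thead>
#   <tr>
#    <td>Alice</td>
#    <td>42</td>
#   </tr>
#
# while the surrounding <table> and </table> tags from the source document are
# left untouched.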


if __name__ == '__main__':
    # Simple CLI filter
    h = sys.stdin.read()
    h = RemoveComments(h)
    h = ReplaceTables(h)
    sys.stdout.write(h)