OILS / doctools / ul_table.py View on Github | oils.pub

567 lines, 282 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6try:
7 from cStringIO import StringIO
8except ImportError:
9 from io import StringIO # type: ignore
10import re
11import sys
12
13from doctools.util import log
14from data_lang import htm8
15from typing import List
16from typing import Optional
17from typing import Tuple
18from typing import Any
19from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Return `s` with <!-- HTML comments --> stripped out.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because files like
    doc/release-index.md use <!-- REPLACE_WITH_DATE --> as placeholders.

    Raises:
      htm8.LexError: if the lexer reports an Invalid token.  The error is
        given the start position of that token.
    """
    buf = StringIO()
    out = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # tok_start is where the current token begins, i.e. where the previous
    # token ended.
    tok_start = 0
    tok_id, tok_end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError('RemoveComments() got invalid token', s,
                                tok_start)

        if (tok_id == h8_id.Comment and
                'REPLACE' not in s[tok_start:tok_end]):
            # Emit what precedes the comment, then jump over the comment
            out.PrintUntil(tok_start)
            out.SkipTo(tok_end)

        tok_start = tok_end
        tok_id, tok_end = lexer.Read()

    out.PrintTheRest()
    return buf.getvalue()
51
52
# Matches a run of whitespace, INCLUDING the empty run.  Because r'\s*' can
# match zero characters, a successful match() alone does not show that the
# text is whitespace; callers must also look at where the match ends.
_WHITESPACE_RE = re.compile(r'\s*')

# Attributes as a list of (name, raw value) pairs.  This is the type used for
# the results of TagLexer.AllAttrsRaw() in this file.
TdAttrs = List[Tuple[str, str]]
56
57
class UlTableParser(object):
    """Parse a "ul-table": nested <ul> lists, inside <table>, describing rows.

    The parser keeps a one-token window over the input of `lexer`:
    (tok_id, start_pos, end_pos, tag_name).  The entry points are
    FindUlTable(), which locates the next <table> ... <ul>, and ParseTable(),
    which parses the <ul> that FindUlTable() stopped at.
    """

    def __init__(self, lexer, tag_lexer):
        # type: (htm8.Lexer, htm8.TagLexer) -> None
        """
        Args:
          lexer: produces the tokens; its input string is read as lexer.s
          tag_lexer: used to read the attributes of <cell-attrs /> and
            <row-attrs /> tags, via Reset() and AllAttrsRaw()
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0
        # The tag name is only populated when we are "looking at"
        # h8_id.{StartTag,EndTag,StartEndTag}
        self.tag_name = None  # type: Optional[str]

    def _CurrentString(self):
        # type: () -> str
        """Return the text of the token we're looking at."""
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        self.tag_name is set for StartTag, EndTag and StartEndTag tokens, and
        reset to None for every other token.

        Args:
          comment_ok: if False, a Comment token is an error

        Raises:
          htm8.ParseError: on a Comment token, unless comment_ok
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if self.tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
            self.tag_name = self.lexer.CanonicalTagName()
        else:
            self.tag_name = None

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it's anchored at the start
        of the token but not at the end.

        Raises:
          htm8.ParseError: if the token isn't RawData, or doesn't match
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the token kind or the tag name differs
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        if expected_tag != self.tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, self.tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace: skip a RawData token that is ALL whitespace.

        Any other token, including RawData with non-whitespace text, is left
        in place for the caller to deal with.
        """
        if self.tok_id != h8_id.RawData:
            return

        # _WHITESPACE_RE can match the empty string, so a successful match
        # proves nothing by itself.  Bound the match by the token's extent
        # and require it to reach the end of the token.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Comments are allowed while searching.  Only RawData may appear
        between the <table> start tag and the <ul>.

        Returns:
          The START position of the <ul>, or -1 if the input is exhausted.
          On success, the parser is left looking at the <ul> start tag.

        Similar algorithm as html.ReadUntilStartTag()
        """
        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            if self.tok_id == h8_id.StartTag and self.tag_name == 'table':
                # Skip over text between <table> and the next tag
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                if self.tok_id == h8_id.StartTag and self.tag_name == 'ul':
                    return self.start_pos

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if we're not looking
          at a start tag (after optional whitespace).

        Raises:
          htm8.ParseError: if the start tag isn't <li>, or the input ends
            before the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        td_attrs = None  # type: Optional[TdAttrs]
        td_attrs_span = None  # type: Optional[Tuple[int, int]]

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check, an unclosed <li> would make us call
                # _Next() forever at the end of the input.
                raise htm8.ParseError('Unexpected end of input inside <li>')

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                # TODO: remove td-attrs backward compat
                if self.tag_name in ('td-attrs', 'cell-attrs'):
                    # If there's more than one such tag, the last one wins
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                if self.tag_name == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                if self.tag_name == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # We broke out of the loop while looking at the matching </li>, so
        # this is the same as self._Eat(h8_id.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""
        Assume the outer <ul> start tag was just consumed.  Now we want to
        find <li>thead and the nested <ul>

        Returns:
          A list of header cells, each a pair (td_attrs, inner_html)

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
              [StartTag 'ul']   # Indented bullet that starts -
                LIST_ITEM+
                [RawData \s*]?
              [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""
        Parse one <li>tr row and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.  Or (None, None) if we're not looking
          at a start tag (after optional whitespace), e.g. at the closing
          </ul> of the table.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
              [StartTag 'ul']   # Indented bullet that starts -
                LIST_ITEM+      # Defined above
                [RawData \s*]?
              [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None  # type: Optional[TdAttrs]
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            if self.tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' %
                                      self.tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse the <ul> we're looking at, e.g. after FindUlTable() succeeded.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None if there's no thead
            'tr': [  # each cell has raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': 42,  # start position of the <ul>
            'ul_end': 99,    # position after </ul> and optional whitespace
          }

        Raises:
          htm8.ParseError: if the input doesn't match the grammar

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
              THEAD?  # this returns the number of cells, so it's NOT context
                      # free
              TR*
            [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        thead = None  # type: Optional[List[Tuple[Optional[TdAttrs], str]]]
        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise htm8.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
439
440
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's attributes from thead with a cell's own attributes.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header, or None
      row_td_attrs: (name, raw value) pairs from the cell, or None

    Returns:
      A new list of pairs.  The header attributes come first, in order.  When
      the cell has an attribute with the same name, its value is appended to
      the header value after a space.  The remaining cell attributes follow,
      in order.
    """
    if row_td_attrs is None:
        from_row = {}  # type: Dict[str, str]
    else:
        from_row = dict(row_td_attrs)

    combined = []  # type: TdAttrs
    consumed = set()  # names already folded into a header attribute

    for name, raw_value in thead_td_attrs or []:
        extra = from_row.get(name)
        if extra is not None:
            raw_value = '%s %s' % (raw_value, extra)
            consumed.add(name)
        combined.append((name, raw_value))

    combined.extend([(name, raw_value)
                     for name, raw_value in row_td_attrs or []
                     if name not in consumed])
    return combined
470
471
def _PrintRawAttrs(out, attrs):
    # type: (htm8.Output, Optional[TdAttrs]) -> None
    """Print each (name, raw value) pair as: space, name, ="value".

    Nothing is printed when attrs is None or empty.
    """
    for name, raw_value in attrs or []:
        out.Print(' ')
        out.Print(name)
        # No escaping because it's raw.  It can't contain quotes.
        out.Print('="%s"' % raw_value)


def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Each <ul> that follows a <table> start tag is replaced by <thead> and
    <tr> elements.  The rest of the input is copied through.

    Args:
      s: HTML.  Comments should have been removed with RemoveComments().
      debug_out: not used by this function, other than being defaulted to
        a list

    Returns:
      The new HTML string
    """
    if debug_out is None:
        debug_out = []

    f = StringIO()
    out = htm8.Output(s, f)

    tag_lx = htm8.TagLexer(s)
    lx = htm8.Lexer(s)
    parser = UlTableParser(lx, tag_lx)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering the attributes of each column
        header_cells = table['thead']
        col_attrs = {}  # type: Dict[int, TdAttrs]
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            _PrintRawAttrs(out, tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Cells inherit the attributes of their column's header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintRawAttrs(out, merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()
560
561
if __name__ == '__main__':
    # Simple CLI filter: read HTML from stdin, strip comments, expand
    # ul-tables, and write the result to stdout
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))