1 | #!/usr/bin/env python2
|
2 | """ul_table.py: Markdown Tables Without New Syntax."""
|
3 |
|
4 | from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
|
5 |
|
6 | try:
|
7 | from cStringIO import StringIO
|
8 | except ImportError:
|
9 | from io import StringIO # type: ignore
|
10 | import re
|
11 | import sys
|
12 |
|
13 | from doctools.util import log
|
14 | from data_lang import htm8
|
15 | from typing import List
|
16 | from typing import Optional
|
17 | from typing import Tuple
|
18 | from typing import Any
|
19 | from typing import Dict
|
20 |
|
21 |
|
def RemoveComments(s):
    # type: (str) -> str
    """Strip <!-- HTML comments --> out of a document.

    ul-table requires this as a preprocessing step.  Comments whose text
    contains 'REPLACE' are left in place.

    Args:
      s: the HTML document

    Returns:
      The document with the other comments removed.

    Raises:
      htm8.LexError: if the lexer produces an Invalid token
    """
    buf = StringIO()
    out = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # [tok_start, tok_end) is the span of the token just read
    tok_start = 0
    tok_id, tok_end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError('RemoveComments() got invalid token', s,
                                tok_start)

        # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
        if tok_id == h8_id.Comment and 'REPLACE' not in s[tok_start:tok_end]:
            # Emit everything before the comment, then jump over it
            out.PrintUntil(tok_start)
            out.SkipTo(tok_end)

        tok_start = tok_end
        tok_id, tok_end = lexer.Read()

    out.PrintTheRest()
    return buf.getvalue()
|
51 |
|
52 |
|
# Matches a (possibly empty) run of whitespace.  Because \s* also matches the
# empty string, .match() with this pattern never returns None; callers must
# inspect the extent of the match rather than its truthiness.
_WHITESPACE_RE = re.compile(r'\s*')

# Tag attributes as a list of (name, raw value) pairs
TdAttrs = List[Tuple[str, str]]
|
56 |
|
57 |
|
class UlTableParser(object):
    """Parser that turns the <ul> nested in a <table> into rows and cells.

    The parser keeps one token of lookahead: self.tok_id is the kind of the
    current token, and [self.start_pos, self.end_pos) is its span in
    self.lexer.s.
    """

    def __init__(self, lexer, tag_lexer):
        # type: (htm8.Lexer, htm8.TagLexer) -> None
        """
        Args:
          lexer: source of tokens; the string it reads is used as lexer.s
          tag_lexer: reads the attributes of <cell-attrs /> and <row-attrs />.
            It is Reset() to spans taken from `lexer`, so presumably both must
            be constructed over the same string.
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0
        # The tag name is only populated when we are "looking at"
        # h8_id.{StartTag,EndTag,StartEndTag}
        self.tag_name = None  # type: Optional[str]

    def _CurrentString(self):
        # type: () -> str
        """Return the text of the current token."""
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """Advance and set self.tok_id, self.start_pos, self.end_pos

        self.tag_name is set for StartTag, EndTag and StartEndTag tokens, and
        reset to None for every other token.

        Args:
          comment_ok: when False, a Comment token is an error

        Raises:
          htm8.ParseError: on a Comment token, unless comment_ok is True
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if self.tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
            self.tag_name = self.lexer.CanonicalTagName()
        else:
            self.tag_name = None

        # Should have called RemoveComments() beforehand.  That can still leave
        # some REPLACE comments
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it only has to match a prefix
        of the token's text.

        Raises:
          htm8.ParseError: if the current token isn't RawData, or its text
            doesn't match
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the current token has a different kind or a
            different tag name
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        if expected_tag != self.tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, self.tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """Optional whitespace

        Advance past the current token only if it's RawData that consists
        entirely of whitespace.
        """
        if self.tok_id != h8_id.RawData:
            return

        # _WHITESPACE_RE can match the empty string, so the match object is
        # always truthy.  What matters is whether the whitespace run covers the
        # WHOLE token; otherwise we would silently skip real text.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only RawData may appear between the <table> start tag and the <ul>
        start tag.  Comments are tolerated while searching.

        Return the START position of the <ul>, or -1 if the end of the stream
        is reached first.
        Similar algorithm as html.ReadUntilStartTag()
        """
        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            if (self.tok_id == h8_id.StartTag and self.tag_name == 'table'):
                # Skip the text between <table> and whatever follows it
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                if (self.tok_id == h8_id.StartTag and self.tag_name == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if we're not looking
          at a start tag, i.e. there are no more items.

          td_attrs comes from a <cell-attrs /> tag inside the item, and is None
          when there isn't one.  inner_html is the text between <li> and its
          matching </li>, with the <cell-attrs /> tag cut out.

        Raises:
          htm8.ParseError: if the start tag isn't <li>, or the stream ends
            before the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check, an unclosed <li> would never reach the
                # break below.
                raise htm8.ParseError('Expected </li>, got end of stream')

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                # TODO: remove td-attrs backward compat
                if self.tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                if self.tag_name == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                if self.tag_name == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        # We broke out of the loop while looking at the matching </li>
        #self._Eat(h8_id.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""
        Assume the table's opening <ul> was just consumed.  Now we want to
        find <li>thead and the nested <ul>

        Returns:
          A list of (td_attrs, inner_html), one per header cell

        Raises:
          htm8.ParseError: if the tokens don't follow the grammar

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute extensions
        to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""
        Parse one row: find <li>tr and the nested <ul>

        Returns:
          A pair (tr_attrs, cells), or (None, None) if we're not looking at a
          start tag, i.e. there are no more rows.

          tr_attrs comes from an optional <row-attrs /> tag, and cells is a
          list of (td_attrs, inner_html).

        Raises:
          htm8.ParseError: if the tokens don't follow the grammar

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            if self.tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' %
                                      self.tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse a whole ul-table.  Assumes we're looking at its opening <ul>,
        which is where FindUlTable() leaves us.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None when there is no thead
            'tr': [  # one (tr_attrs, cells) pair per row; the html is raw
                     # HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': 10,  # start position of the opening <ul>
            'ul_end': 99,    # start of the token that follows the closing
                             # </ul> and any whitespace after it
          }

        Raises:
          htm8.ParseError: if the tokens don't follow the grammar

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise htm8.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        # Whitespace after </ul> is considered part of the list
        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
|
439 |
|
440 |
|
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's header attributes with a cell's own attributes.

    Header attributes come first, in their original order.  When the cell has
    an attribute with the same name, its value is appended to the header value
    after a single space.  Cell attributes that were not merged this way follow
    in their original order.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header cell, or None
      row_td_attrs: (name, raw value) pairs from the row's cell, or None

    Returns:
      A new list of (name, raw value) pairs
    """
    header_attrs = thead_td_attrs or []
    cell_attrs = row_td_attrs or []

    cell_values = dict(cell_attrs)

    result = []
    consumed = set()  # names already folded into a header attribute
    for name, raw_value in header_attrs:
        extra = cell_values.get(name)
        if extra is not None:
            raw_value = raw_value + ' %s' % extra
            consumed.add(name)
        result.append((name, raw_value))

    result.extend(
        (name, raw_value) for name, raw_value in cell_attrs
        if name not in consumed)
    return result
|
470 |
|
471 |
|
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Each <ul> found directly inside a <table> is replaced with <thead>, <tr>,
    <th> and <td> markup.  Everything else is copied through.

    Args:
      s: the HTML document; comments should already have been removed with
        RemoveComments()
      debug_out: not used by this function beyond being defaulted to a list

    Returns:
      The document with the tables replaced
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = htm8.Output(s, buf)

    tag_lexer = htm8.TagLexer(s)
    lexer = htm8.Lexer(s)

    parser = UlTableParser(lexer, tag_lexer)

    def _PrintAttrs(attrs):
        # type: (TdAttrs) -> None
        """Print each attribute as ' name="value"'."""
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw_value)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        # Copy everything up to the list, then parse it
        out.PrintUntil(ul_start)
        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering the attributes of each column
        thead = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if thead:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(thead):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Cells inherit the attributes of their column's header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
|
560 |
|
561 |
|
if __name__ == '__main__':
    # Simple CLI filter: HTML on stdin, HTML with tables replaced on stdout
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))
|