OILS / doctools / ul_table.py View on Github | oils.pub

541 lines, 272 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4try:
5 from cStringIO import StringIO
6except ImportError:
7 from io import StringIO
8import re
9import sys
10
11from doctools.util import log
12from lazylex import html
13
14
def RemoveComments(s):
    """Remove <!-- comments --> from an HTML string.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Args:
      s: HTML string

    Returns:
      The HTML string without the deleted comments.
    """
    f = StringIO()
    out = html.Output(s, f)

    # Start offset of the current token.  html.ValidTokens() only yields the
    # END offset of each token, so we carry the previous end forward.
    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == html.Comment:
            value = s[pos:end_pos]
            if 'REPLACE' not in value:
                # Emit what precedes the comment, then jump past it
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
38
39
# Matches a possibly EMPTY run of whitespace.  Because the pattern can match
# the empty string, .match() on it never returns None; a caller that wants
# "this span is all whitespace" must also compare the end of the match.
_WHITESPACE_RE = re.compile(r'\s*')
41
42
class UlTableParser(object):
    r"""Parser that recognizes a "ul-table": <table> followed by a nested <ul>.

    The parser keeps ONE current token, described by self.tok_id and the span
    [self.start_pos, self.end_pos) into the input string.  The _Eat*() methods
    assert something about the current token and then advance.

    Typical use (see ReplaceTables):

        p = UlTableParser(lexer, tag_lexer)
        while p.FindUlTable() != -1:
            table = p.ParseTable()
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: token source.  This class uses lexer.Read(),
            lexer.LookAhead() and lexer.s
          tag_lexer: positioned on a tag with Reset(start, end).  This class
            uses tag_lexer.TagName(), tag_lexer.AllAttrsRaw() and tag_lexer.s
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # The current token.  Nothing has been read yet.
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        """Advance and set self.tok_id, self.start_pos, self.end_pos.

        Args:
          comment_ok: if false, an HTML comment token is an error

        Raises:
          html.ParseError: on an HTML comment, unless comment_ok is true
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still leave
        # some REPLACE comments
        if not comment_ok and self.tok_id == html.Comment:
            raise html.ParseError('Unexpected HTML comment')

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        The regex is applied with re.match(), so it is anchored at the start
        of the token but NOT at its end.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
            doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """Assert that we got a start or end tag with the given name, and
        advance.

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the token kind or the tag name differs
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        r"""Skip the current token if it is RawData made ONLY of whitespace.

        A RawData token containing anything else is left in place, so the
        caller's next _Eat() reports it instead of silently discarding text.
        """
        if self.tok_id != html.RawData:
            return

        # \s* can match the empty string, so the match always succeeds.  What
        # matters is whether it covers the whole token.  The endpos argument
        # stops the match from running into the next token.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def _AtStartTag(self, tag_name):
        """Return True if the current token is a start tag named tag_name."""
        if self.tok_id != html.StartTag:
            return False
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        return self.tag_lexer.TagName() == tag_name

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may appear between the two tags.  Comments are
        tolerated while searching, but a comment between <table> and <ul>
        means this <table> is not a ul-table.

        Returns:
          The START position of the <ul>, leaving the parser positioned on
          that <ul> token.  Or -1 if the end of the input is reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        # Always move off the token the previous call or ParseTable() left us
        # on.
        self._Next(comment_ok=True)

        while self.tok_id != html.EndOfStream:
            if not self._AtStartTag('table'):
                self._Next(comment_ok=True)
                continue

            # Found <table>.  Skip the text after it.
            self._Next(comment_ok=True)
            while self.tok_id == html.RawData:
                self._Next(comment_ok=True)

            if self._AtStartTag('ul'):
                return self.start_pos

            # Not a ul-table.  Do NOT advance here: the current token may be
            # another <table>, or EndOfStream, and must be examined by the
            # loop.

        return -1

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a start tag.  td_attrs is None
          when the item has no <cell-attrs />.

        Raises:
          html.ParseError: if the start tag isn't <li>, or the input ends
            before the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(html.StartTag, 'li')

        left = self.start_pos  # first character after <li>

        # Find the closing </li>, taking into account NESTED tags:
        #   <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, an unclosed <li> would spin forever
                raise html.ParseError('Expected token %s, got %s',
                                      html.TokenName(html.EndTag),
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()

            elif self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:  # this </li> closes OUR <li>
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # The loop above only stops on </li>, so consume it
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        The opening <ul> of the table has already been consumed by
        ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # zero cells is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where tr_attrs is None if the row has no
          <row-attrs />, and cells is a list of (td_attrs, inner_html) pairs.
          Or (None, None) if there is no more row, i.e. the current token
          (after optional whitespace) is not a start tag.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                # Pass the argument separately, like every other ParseError
                # in this class
                raise html.ParseError('Expected row-attrs, got %r', tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """Parse a whole ul-table.  The current token must be its first <ul>.

        Returns:
          A dict like this

          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None if the table has no thead
            'tr': [  # one (tr_attrs, cells) pair per row
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': position of the opening <ul>,
            'ul_end': position after the closing </ul> and any whitespace-only
                      text that follows it
          }

        Raises:
          html.ParseError: if the input doesn't match the grammar

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?   # this returns the number of cells, so it's NOT context
                     # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating len(tr) against len(thead), because of colspan
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
418
419
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with one cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell of
        this column, or None
      row_td_attrs: list of (name, raw_value) pairs from the body cell, or
        None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first,
      in header order; when the cell has an attribute of the same name, its
      value is appended after a space.  The cell's remaining attributes
      follow, in cell order.
    """
    if row_td_attrs is None:
        cell_values = {}
    else:
        cell_values = dict(row_td_attrs)

    result = []
    folded_names = set()  # cell attributes already merged into a header one

    for name, raw_value in (thead_td_attrs or ()):
        extra = cell_values.get(name)
        if extra is not None:
            raw_value = raw_value + ' %s' % extra
            folded_names.add(name)
        result.append((name, raw_value))

    result.extend((name, raw_value)
                  for name, raw_value in (row_td_attrs or ())
                  if name not in folded_names)

    return result
445
446
def ReplaceTables(s, debug_out=None):
    """ul-table: Write tables using bulleted lists.

    Every <table> that is followed by a <ul> has that list replaced with
    <thead> / <tr> / <th> / <td> markup.  The rest of the input is copied
    through.

    Args:
      s: HTML string, already passed through RemoveComments()
      debug_out: optional list; not used by this function at present

    Returns:
      The rewritten HTML string
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lex = html.TagLexer(s)
    lex = html.Lexer(s)
    parser = UlTableParser(lex, tag_lex)

    def _PrintAttrs(attrs):
        """Print name="value" pairs, each preceded by a space."""
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw_value)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        # Copy everything before the list, including the <table> tag
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Header.  Its cell attributes aren't printed on <th>; they are
        # remembered per column and inherited by the <td> cells below.
        col_attrs = {}  # integer -> td_attrs
        thead = table['thead']
        if thead:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(thead):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Inherited from header
                out.Print(' <td')
                _PrintAttrs(MergeAttrs(col_attrs.get(col), row_td_attrs))
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
534
535
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, strip comments, expand ul-tables,
    # and write the result to stdout
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))