OILS / doctools / ul_table.py View on Github | oils.pub

509 lines, 253 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4try:
5 from cStringIO import StringIO
6except ImportError:
7 from io import StringIO
8import re
9import sys
10
11from doctools.util import log
12from lazylex import html
13
# Matches a possibly-empty run of whitespace.  Because the pattern can match
# the empty string, match() never returns None; to test whether a span is
# entirely whitespace, compare the end of the match with the end of the span.
_WHITESPACE_RE = re.compile(r'\s*')
15
16
class UlTableParser(object):
    """Parser for tables that are written as nested bulleted lists.

    It walks the tokens produced by `lexer` (an object with a string `s`, a
    Read() method returning (tok_id, end_pos), and LookAhead(regex)), and
    uses `tag_lexer` (Reset(start, end), TagName(), AllAttrsRaw()) to look
    inside individual tags.

    The current token is described by three attributes:

      tok_id     the token type, one of the html.* IDs
      start_pos  offset of the first character of the token
      end_pos    offset just past the token
    """

    def __init__(self, lexer, tag_lexer):
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # No token has been read yet
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """Advance one token.

        Sets self.tok_id, self.start_pos, and self.end_pos.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        The regex is only anchored at the START of the token (re.match), so
        trailing text after the match is accepted.

        Raises:
          html.ParseError: the current token isn't RawData, or it doesn't
            match.
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """Assert that we got a start or end tag with the given name, and
        advance.

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: the token type or the tag name doesn't match.
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        r"""Skip the current token if it is RawData made ONLY of whitespace.

        Any other token, including RawData with visible text, is left in
        place.
        """
        if self.tok_id != html.RawData:
            return

        # \s* matches the empty string, so a bare match() is always truthy.
        # Restrict the match to this token and require that it reaches the
        # end of the token; otherwise real text would be silently skipped.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Scans forward for a <table> start tag which is followed, after
        optional RawData, by a <ul> start tag.

        Similar algorithm as html.ReadUntilStartTag()

        Returns:
          The START position of the <ul>, with the <ul> as the current
          token, or -1 if the end of the stream is reached first.
        """
        tag_lexer = self.tag_lexer

        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if not (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                continue

            # Skip the text between <table> and whatever comes next
            while True:
                self._Next()
                if self.tok_id != html.RawData:
                    break

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'ul'):
                return self.start_pos
            # Not a ul-table: keep looking for the next <table>

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current
          token (after optional whitespace) is not a start tag.  td_attrs is
          the result of AllAttrsRaw() on a <cell-attrs /> tag, or None if
          there isn't one.

        Raises:
          html.ParseError: the first tag isn't <li>, or the input ends
            before the matching </li>.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(html.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # An unclosed <li> can never be balanced, so stop here
                # instead of reading past the end of the input.
                raise html.ParseError(
                    "Expected closing </li>, got end of input")

            if self.tok_id == html.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()

            elif self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # The loop above only exits on </li>, so there's no need to _Eat()
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        Assumes the outer <ul> start tag was already consumed.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Raises:
          html.ParseError: the tokens don't follow the grammar below.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the current token
          (after optional whitespace) is not a start tag, e.g. it's the
          </ul> that ends the table.  tr_attrs is the result of
          AllAttrsRaw() on a <row-attrs /> tag, or None.  cells is a list of
          (td_attrs, inner_html) pairs.

        Raises:
          html.ParseError: the tokens don't follow the grammar below.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """Parse a whole ul-table; the current token must be its <ul>.

        Returns:
          A structure like this:

          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': position where the <ul> starts,
            'ul_end': position after the </ul> and any whitespace-only text
                      that follows it,
          }

          'thead' is None when the list has no thead item.

        Raises:
          html.ParseError: the tokens don't follow the grammar below.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
            [RawData \\s*]?
        """
        rows = []

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating len(tr) against len(thead), because of colspan
            rows.append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        return {
            'tr': rows,
            'thead': thead,
            'ul_start': ul_start,
            'ul_end': ul_end,
        }
389
390
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine the attributes a cell inherits from its column with its own.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell
        of the column, or None
      row_td_attrs: list of (name, raw_value) pairs from the cell itself,
        or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first,
      in order; when the cell has an attribute of the same name, its value
      is appended after a space.  The remaining cell attributes follow, in
      order.
    """
    header_pairs = thead_td_attrs if thead_td_attrs else []
    cell_pairs = row_td_attrs if row_td_attrs else []

    cell_values = dict((key, value) for key, value in cell_pairs)

    result = []
    combined = set()  # names already folded into a header attribute

    for key, value in header_pairs:
        extra = cell_values.get(key)
        if extra is not None:
            value = value + (' %s' % extra)
            combined.add(key)
        result.append((key, value))

    result.extend(
        (key, value) for key, value in cell_pairs if key not in combined)

    return result
416
417
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> that directly follows a <table> start tag is parsed with
    UlTableParser and replaced by <thead>, <tr>, <th> and <td> markup.  The
    rest of the document is copied through.

    Args:
      s: the HTML document, as a string
      debug_out: optional list; not used by this function at the moment

    Returns:
      The rewritten HTML, as a string.
    """
    if debug_out is None:
        debug_out = []

    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)

    p = UlTableParser(lexer, tag_lexer)

    def _PrintAttrs(attrs):
        # No escaping because the values are raw.  They can't contain quotes.
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            out.Print('="%s"' % raw_value)

    while True:
        ul_start = p.FindUlTable()
        if ul_start == -1:
            break

        # Copy everything before the <ul> unchanged
        out.PrintUntil(ul_start)

        table = p.ParseTable()

        # Don't write the list itself, up to and including the </ul> that
        # closes it, but write everything after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering which columns carry attributes
        header_cells = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print('  <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Attributes inherited from the header, plus the cell's own
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print('  <td')
                _PrintAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()
505
506
if __name__ == '__main__':
    # Simple CLI filter: read HTML from stdin, write the converted HTML to
    # stdout
    html_in = sys.stdin.read()
    html_out = ReplaceTables(html_in)
    sys.stdout.write(html_out)