OILS / doctools / ul_table.py View on Github | oilshell.org

502 lines, 250 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4try:
5 from cStringIO import StringIO
6except ImportError:
7 from io import StringIO
8import re
9import sys
10
11from doctools.util import log
12from lazylex import html
13
# Pattern for a (possibly empty) run of whitespace.  Because \s* also matches
# the empty string, .match() with this pattern never returns None; a caller
# that wants "only whitespace" has to check how far the match extends.
_WHITESPACE_RE = re.compile(r'\s*')
15
16
class UlTableParser(object):
    """Recursive-descent parser for a table written as nested <ul> lists.

    Tokens are pulled from `lexer`; `tag_lexer` is pointed at the current
    token whenever a tag name or its attributes are needed.  The current
    token is described by self.tok_id and the range
    [self.start_pos, self.end_pos) into the lexer's string.
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: token source.  This class uses its Read() and LookAhead()
            methods and its `s` attribute.
          tag_lexer: tag inspector.  This class uses its Reset(), TagName()
            and AllAttrsRaw() methods and its `s` attribute.
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # No token has been read yet
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """Advance and set self.tok_id, self.start_pos, self.end_pos."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        The regex is applied with re.match(), so it only has to match a
        prefix of the token's text.

        Raises:
          html.ParseError: if the current token is not RawData, or its text
            does not match.
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        if re.match(regex, actual) is None:  # could compile this
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """Assert that we got a start or end tag with the given name, and
        advance.

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different ID or a
            different tag name.
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """Skip the current token if it is RawData made only of whitespace.

        Any other token, including RawData that contains real text, is left
        in place for the caller to handle.
        """
        if self.tok_id != html.RawData:
            return
        # \s* matches the empty string, so a successful match alone proves
        # nothing.  The token is whitespace only if the match, limited to the
        # token's span, reaches the token's end.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> start tag and the
        <ul> start tag.

        Similar algorithm as html.ReadUntilStartTag()

        Returns:
          The START position of the <ul>, or -1 if the end of the stream is
          reached first.
        """
        tag_lexer = self.tag_lexer

        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip the text that follows <table>
                while True:
                    self._Next()
                    if self.tok_id != html.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the next token,
          after optional whitespace, is not a start tag.  td_attrs is None
          when the item has no <cell-attrs /> tag.

        Raises:
          html.ParseError: if the start tag is not <li>, if the self-closing
            tag that follows it is not <cell-attrs /> (or the older
            <td-attrs />), or if the input ends before the closing </li>.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'cell-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            # TODO: remove td-attrs backward compat
            if tag_name not in ('td-attrs', 'cell-attrs'):
                raise html.ParseError('Expected <cell-attrs />, got %r' %
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, an unclosed <li> makes the loop spin
                # forever, since no further token can change `balance`.
                raise html.ParseError(
                    'Expected </li> for the list item whose content starts '
                    'at position %d, got end of input' % left)

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            if self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        # The current token is the matching </li>; step over it
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""Parse the <li>thead item and the nested <ul> of header cells.

        The caller has already consumed the outer <ul> start tag.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Raises:
          html.ParseError: if the tokens don't follow the grammar below.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined in _ListItem
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""Parse one <li>tr item and the nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the next token, after
          optional whitespace, is not a start tag (it could be the </ul>
          that ends the table).  tr_attrs is None when the row has no
          <row-attrs /> tag.  cells is a list of (td_attrs, inner_html)
          pairs.

        Raises:
          html.ParseError: if the tokens don't follow the grammar below.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined in _ListItem
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """Parse a whole ul-table; the current token must be its <ul>.

        Returns:
          A dict like this

          { 'thead': [ (td_attrs, 'col1 html'), ... ],  # or None
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': start position of the outer <ul>,
            'ul_end': start position of the first token after the outer
                      </ul> and the whitespace that follows it, if any
          }

        Raises:
          html.ParseError: if the tokens don't follow the grammar below.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?   # this returns the number of cells, so it's NOT
                     # context free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is NOT validated against the header,
            # because of colspan
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
382
383
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine the attributes a cell inherits from its column with its own.

    Args:
      thead_td_attrs: (name, raw_value) pairs from the header cell, or None
      row_td_attrs: (name, raw_value) pairs from the row's cell, or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first;
      when the cell has a non-None value under the same name, it is appended
      to the header value after a space.  The cell's remaining attributes
      follow in their original order.
    """
    # Last value wins if the cell repeats a name
    cell_values = {}
    if row_td_attrs is not None:
        for key, val in row_td_attrs:
            cell_values[key] = val

    combined = []
    already_merged = set()

    for key, val in (thead_td_attrs or ()):
        extra = cell_values.get(key)
        if extra is not None:
            val = val + (' %s' % extra)
            already_merged.add(key)
        combined.append((key, val))

    for key, val in (row_td_attrs or ()):
        if key not in already_merged:
            combined.append((key, val))

    return combined
409
410
def ReplaceTables(s, debug_out=None):
    """ul-table: Write tables using bulleted list

    Each <ul> that FindUlTable() locates right after a <table> start tag is
    replaced by <thead> / <tr> / <td> markup; everything else in `s` is
    copied through.

    Args:
      s: HTML string
      debug_out: optional list; not used by the current implementation

    Returns:
      The rewritten HTML string.
    """
    if debug_out is None:
        debug_out = []

    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)
    p = UlTableParser(lexer, tag_lexer)

    def print_attrs(attrs):
        """Write ' name="value"' for each pair in attrs."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    while True:
        ul_start = p.FindUlTable()
        if ul_start == -1:
            break

        out.PrintUntil(ul_start)

        table = p.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering the attributes each column passes
        # down to its cells
        header_cells = table['thead']
        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print('  <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                print_attrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # The header's attributes for this column are inherited
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print('  <td')
                print_attrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()
498
499
if __name__ == '__main__':
    # Simple CLI filter: read HTML from stdin, write the converted HTML to
    # stdout
    html_in = sys.stdin.read()
    html_out = ReplaceTables(html_in)
    sys.stdout.write(html_out)