OILS / doctools / ul_table.py View on Github | oilshell.org

496 lines, 247 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4try:
5 from cStringIO import StringIO
6except ImportError:
7 from io import StringIO
8import re
9
10from doctools.util import log
11from lazylex import html
12
# Matches a run of zero or more whitespace characters.  Because the run may be
# empty, match() succeeds at every position; code that needs "nothing but
# whitespace" must also check where the match ends.
_WHITESPACE_RE = re.compile(r'\s*')
14
15
class UlTableParser(object):
    r"""Parser for a table written as a nested bulleted list ("ul-table").

    The parser walks the token stream produced by `lexer` one token at a time
    and uses `tag_lexer` to inspect the tag at the current token.

    Typical use: call FindUlTable() to position the parser on the <ul> that
    follows a <table> start tag, then ParseTable() to consume that list.

    Attributes:
      lexer: token source.  This class uses lexer.Read(), which yields a
        (token id, end position) pair, lexer.LookAhead(regex), and lexer.s
        (the string being lexed).
      tag_lexer: used through Reset(start, end), TagName() and AllAttrsRaw().
      tok_id: id of the current token; html.Invalid before the first _Next().
      start_pos: offset in lexer.s where the current token starts.
      end_pos: offset in lexer.s where the current token ends.
    """

    def __init__(self, lexer, tag_lexer):
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """Advance one token, setting self.tok_id, self.start_pos, self.end_pos.

        The new token starts where the previous one ended.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        Args:
          regex: pattern that must match at the START of the token text
            (re.match semantics; the rest of the token is not checked).

        Raises:
          html.ParseError: the current token is not RawData, or its text does
            not match.
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """Assert that we got a start or end tag with the given name, and
        advance.

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: the current token has a different id or tag name.
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """Skip the current token if it is RawData made only of whitespace.

        Any other token, including RawData that contains text, is left in
        place so that the caller's next _Eat() reports it instead of silently
        dropping it.
        """
        if self.tok_id != html.RawData:
            return
        # The whitespace run must span the whole token.  The pattern can match
        # the empty string, so a bare "did it match?" test is always true.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may sit between the <table> start tag and the
        <ul> start tag.

        Returns:
          The START position of the <ul>, or -1 if the end of the stream is
          reached first.  On success the <ul> is the current token.
        """
        # True while the last non-RawData token seen was a <table> start tag.
        after_table = False
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            if after_table and self.tok_id == html.RawData:
                continue  # text between <table> and <ul> is skipped

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                if after_table and tag_name == 'ul':
                    return self.start_pos
                # A second <table> right after the first restarts the search
                # from that tag rather than being skipped.
                after_table = (tag_name == 'table')
            else:
                after_table = False

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) when the current
          token (after optional whitespace) is not a start tag, i.e. there are
          no more items.  td_attrs is the result of AllAttrsRaw() on the
          <cell-attrs /> tag, or None if the item has no such tag.

        Raises:
          html.ParseError: the item is malformed or its </li> is missing.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'cell-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        td_attrs = None  # Can we also have col-attrs?
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            # TODO: remove td-attrs backward compat
            if tag_name not in ('td-attrs', 'cell-attrs'):
                raise html.ParseError('Expected <cell-attrs />, got %r' %
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check a missing </li> would keep reading past
                # the end of the input.
                raise html.ParseError('Expected closing </li>, got EndOfStream')

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1
            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        # Token positions come from the lexer, so slice the lexer's string.
        inner_html = self.lexer.s[left:right]

        # The loop above stopped on the matching </li>; step past it.
        self._Next()

        return td_attrs, inner_html

    def _Cells(self):
        r"""Parse the nested list that holds the cells of one row.

        Grammar:

          CELLS =
            [StartTag 'ul']
            LIST_ITEM*         # an empty list is accepted by this code
            [RawData \s*]?
            [EndTag 'ul']

        Returns:
          A list of (td_attrs, inner_html) pairs, as returned by _ListItem().
        """
        self._Eat(html.StartTag, 'ul')

        cells = []
        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'ul')
        return cells

    def _ParseTHead(self):
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        Assumes the outer <ul> start tag has already been consumed.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM+
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.
        """
        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        cells = self._Cells()

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM+        # Defined above
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Returns:
          A pair (tr_attrs, cells), or (None, None) when the current token
          (after optional whitespace) is not a start tag, e.g. the </ul> that
          ends the table.  tr_attrs is the result of AllAttrsRaw() on the
          <row-attrs /> tag, or None if the row has no such tag.
        """
        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        cells = self._Cells()

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """Parse a whole ul-table; the current token must be its <ul>.

        Returns:
          A dict like this:

          { 'thead': [ (td_attrs, 'col1 html'), ... ],  # None if no thead row
            'tr': [  # one entry per row
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': position where the <ul> starts,
            'ul_end': position after the closing </ul> and any whitespace
                      token that follows it,
          }

          The cell HTML is raw; the caller surrounds it with <th> or <td>.

        Raises:
          html.ParseError: the list does not follow the grammar.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is deliberately not checked against the
            # header, because of colspan.
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
381
382
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with a cell's own attributes.

    Args:
      thead_td_attrs: sequence of (name, raw_value) pairs taken from the
        header cell of the column, or None / empty.
      row_td_attrs: sequence of (name, raw_value) pairs taken from the cell
        itself, or None / empty.

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      their original order; when the cell defines the same name, its value is
      appended to the header's value after a single space.  Attributes that
      only the cell defines follow, in their original order.
    """
    cell_values = dict(row_td_attrs) if row_td_attrs is not None else {}

    merged = []
    absorbed = set()  # cell attribute names already folded into a header attr

    for attr_name, header_value in thead_td_attrs or ():
        extra = cell_values.get(attr_name)
        if extra is None:
            merged.append((attr_name, header_value))
        else:
            absorbed.add(attr_name)
            merged.append((attr_name, header_value + ' %s' % extra))

    merged.extend((attr_name, cell_value)
                  for attr_name, cell_value in row_td_attrs or ()
                  if attr_name not in absorbed)

    return merged
408
409
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> that directly follows a <table> start tag is parsed with
    UlTableParser and replaced by <thead> / <tr> / <th> / <td> markup.  All
    other input is copied through unchanged.

    Args:
      s: the HTML string to transform.
      debug_out: accepted for interface compatibility; nothing in this
        function reads or fills it.

    Returns:
      The transformed HTML as a string.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    def print_attrs(attrs):
        # Emit ' name="value"' for each pair.  No escaping because the value
        # is raw.  It can't contain quotes.
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            out.Print('="%s"' % attr_value)

    while True:
        ul_pos = parser.FindUlTable()
        if ul_pos == -1:
            break

        # Copy everything before the list, then drop the list itself.
        out.PrintUntil(ul_pos)
        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Header row.  Attributes on a header cell are not printed on the
        # <th>; they are remembered per column and merged into that column's
        # <td> tags below.
        header = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if header:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, cells in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                print_attrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Start from what the column inherited from the header.
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                print_attrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()