OILS / doctools / ul_table.py View on Github | oilshell.org

493 lines, 244 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4import cStringIO
5import re
6
7from doctools.util import log
8from lazylex import html
9
# Zero or more whitespace characters.  NOTE: because this pattern can match
# the empty string, .match() succeeds at every position; a caller that needs
# to know whether a span is whitespace-only must also check where the match
# ends.
_WHITESPACE_RE = re.compile(r'\s*')
11
12
class UlTableParser(object):
    r"""Parser for a "ul-table": a table written as nested bulleted lists.

    The parser pulls tokens from `lexer` one at a time (via lexer.Read()) and
    uses `tag_lexer` to look inside the current tag token (tag name and raw
    attributes).  The current token is described by three fields:

        self.tok_id     token type, e.g. html.StartTag
        self.start_pos  offset in lexer.s where the current token starts
        self.end_pos    offset in lexer.s just past the current token

    Typical use, as done by the caller in this file: call FindUlTable() to
    position the parser on a <ul> that follows a <table> tag, then call
    ParseTable() to consume the list and get the table structure back.
    """

    def __init__(self, lexer, tag_lexer):
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """Advance one token, updating tok_id, start_pos and end_pos."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

    def _IsTag(self, tok_id, tag_name):
        """Return True if the current token is a `tok_id` tag named `tag_name`.

        The tag lexer is only reset when the token type already matches.
        """
        if self.tok_id != tok_id:
            return False
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        return self.tag_lexer.TagName() == tag_name

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that the current token is text matching `regex`, and advance.

        The regex is applied with re.match(), i.e. it is anchored at the start
        of the token text only.

        Raises:
          html.ParseError: the token is not RawData, or the text doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """Assert that the current token is the given tag, and advance.

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: wrong token type, or wrong tag name
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """Skip the current token if it is whitespace-only text.

        Any other token, including text that contains non-whitespace
        characters, is left in place for the caller to handle.
        """
        if self.tok_id != html.RawData:
            return

        # _WHITESPACE_RE can match the empty string, so a successful match
        # alone proves nothing.  Bound the match to the token and require it
        # to reach the token's end; otherwise real text would be skipped (and
        # later dropped from the output).
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Scans forward for a <table> start tag whose next non-text token is a
        <ul> start tag.

        Returns:
          The START position of the <ul>, with the parser positioned on that
          tag; or -1 if the end of the input is reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            if not self._IsTag(html.StartTag, 'table'):
                continue

            # Skip the text between <table> and whatever tag comes next
            self._Next()
            while self.tok_id == html.RawData:
                self._Next()

            if self._IsTag(html.StartTag, 'ul'):
                return self.start_pos

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a start tag.

        Raises:
          html.ParseError: malformed item, or input ends before the closing
            </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'cell-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

          - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

          - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            # TODO: remove td-attrs backward compat
            if tag_name not in ('td-attrs', 'cell-attrs'):
                raise html.ParseError('Expected <cell-attrs />, got %r' %
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, an unbalanced list would keep the loop
                # reading past the end of the input.
                raise html.ParseError(
                    'Unexpected end of input while looking for closing </li>')

            if self._IsTag(html.StartTag, 'li'):
                balance += 1
            elif self._IsTag(html.EndTag, 'li'):
                balance -= 1
                if balance < 0:
                    break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        # Consume the closing </li> that ended the loop above
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""Parse the 'thead' list item and its nested <ul> of header cells.

        Called by ParseTable() after the outer <ul> start tag has been
        consumed and the lookahead found '<li>thead'.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM+
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

          - thead
            - name [link][]
              - colgroup=foo align=left
            - age
              - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""Parse one 'tr' list item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where tr_attrs is the raw attributes of an
          optional <row-attrs /> tag (or None), and cells is a list of
          (td_attrs, inner_html) pairs.

          Returns (None, None) if the current token (after optional
          whitespace) is not a start tag, e.g. the </ul> that ends the table.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag row-attrs] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM+        # Defined above
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """Parse a whole ul-table, starting at its <ul> start tag.

        Returns:
          A dict like this:

          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None when the list has no 'thead' item
            'tr': [  # inner_html is raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': position where the <ul> start tag begins,
            'ul_end': position of the first token after the closing </ul>
                      and any whitespace-only text that follows it,
          }

        Raises:
          html.ParseError: the list doesn't match the grammar

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        rows = []
        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating len(tr) against len(thead), because of colspan
            rows.append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        return {
            'tr': rows,
            'thead': thead,
            'ul_start': ul_start,
            'ul_end': ul_end,
        }
378
379
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with a cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell,
        or None
      row_td_attrs: list of (name, raw_value) pairs from the row cell, or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      their original order; when the row defines the same name, its value is
      appended to the header value after a single space.  Row attributes that
      were not merged this way follow, in their original order.
    """
    if row_td_attrs is None:
        row_values = {}
    else:
        row_values = dict(row_td_attrs)

    result = []
    merged_names = set()

    for attr_name, attr_value in (thead_td_attrs or []):
        extra = row_values.get(attr_name)
        if extra is not None:
            attr_value = attr_value + ' %s' % extra
            merged_names.add(attr_name)
        result.append((attr_name, attr_value))

    # Row-only attributes go last; names already folded into a header
    # attribute are not repeated.
    result.extend((attr_name, attr_value)
                  for attr_name, attr_value in (row_td_attrs or [])
                  if attr_name not in merged_names)

    return result
405
406
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() (a <ul> following a
    <table> start tag) is parsed and replaced in the output by <thead>, <tr>,
    <th> and <td> markup.  Everything else in `s` is passed through.

    Args:
      s: the HTML string to transform
      debug_out: optional list; not otherwise used by this function

    Returns:
      The transformed HTML string.
    """
    if debug_out is None:
        debug_out = []

    buf = cStringIO.StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    parser = UlTableParser(html.Lexer(s), tag_lexer)

    def _PrintAttrs(attrs):
        """Print each (name, raw_value) pair as ' name="raw_value"'."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    ul_start = parser.FindUlTable()
    while ul_start != -1:
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header
        header_cells = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col_index, (td_attrs, cell_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col_index] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(cell_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row_cells in table['tr']:
            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col_index, (row_td_attrs, cell_html) in enumerate(row_cells):
                # Attributes inherited from the header cell of this column
                inherited = col_attrs.get(col_index)

                out.Print(' <td')
                _PrintAttrs(MergeAttrs(inherited, row_td_attrs))
                out.Print('>')

                out.Print(cell_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

        ul_start = parser.FindUlTable()

    out.PrintTheRest()

    return buf.getvalue()