OILS / doctools / ul_table.py View on Github | oilshell.org

491 lines, 241 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4import cStringIO
5import re
6
7from doctools.util import log
8from lazylex import html
9
# Zero or more whitespace characters.  Because of the '*', this pattern also
# matches the empty string, so .match() never returns None; callers that need
# "this span is ALL whitespace" must compare the match end with the span end.
_WHITESPACE_RE = re.compile(r'\s*')
11
12
class UlTableParser(object):
    """Parser for a table that is written as nested <ul> lists.

    It pulls tokens from `lexer` one at a time and keeps the current token in
    self.tok_id / self.start_pos / self.end_pos.  `tag_lexer` is pointed at the
    current token whenever a tag name or its attributes are needed.
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: token source; this class uses Read(), LookAhead() and .s
          tag_lexer: tag inspector; this class uses Reset(), TagName(),
                     AllAttrsRaw() and .s
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # No token has been read yet
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _CurrentTagName(self):
        """Point the tag lexer at the current token and return its tag name.

        Only call this when the current token is a tag.  The tag lexer stays
        positioned on the token, so AllAttrsRaw() can be called afterwards.
        """
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        return self.tag_lexer.TagName()

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        # The new token begins where the previous one ended
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it is anchored at the start of
        the token text only.

        Raises:
          html.ParseError: the current token isn't RawData, or doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: the current token has a different ID or tag name
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        tag_name = self._CurrentTagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Skip the current token only if it is RawData made up ENTIRELY of
        whitespace.  Any other token is left in place, so the caller's next
        _Eat() reports it instead of silently dropping text.
        """
        if self.tok_id != html.RawData:
            return

        # r'\s*' always matches (possibly the empty string), so a successful
        # match proves nothing by itself.  Limit the match to the token and
        # require the whitespace run to reach the token's end.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Scans forward for a <table> start tag that is followed, after optional
        RawData tokens, by a <ul> start tag.

        Returns:
          The START position of the <ul>, with the <ul> left as the current
          token.  Returns -1 if EndOfStream is reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            if not (self.tok_id == html.StartTag and
                    self._CurrentTagName() == 'table'):
                continue

            # Skip the text between <table> and whatever tag comes next
            self._Next()
            while self.tok_id == html.RawData:
                self._Next()

            if (self.tok_id == html.StartTag and
                    self._CurrentTagName() == 'ul'):
                return self.start_pos
            # Not a ul-table; keep looking for the next <table>

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a start tag.

          td_attrs is the result of AllAttrsRaw() on <td-attrs />, or None if
          that tag is absent.  inner_html is the raw source text between the
          optional <td-attrs /> and the matching </li>.

        Raises:
          html.ParseError: the item doesn't start with <li>, a self-closing
            tag other than <td-attrs /> follows <li>, or the input ends before
            the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'td-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <td-attrs class=foo /> hi there ==>
          <li><td-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        td_attrs = None  # Can we also have col-attrs?
        if self.tok_id == html.StartEndTag:
            tag_name = self._CurrentTagName()
            if tag_name != 'td-attrs':
                raise html.ParseError('Expected <td-attrs />, got %r' %
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>.  Cells can contain bulleted lists, so count
        # nested <li> ... </li> pairs: the </li> that drives the balance below
        # zero is the one that closes this item.
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check an unclosed <li> would loop forever
                raise html.ParseError('Expected closing </li>, got %s' %
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                if self._CurrentTagName() == 'li':
                    balance += 1
            elif self.tok_id == html.EndTag:
                if self._CurrentTagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        # The loop only exits on </li>, so just step over it
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Parse the <li>thead item and its nested <ul>.  The table's opening
        <ul> has already been consumed by the caller.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Raises:
          html.ParseError: the tokens don't follow the grammar below

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined in _ListItem; an empty list isn't
                              # rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the current token
          (after optional whitespace) is not a start tag, e.g. the table's
          closing </ul>.

          tr_attrs is the result of AllAttrsRaw() on <tr-attrs />, or None if
          that tag is absent.  cells is a list of (td_attrs, inner_html) pairs.

        Raises:
          html.ParseError: the tokens don't follow the grammar below

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'tr-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined in _ListItem; an empty list isn't
                              # rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            # Same validation as <td-attrs /> in _ListItem, so that the
            # attributes of an unrelated self-closing tag don't end up on <tr>
            tag_name = self._CurrentTagName()
            if tag_name != 'tr-attrs':
                raise html.ParseError('Expected <tr-attrs />, got %r' %
                                      tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse a whole ul-table.  The current token must be the opening <ul>,
        which is where FindUlTable() leaves the parser.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None when there is no thead item
            'tr': [  # each row is (tr_attrs, cells); the cell HTML is raw
                     # HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': start position of the opening <ul>,
            'ul_end': start position of the token after the closing </ul>
                      and any whitespace that follows it,
          }

        Raises:
          html.ParseError: the tokens don't follow the grammar below

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        rows = []
        while True:
            tr_attrs, cells = self._ParseTr()
            if cells is None:
                break
            # The number of cells is deliberately NOT checked against thead,
            # because of colspan
            rows.append((tr_attrs, cells))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        return {
            'thead': thead,
            'tr': rows,
            'ul_start': ul_start,
            'ul_end': ul_end,
        }
376
377
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's attributes with a cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell of
                      this column, or None
      row_td_attrs: list of (name, raw_value) pairs from the cell itself, or
                    None

    Returns:
      A list of (name, raw_value) pairs.  Header attributes come first, in
      header order; when the cell has the same attribute, its value is
      appended after a space.  Cell attributes that weren't combined this way
      follow, in cell order.
    """
    header_attrs = thead_td_attrs or []
    cell_attrs = row_td_attrs or []

    cell_values = dict(cell_attrs)

    merged = []
    combined_names = set()  # cell attributes already folded into a header one

    for attr_name, header_value in header_attrs:
        extra = cell_values.get(attr_name)
        if extra is None:
            merged.append((attr_name, header_value))
            continue
        combined_names.add(attr_name)
        merged.append((attr_name, header_value + ' %s' % extra))

    merged.extend((attr_name, cell_value)
                  for attr_name, cell_value in cell_attrs
                  if attr_name not in combined_names)

    return merged
403
404
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <table> that is directly followed by a <ul> has that list replaced
    with <thead> / <tr> / <td> markup; all other text is copied through.

    Args:
      s: HTML string
      debug_out: optional list; not written to by this function

    Returns:
      The rewritten HTML string
    """
    if debug_out is None:
        debug_out = []

    buf = cStringIO.StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)

    parser = UlTableParser(lexer, tag_lexer)

    def PrintAttrs(attrs):
        """Write each (name, raw_value) pair as ' name="raw_value"'."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    # FindUlTable() returns -1 once there are no more tables
    for ul_start in iter(parser.FindUlTable, -1):
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header
        header_cells = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col_index, (td_attrs, cell_html) in enumerate(header_cells):
                if td_attrs:
                    # Remembered so that the cells in this column inherit them
                    col_attrs[col_index] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print('  <th>')
                out.Print(cell_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                PrintAttrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col_index, (row_td_attrs, cell_html) in enumerate(cells):
                # Inherited from header
                inherited = col_attrs.get(col_index)

                out.Print('  <td')
                PrintAttrs(MergeAttrs(inherited, row_td_attrs))
                out.Print('>')

                out.Print(cell_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()