OILS / doctools / ul_table.py View on Github | oilshell.org

481 lines, 239 significant
1#!/usr/bin/env python2
2"""ul_table.py: Markdown Tables Without New Syntax."""
3
4import cStringIO
5import re
6
7from doctools.util import log
8from lazylex import html
9
10
class UlTableParser(object):
    r"""Parser for a table that was written as a nested bulleted list.

    The parser pulls (tok_id, end_pos) pairs from `lexer` and uses `tag_lexer`
    to inspect the tag that spans [start_pos, end_pos) of the document string
    `tag_lexer.s`.

    Typical use (see ReplaceTables): call FindUlTable() to position the parser
    on the <ul> that follows a <table>, then ParseTable() to consume it.
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: iterator yielding (tok_id, end_pos) pairs
          tag_lexer: object exposing .s, Reset(start, end), TagName() and
                     AllAttrsRaw() for the same document
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # Current token: its ID and the half-open span [start_pos, end_pos)
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text of the current token."""
        return self.tag_lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        A StopIteration raised by an exhausted lexer propagates to the caller.
        """
        # The new token starts where the previous one ended.
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = next(self.lexer)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        Raises:
          html.ParseError if the current token isn't RawData, or if its text
          doesn't match `regex` at the start (re.match semantics)
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, tok_id, s):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          tok_id: the token ID that the current token must have
          s: the expected tag name for StartTag / EndTag; must be None for any
             other token type

        Raises:
          html.ParseError on a token or tag name mismatch
          AssertionError if `s` is given for a token type that has no name
        """
        if self.tok_id != tok_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(tok_id),
                                  html.TokenName(self.tok_id))
        if tok_id in (html.StartTag, html.EndTag):
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if s != tag_name:
                raise html.ParseError('Expected tag %r, got %r', s, tag_name)
        else:
            if s is not None:
                raise AssertionError("Don't know what to do with %r" % s)
        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace: skip the current token if it's all-whitespace
        RawData.
        """
        if self.tok_id == html.RawData and self._CurrentString().isspace():
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.

        Return the START position of the <ul>, or -1 if the end of the stream
        is reached first.
        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            # Find the next <table>
            if self.tok_id != html.StartTag:
                continue
            tag_lexer.Reset(self.start_pos, self.end_pos)
            if tag_lexer.TagName() != 'table':
                continue

            # Skip the text between <table> and the next non-text token
            while True:
                self._Next()
                if self.tok_id != html.RawData:
                    break

            # Don't advance past the end of the stream on the next iteration
            if self.tok_id == html.EndOfStream:
                return -1

            if self.tok_id != html.StartTag:
                continue
            tag_lexer.Reset(self.start_pos, self.end_pos)
            if tag_lexer.TagName() == 'ul':
                return self.start_pos

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) isn't a start tag.

          td_attrs is the result of tag_lexer.AllAttrsRaw() on <td-attrs />,
          or None when that tag is absent.  inner_html is the raw text between
          the optional <td-attrs /> and the matching </li>.

        Raises:
          html.ParseError if the start tag isn't <li>, if a self-closing tag
          other than <td-attrs /> directly follows <li>, or if the stream ends
          before the matching </li>.

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          [StartEndTag 'td-attrs']?
          ANY*               # NOT context-free - anything that's not the end
                             # This is what we should capture in CELLS
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <td-attrs class=foo /> hi there ==>
        <li><td-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        td_attrs = None  # Can we also have col-attrs?
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'td-attrs':
                raise html.ParseError('Expected <td-attrs />, got %r',
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking NESTED items into account:
        #   <li> <li>foo</li> </li>
        # because cells can have bulleted lists.
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check we would advance past the end of input.
                raise html.ParseError('Expected closing </li>, got %s',
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:  # this </li> closes the cell itself
                        break

            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        # Skip the </li>, which the loop above already verified
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Assume the outer <ul> start tag was just consumed.  Now we want to
        find <li>thead and the nested <ul>

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s*]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM+
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s*'.
        self._EatRawData(r'thead\s*')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the current token
          (after optional whitespace) isn't a start tag, e.g. it's the </ul>
          that closes the table.

          tr_attrs is the result of tag_lexer.AllAttrsRaw() on <tr-attrs />,
          or None when that tag is absent.  cells is a list of
          (td_attrs, inner_html) pairs.

        Raises:
          html.ParseError on a structural mismatch, including a self-closing
          tag other than <tr-attrs /> after 'tr'

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'tr-attrs'] [RawData \s*]? )?
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM+        # Defined above
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            # Validate the tag name, like _ListItem does for <td-attrs />
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'tr-attrs':
                raise html.ParseError('Expected <tr-attrs />, got %r',
                                      tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse the whole <ul> that the parser is currently positioned on.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': position of the opening <ul>,
            'ul_end': position after the closing </ul> and optional whitespace
          }

        where each td_attrs / tr_attrs may be None.

        Raises:
          html.ParseError if the list doesn't have the structure below

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD   # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        thead = self._ParseTHead()

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is NOT validated against len(thead),
            # because of colspan
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
368
369
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with a cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs inherited from the
                      header cell of this column, or None
      row_td_attrs: list of (name, raw_value) pairs given on the cell itself,
                    or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      their original order; when the cell has an attribute of the same name,
      its value is appended after a space.  The cell's remaining attributes
      follow, in their original order.
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # Name -> value for the cell; a repeated name keeps its last value
    cell_values = dict(cell_pairs)

    combined = []
    consumed = set()  # cell attribute names already folded into the header's
    for attr_name, header_value in header_pairs:
        extra = cell_values.get(attr_name)
        if extra is None:
            combined.append((attr_name, header_value))
        else:
            consumed.add(attr_name)
            combined.append((attr_name, header_value + ' %s' % extra))

    combined.extend((attr_name, cell_value)
                    for attr_name, cell_value in cell_pairs
                    if attr_name not in consumed)
    return combined
395
396
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is parsed and replaced by
    <thead> and <tr> markup; the rest of the document is copied through
    html.Output unchanged.

    Args:
      s: the HTML document, as a string
      debug_out: optional list; not used by the current implementation

    Returns:
      The rewritten document, as a string
    """
    if debug_out is None:
        debug_out = []

    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    tokens = html.ValidTokens(s)
    parser = UlTableParser(tokens, tag_lexer)

    def _PrintAttrs(attrs):
        """Write each (name, raw_value) pair as ' name="raw_value"'."""
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw_value)

    # FindUlTable() returns -1 when there are no more tables
    for ul_start in iter(parser.FindUlTable, -1):
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Attributes on a header cell are inherited by the cells below it
        col_attrs = {}  # integer -> td_attrs

        # Write the header
        out.Print('<thead>\n')
        out.Print('<tr>\n')
        for col_index, (header_attrs, header_html) in enumerate(
                table['thead']):
            if header_attrs:
                col_attrs[col_index] = header_attrs
            # <th> tag is more semantic, and styled bold by default
            out.Print('  <th>')
            out.Print(header_html)
            out.Print('</th>\n')
        out.Print('</tr>\n')
        out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:
            out.Print('<tr')
            _PrintAttrs(tr_attrs or [])
            out.Print('>\n')

            for col_index, (cell_attrs, cell_html) in enumerate(cells):
                inherited = col_attrs.get(col_index)

                out.Print('  <td')
                _PrintAttrs(MergeAttrs(inherited, cell_attrs))
                out.Print('>')

                out.Print(cell_html)
                out.Print('</td>\n')

            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()