1 | #!/usr/bin/env python2
|
2 | """ul_table.py: Markdown Tables Without New Syntax."""
|
3 |
|
4 | import cStringIO
|
5 | import re
|
6 |
|
7 | from doctools.util import log
|
8 | from lazylex import html
|
9 |
|
10 |
|
class UlTableParser(object):
    """Recursive descent parser for a table written as nested <ul> lists.

    The parser pulls (tok_id, end_pos) pairs from `lexer`, and uses
    `tag_lexer` to look inside the tag it is currently positioned on.  The
    current token always spans start_pos:end_pos of the document string,
    which is read from tag_lexer.s.
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: iterator of (tok_id, end_pos) pairs, advanced with next()
          tag_lexer: object with the attribute `s` (the document string) and
            the methods Reset(start_pos, end_pos), TagName(), AllAttrsRaw()
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # No token has been read yet
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the text of the current token."""
        return self.tag_lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        Raises:
          StopIteration: if the lexer has no more tokens.  It is propagated
            on purpose, rather than being turned into html.EndOfStream.
        """
        # The new token starts where the previous one ended
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = next(self.lexer)
        if 0:  # change to 1 to trace every token
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        Note that re.match() only anchors at the start of the text.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
            doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, tok_id, s):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          tok_id: the expected token ID
          s: the expected tag name, when tok_id is html.StartTag or
            html.EndTag.  Must be None for every other token ID.

        Raises:
          html.ParseError: if the token ID or the tag name doesn't match
          AssertionError: if s is given for a token that isn't a start or
            end tag
        """
        if self.tok_id != tok_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(tok_id),
                                  html.TokenName(self.tok_id))
        if tok_id in (html.StartTag, html.EndTag):
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if s != tag_name:
                raise html.ParseError('Expected tag %r, got %r', s, tag_name)
        else:
            if s is not None:
                raise AssertionError("Don't know what to do with %r" % s)
        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Skips the current token if it is RawData made only of whitespace.
        """
        if self.tok_id == html.RawData and self._CurrentString().isspace():
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.

        Similar algorithm as html.ReadUntilStartTag()

        Returns:
          The START position of the <ul>, or -1 if the end of the stream is
          reached first.
        """
        tag_lexer = self.tag_lexer

        # A previous parse may have stopped on EndOfStream.  Don't advance
        # again, because _Next() propagates StopIteration.
        if self.tok_id == html.EndOfStream:
            return -1

        # Find first table
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip the text between <table> and the next non-text token
                while True:
                    self._Next()
                    if self.tok_id != html.RawData:
                        break

                # Nothing after <table>.  Return here instead of advancing
                # past the end of the stream in the outer loop.
                if self.tok_id == html.EndOfStream:
                    return -1

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        """Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current
          token, after optional whitespace, isn't a start tag.

          td_attrs is the result of tag_lexer.AllAttrsRaw() on the
          <td-attrs /> tag, or None if there is no such tag.  inner_html is
          the raw text between the <li> (and <td-attrs />, if present) and
          the matching </li>.

        Raises:
          html.ParseError: if the start tag isn't <li>, if a self-closing
            tag other than <td-attrs /> directly follows the <li>, or if the
            input ends before the matching </li>

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          [StartEndTag 'td-attrs']?
          ANY*               # NOT context-free - anything that's not the end
                             # This is what we should capture in CELLS
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <td-attrs class=foo /> hi there ==>
        <li><td-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'td-attrs':
                raise html.ParseError('Expected <td-attrs />, got %r' %
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>.  Cells can contain bulleted lists, so count
        # nested <li> ... </li> pairs:
        #   <li> <li>foo</li> </li>
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Fail with a parse error, rather than advancing past the end
                # of the stream
                raise html.ParseError(
                    'Expected </li> to close the list item, got %s' %
                    html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            if self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        # This </li> closes the <li> we ate above
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]
        #log('RAW inner html %r', inner_html)

        # The loop above already checked that this is </li>
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        """
        Parse the <li>thead item and its nested <ul>.  The caller has already
        consumed the outer <ul> start tag.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell, as
          returned by _ListItem()

        Raises:
          html.ParseError: if the tokens don't match the grammar

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s*]
            [StartTag 'ul']   # Indented bullet that starts -
              LIST_ITEM*
              [RawData \s*]?
            [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute extensions
        to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s*'.
        self._EatRawData(r'thead\s*')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:  # no more <li>
                break
            cells.append((td_attrs, inner_html))
            self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        """
        Parse one <li>tr item and its nested <ul>.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the current token,
          after optional whitespace, isn't a start tag.

          tr_attrs is the result of tag_lexer.AllAttrsRaw() on the
          <tr-attrs /> tag, or None if there is no such tag.  cells is a list
          of (td_attrs, inner_html) pairs, as returned by _ListItem().

        Raises:
          html.ParseError: if the tokens don't match the grammar, including
            a self-closing tag other than <tr-attrs /> after 'tr'

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'tr-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
              LIST_ITEM*      # Defined above
              [RawData \s*]?
            [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            # Validate the tag name, like _ListItem() does for <td-attrs />.
            # Otherwise the attributes of any self-closing tag would end up
            # on the <tr>.
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'tr-attrs':
                raise html.ParseError('Expected <tr-attrs />, got %r' %
                                      tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:  # no more <li>
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse a whole table.  The current token must be the outer <ul> start
        tag, which is where FindUlTable() stops.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': start position of the outer <ul>,
            'ul_end': start position of the first token after the outer </ul>
                      and the optional whitespace that follows it
          }

        Raises:
          html.ParseError: if the tokens don't match the grammar

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
            THEAD   # this returns the number of cells, so it's NOT context
                    # free
            TR*
          [EndTag 'ul']
          [RawData \s*]?
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        thead = self._ParseTHead()
        #log('___ THEAD %s', thead)

        num_cells = len(thead)
        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:  # no more rows
                break
            # Not validating because of colspan
            if 0:
                if len(tr) != num_cells:
                    raise html.ParseError('Expected %d cells, got %d: %s',
                                          num_cells, len(tr), tr)

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:  # change to 1 to dump the parsed table
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
|
368 |
|
369 |
|
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine the attributes a cell inherits from its column with its own.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell
        of the column, or None
      row_td_attrs: list of (name, raw_value) pairs from the cell itself, or
        None

    Returns:
      A new list of (name, raw_value) pairs.  The header attributes come
      first, in their original order.  When the cell has an attribute with
      the same name, its value is appended to the header value after a
      space.  The cell attributes that weren't combined this way follow, in
      their original order.
    """
    # For a repeated name, the last value in the cell wins
    row_values = {} if row_td_attrs is None else dict(row_td_attrs)

    result = []
    combined = set()  # names already emitted together with a header value

    for attr_name, header_value in thead_td_attrs or ():
        extra = row_values.get(attr_name)
        if extra is None:
            result.append((attr_name, header_value))
        else:
            combined.add(attr_name)
            result.append((attr_name, header_value + ' %s' % extra))

    for attr_name, cell_value in row_td_attrs or ():
        if attr_name not in combined:
            result.append((attr_name, cell_value))

    return result
|
395 |
|
396 |
|
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Each <ul> that FindUlTable() locates after a <table> tag is parsed, and
    replaced in the output by <thead> and <tr> markup.  The rest of the
    document is passed through html.Output.

    Args:
      s: the HTML document, as a string
      debug_out: optional list; nothing is appended to it here

    Returns:
      The contents of the output buffer, as a string
    """
    if debug_out is None:
        debug_out = []

    buf = cStringIO.StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    tokens = html.ValidTokens(s)

    parser = UlTableParser(tokens, tag_lexer)

    def print_attrs(attrs):
        """Print each (name, raw_value) pair as: name="raw_value"."""
        for attr_name, raw_value in attrs or ():
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw_value)

    ul_start = parser.FindUlTable()
    while ul_start != -1:
        #log('UL START %d', ul_start)
        out.PrintUntil(ul_start)

        table = parser.ParseTable()
        #log('UL END %d', ul_end)

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        header_cells = table['thead']

        # column index -> td_attrs, for the header cells that have attributes.
        # The cells of each row inherit them.
        col_attrs = {}
        for col, (td_attrs, _) in enumerate(header_cells):
            if td_attrs:
                col_attrs[col] = td_attrs

        # Write the header
        out.Print('<thead>\n')
        out.Print('<tr>\n')
        for _, header_html in header_cells:
            # <th> tag is more semantic, and styled bold by default
            out.Print('  <th>')
            out.Print(header_html)
            out.Print('</th>\n')
        out.Print('</tr>\n')
        out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:
            # Print tr tag and attrs
            out.Print('<tr')
            print_attrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col, (row_td_attrs, cell_html) in enumerate(cells):
                # Inherited from header
                inherited = col_attrs.get(col)

                out.Print('  <td')
                print_attrs(MergeAttrs(inherited, row_td_attrs))
                out.Print('>')

                out.Print(cell_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

        ul_start = parser.FindUlTable()

    out.PrintTheRest()

    return buf.getvalue()
|