doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

541 lines, 272 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	try:
5	from cStringIO import StringIO
6	except ImportError:
7	from io import StringIO
8	import re
9	import sys
10
11	from doctools.util import log
12	from lazylex import html
13
14
def RemoveComments(s):
    """Remove <!-- comments -->

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because some documents
    use them as placeholders that are substituted later.

    Args:
      s: HTML string

    Returns:
      A copy of s with the other comments removed.
    """
    f = StringIO()
    out = html.Output(s, f)

    # Start position of the current token; each token begins where the
    # previous one ended.
    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == html.Comment:
            value = s[pos:end_pos]
            # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
            if 'REPLACE' not in value:
                # Emit everything up to the comment, then jump over it
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
38
39
# Matches a possibly-empty run of whitespace.  Because the pattern also matches
# the empty string, match() succeeds at every position; a caller that needs to
# know whether a span is ALL whitespace must compare the match end with the end
# of that span.
_WHITESPACE_RE = re.compile(r'\s*')
41
42
class UlTableParser(object):
    """Parser for a <ul> that encodes a table ("ul-table").

    It pulls tokens from `lexer` one at a time, and uses `tag_lexer` to look
    inside the current start / end / self-closing tag.

    Usage: call FindUlTable() to position the parser on the <ul> that follows
    a <table>, then ParseTable() to parse it.
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: object with a Read() method returning (tok_id, end_pos), a
            LookAhead(regex) method, and the input string as attribute `s`
          tag_lexer: object with Reset(start_pos, end_pos), TagName(),
            AllAttrsRaw(), and the input string as attribute `s`
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # The current token is the span [start_pos, end_pos) of the input
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        Raises:
          html.ParseError: if the new token is a comment and comment_ok is
            false
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == html.Comment:
            raise html.ParseError('Unexpected HTML comment')

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it is anchored at the start
        of the token only.
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different ID or a
            different tag name
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Skip the current token only if it is RawData consisting ENTIRELY of
        whitespace.  RawData that contains anything else is left in place, so
        the caller reports it instead of silently discarding text.
        """
        if self.tok_id != html.RawData:
            return

        # The pattern matches the empty string too, so a successful match
        # proves nothing by itself; the match must reach the end of the token.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Return the START position of the <ul>, or -1 if the end of the input
        is reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip text between <table> and the next non-text token
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != html.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a start tag.

        Raises:
          html.ParseError: if the start tag is not <li>, or the input ends
            before the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

          - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

          - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(html.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check the loop would never terminate
                raise html.ParseError(
                    'Unexpected end of input; missing </li>')

            if self.tok_id == html.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    # If there are several, the last one wins
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()

            elif self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        # This </li> closes the <li> we started with
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # Skip the closing </li>, which was identified by the loop above
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Parse <li>thead and the nested <ul>.  The outer <ul> start tag has
        already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Grammar:

          THEAD =
            [StartTag 'ul']        # consumed by the caller
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
              [StartTag 'ul']      # Indented bullet that starts -
              LIST_ITEM*
              [RawData \s*]?
              [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

          - thead
            - name [link][]
              - colgroup=foo align=left
            - age
              - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""
        Parse <li>tr, the optional <row-attrs />, and the nested <ul>.

        Returns:
          A pair (tr_attrs, cells), where tr_attrs is the result of
          AllAttrsRaw() on <row-attrs /> or None, and cells is a list of
          (td_attrs, inner_html) pairs.

          (None, None) if the current token (after optional whitespace) is
          not a start tag, e.g. the </ul> that ends the table.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
              [StartTag 'ul']      # Indented bullet that starts -
              LIST_ITEM*           # Defined above
              [RawData \s*]?
              [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r', tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse the ul-table that starts at the current token, which must be
        the <ul> start tag.

        Returns a structure like this

          { 'thead': [ (td_attrs, 'col1 html'), ... ],  # or None
            'tr': [
              # (tr_attrs, cells); cells hold raw HTML that you surround
              # with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': 10,  # position of the <ul> start tag
            'ul_end': 99,    # position after </ul> and trailing whitespace
          }

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?   # this returns the number of cells, so it's NOT context
                     # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is NOT validated against thead, because of
            # colspan
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
418
419
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine attributes inherited from the header with a cell's own.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the column's
        header cell, or None
      row_td_attrs: list of (name, raw_value) pairs from the cell itself, or
        None

    Returns:
      A list of (name, raw_value) pairs.  Header attributes come first; when
      the cell has an attribute with the same name, its value is appended
      after a space.  The remaining cell attributes follow in their original
      order.
    """
    cell_pairs = row_td_attrs or []
    cell_values = dict(cell_pairs)

    result = []
    folded = set()  # names of cell attrs already folded into a header attr

    for name, raw_value in thead_td_attrs or ():
        extra = cell_values.get(name)
        if extra is None:
            result.append((name, raw_value))
        else:
            folded.add(name)
            result.append((name, raw_value + ' %s' % extra))

    result.extend(
        (name, raw_value) for name, raw_value in cell_pairs
        if name not in folded)

    return result
445
446
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is replaced with <thead>
    / <tr> / <th> / <td> markup.  All other text is copied through unchanged.

    Args:
      s: HTML string
      debug_out: accepted for interface compatibility; not used here

    Returns:
      The converted HTML string
    """
    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    parser = UlTableParser(html.Lexer(s), tag_lexer)

    def print_attrs(attrs):
        # Print ' name="value"' for each (name, raw_value) pair
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw_value)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering the attributes of each column
        header = table['thead']
        col_attrs = {}  # integer -> td_attrs
        if header:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:
            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                print_attrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Attributes inherited from the header, plus the cell's own
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                print_attrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
534
535
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, strip comments, convert
    # ul-tables, and write the result to stdout
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))