doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

509 lines, 253 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	try:
5	from cStringIO import StringIO
6	except ImportError:
7	from io import StringIO
8	import re
9	import sys
10
11	from doctools.util import log
12	from lazylex import html
13
14	_WHITESPACE_RE = re.compile(r'\s*')
15
16
17	class UlTableParser(object):
18
def __init__(self, lexer, tag_lexer):
    """Set up a parser over a token lexer and a tag lexer.

    Args:
      lexer: token source; this class calls its Read() and LookAhead()
        methods and reads its .s attribute (the string being lexed).
      tag_lexer: helper that is Reset() to a token's span and then queried
        with TagName() / AllAttrsRaw().
    """
    # Span of the current token as [start_pos, end_pos); empty until the
    # first call to _Next().
    self.start_pos = 0
    self.end_pos = 0

    # ID of the current token; nothing has been read yet.
    self.tok_id = html.Invalid

    self.lexer = lexer
    self.tag_lexer = tag_lexer
26
27	def _CurrentString(self):
28	part = self.lexer.s[self.start_pos:self.end_pos]
29	return part
30
31	def _Next(self):
32	"""
33	Advance and set self.tok_id, self.start_pos, self.end_pos
34	"""
35	self.start_pos = self.end_pos
36	self.tok_id, self.end_pos = self.lexer.Read()
37	if 0:
38	part = self._CurrentString()
39	log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)
40
41	#self.tok_id = html.EndOfStream
42	# Don't change self.end_pos
43
def _EatRawData(self, regex):
    # type: (str) -> None
    """Require a RawData token whose text matches a regex, then advance.

    Args:
      regex: pattern passed to re.match(), so it is anchored at the start
        of the token text but not at its end.

    Raises:
      html.ParseError: if the current token is not RawData, or its text
        does not match.
    """
    if self.tok_id != html.RawData:
        raise html.ParseError('Expected RawData, got %s',
                              html.TokenName(self.tok_id))

    text = self._CurrentString()
    if re.match(regex, text) is None:  # could compile this
        raise html.ParseError('Expected to match %r, got %r', regex, text)

    self._Next()
58
def _Eat(self, expected_id, expected_tag):
    """Require a specific start or end tag as the current token, then advance.

    Args:
      expected_id: html.StartTag or html.EndTag
      expected_tag: tag name such as 'a' or 'span'

    Raises:
      html.ParseError: if the token kind or the tag name differs from what
        was expected.
    """
    allowed_ids = (html.StartTag, html.EndTag)
    assert expected_id in allowed_ids, html.TokenName(expected_id)

    actual_id = self.tok_id
    if actual_id != expected_id:
        raise html.ParseError('Expected token %s, got %s',
                              html.TokenName(expected_id),
                              html.TokenName(actual_id))

    # Point the tag lexer at the current token to read its name.
    tags = self.tag_lexer
    tags.Reset(self.start_pos, self.end_pos)
    actual_tag = tags.TagName()
    if expected_tag != actual_tag:
        raise html.ParseError('Expected tag %r, got %r', expected_tag,
                              actual_tag)

    self._Next()
81
def _WhitespaceOk(self):
    """Skip the current token if it is RawData made up only of whitespace.

    This implements the optional `[RawData \\s*]?` element of the grammars
    in this class.  A RawData token containing anything other than
    whitespace is left in place, so the caller's next check sees it and
    can report it instead of silently discarding text.
    """
    if self.tok_id != html.RawData:
        return

    # _WHITESPACE_RE is \s*, which also matches the empty string, so merely
    # testing that a match exists is always true.  Bound the match to the
    # token's span and require it to cover the whole token.
    m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
    if m is not None and m.end() == self.end_pos:
        self._Next()
89
def FindUlTable(self):
    """Scan forward for a <table ...> start tag followed by a <ul>.

    Only RawData tokens may sit between the <table> and the <ul>; if
    anything else follows the <table>, the scan resumes with the next token.

    Returns:
      The START position of the <ul> token, or -1 if the end of the stream
      is reached first.

    Similar algorithm as html.ReadUntilStartTag()
    """
    tags = self.tag_lexer

    while True:
        self._Next()
        if self.tok_id == html.EndOfStream:
            return -1

        tags.Reset(self.start_pos, self.end_pos)
        if self.tok_id != html.StartTag or tags.TagName() != 'table':
            continue  # not a <table>; keep scanning

        # Step past the <table> and any RawData tokens that follow it.
        self._Next()
        while self.tok_id == html.RawData:
            self._Next()

        tags.Reset(self.start_pos, self.end_pos)
        if self.tok_id == html.StartTag and tags.TagName() == 'ul':
            return self.start_pos
117
def _ListItem(self):
    """Parse a list item nested below thead or tr.

    Returns:
      A pair (td_attrs, inner_html).  Both are None when the current token
      (after optional whitespace) is not a start tag, i.e. there are no
      more items.  td_attrs is None when the item has no <cell-attrs />
      tag; otherwise it is whatever self.tag_lexer.AllAttrsRaw() returns
      for that tag.  If several such tags appear, the last one wins.

    Raises:
      html.ParseError: if the first tag is not <li>, or the input ends
        before the matching </li>.

    Grammar:

      LIST_ITEM =
        [RawData \\s*]?
        [StartTag 'li']
        ANY*               # NOT context-free:
                           # - we MATCH <li> and </li> with a counter
                           # - We search for [StartEndTag 'cell-attrs']?
        [EndTag 'li']

    Example of attribute borrowing:

      - hi there ==>
      <li>hi there</li> ==>
      <td>hi there</td>

      - <cell-attrs class=foo /> hi there ==>
      <li><cell-attrs class=foo /> hi there </li> ==>
      <td class=foo> hi there </td>
    """
    self._WhitespaceOk()

    if self.tok_id != html.StartTag:
        return None, None

    td_attrs = None  # Can we also have col-attrs?
    td_attrs_span = None

    self._Eat(html.StartTag, 'li')

    # Start of the cell contents, just after the opening <li>
    left = self.start_pos

    # Find the closing </li>, taking into account NESTED tags:
    #    <li> <li>foo</li> </li>
    # because cells can have bulleted lists
    balance = 0
    while True:
        if self.tok_id == html.EndOfStream:
            # The input ended inside the item.  Without this check the
            # loop has no other exit once the tokens run out.
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(html.EndTag),
                                  html.TokenName(self.tok_id))

        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            # TODO: remove td-attrs backward compat
            if tag_name in ('td-attrs', 'cell-attrs'):
                td_attrs_span = self.start_pos, self.end_pos
                td_attrs = self.tag_lexer.AllAttrsRaw()
                #log('CELL ATTRS %r', self._CurrentString())

        elif self.tok_id == html.StartTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            if self.tag_lexer.TagName() == 'li':
                balance += 1

        elif self.tok_id == html.EndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            if self.tag_lexer.TagName() == 'li':
                balance -= 1
                if balance < 0:
                    # This </li> closes the item we opened above.
                    break
        self._Next()

    right = self.start_pos  # start of the end tag

    s = self.tag_lexer.s
    if td_attrs_span:
        # everything except the <cell-attrs />
        inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        #log('LEFT %r', s[left:td_attrs_span[0]])
        #log('RIGHT %r', s[td_attrs_span[1]:right])
    else:
        inner_html = s[left:right]
    #log('RAW inner html %r', inner_html)

    # The loop only breaks on the matching </li>, so step over it without
    # re-checking.
    #self._Eat(html.EndTag, 'li')
    self._Next()

    return td_attrs, inner_html
200
def _ParseTHead(self):
    """Parse the <li>thead ...</li> item and its nested <ul> of header cells.

    The outer <ul> start tag is not consumed here; parsing starts at the
    optional whitespace before the 'thead' <li>.

    Returns:
      A list of (td_attrs, inner_html) pairs, one per header cell, as
      produced by _ListItem().

    Raises:
      html.ParseError: if the tokens do not follow the grammar below.

    Grammar:

      THEAD =
        [RawData \\s*]?
        [StartTag 'li']
        [RawData thead\\s+]
        [StartTag 'ul']   # Indented bullet that starts -
        LIST_ITEM*
        [RawData \\s*]?
        [EndTag 'ul']
        [RawData \\s*]?
        [EndTag 'li']

    Two Algorithms:

    1. Replacement:
       - skip over the first ul 'thead' li, and ul 'tr' li
       - then replace the next ul -> tr, and li -> td
    2. Parsing and Rendering:
       - parse them into a structure
       - skip all the text
       - print your own HTML

    I think the second one is better, because it allows attribute
    extensions to thead

    - thead
      - name [link][]
        - colgroup=foo align=left
      - age
        - colgroup=foo align=right
    """
    #log('*** _ParseTHead')
    self._WhitespaceOk()
    self._Eat(html.StartTag, 'li')

    # In CommonMark, r'thead\n' is enough, because it strips trailing
    # whitespace.  I'm not sure if other Markdown processors do that, so
    # use r'thead\s+'.
    self._EatRawData(r'thead\s+')

    # The nested list holds the header cells
    self._Eat(html.StartTag, 'ul')

    cells = []
    item = self._ListItem()
    while item[1] is not None:  # inner_html is None once items run out
        cells.append(item)
        item = self._ListItem()
    self._WhitespaceOk()

    self._Eat(html.EndTag, 'ul')

    self._WhitespaceOk()
    self._Eat(html.EndTag, 'li')

    #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
    return cells
267
def _ParseTr(self):
    """Parse one <li>tr ...</li> item and its nested <ul> of cells.

    Returns:
      A pair (tr_attrs, cells).  Both are None when the current token
      (after optional whitespace) is not a start tag, e.g. it is the
      </ul> that closes the table.  Otherwise tr_attrs is None or the
      result of AllAttrsRaw() on the <row-attrs /> tag, and cells is a
      list of (td_attrs, inner_html) pairs from _ListItem().

    Raises:
      html.ParseError: if the tokens do not follow the grammar below.

    Grammar:

      TR =
        [RawData \\s*]?
        [StartTag 'li']
        [RawData tr\\s*]
        ( [StartEndTag 'row-attrs'] [RawData \\s*]? )?
        [StartTag 'ul']   # Indented bullet that starts -
        LIST_ITEM*        # Defined above
        [RawData \\s*]?
        [EndTag 'ul']
        [RawData \\s*]?
        [EndTag 'li']
    """
    #log('*** _ParseTr')

    cells = []

    self._WhitespaceOk()

    # Could be a </ul>
    if self.tok_id != html.StartTag:
        return None, None

    self._Eat(html.StartTag, 'li')

    self._EatRawData(r'tr\s*')

    tr_attrs = None
    if self.tok_id == html.StartEndTag:
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if tag_name != 'row-attrs':
            # Pass the format string and its argument separately, like
            # every other ParseError raised in this class.
            raise html.ParseError('Expected row-attrs, got %r', tag_name)
        tr_attrs = self.tag_lexer.AllAttrsRaw()
        self._Next()
        self._WhitespaceOk()

    # This is the row data
    self._Eat(html.StartTag, 'ul')

    while True:
        td_attrs, inner_html = self._ListItem()
        if inner_html is None:
            break
        cells.append((td_attrs, inner_html))
    # TODO: assert

    self._WhitespaceOk()

    self._Eat(html.EndTag, 'ul')

    self._WhitespaceOk()
    self._Eat(html.EndTag, 'li')

    #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
    return tr_attrs, cells
328
def ParseTable(self):
    """Parse a whole ul-table, starting at its opening <ul> token.

    Returns:
      A dict like this:

      { 'thead': [ (td_attrs, 'col1 html'), ... ],   # or None if absent
        'tr': [  # one (tr_attrs, cells) pair per row
          (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
        ],
        'ul_start': <start position of the opening <ul>>,
        'ul_end': <start position of the token that follows the closing
                   </ul> and the optional whitespace skipped after it>,
      }

    Raises:
      html.ParseError: if the tokens do not follow the grammar below.

    Grammar:

      UL_TABLE =
        [StartTag 'ul']
        THEAD?   # this returns the number of cells, so it's NOT context
                 # free
        TR*
        [EndTag 'ul']
    """
    ul_start = self.start_pos
    self._Eat(html.StartTag, 'ul')

    # Look ahead 2 or 3 tokens to see whether a header row is present
    thead = None
    if self.lexer.LookAhead(r'\s*<li>thead\s+'):
        thead = self._ParseTHead()
    #log('___ THEAD %s', thead)

    rows = []
    while True:
        tr_attrs, tr = self._ParseTr()
        if tr is None:
            break
        # Not validating because of colspan
        if 0:
            if thead and len(tr) != len(thead):
                raise html.ParseError('Expected %d cells, got %d: %s',
                                      len(thead), len(tr), tr)

        #log('___ TR %s', tr)
        rows.append((tr_attrs, tr))

    self._Eat(html.EndTag, 'ul')

    self._WhitespaceOk()

    ul_end = self.start_pos

    table = {
        'tr': rows,
        'thead': thead,
        'ul_start': ul_start,
        'ul_end': ul_end,
    }

    if 0:
        log('table %s', table)
        from pprint import pprint
        pprint(table)

    return table
389
390
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with one cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell
        of this column, or None.
      row_td_attrs: list of (name, raw_value) pairs from the cell itself,
        or None.

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first,
      in order; when the cell has an attribute with the same name, its
      value is appended to the header value after a single space.  Cell
      attributes that were not merged that way follow, in order.
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # name -> value for the cell; the last occurrence of a name wins
    cell_values = dict(cell_pairs)

    result = []
    merged_names = set()
    for name, value in header_pairs:
        extra = cell_values.get(name)
        if extra is not None:
            value = value + ' %s' % extra
            merged_names.add(name)
        result.append((name, value))

    result.extend((name, value) for name, value in cell_pairs
                  if name not in merged_names)
    return result
416
417
def ReplaceTables(s, debug_out=None):
    """ul-table: Write tables using bulleted lists.

    Finds each <table> that is followed by a <ul> and replaces that list
    with <thead> / <tr> / <th> / <td> markup.  All other text is passed
    through html.Output.

    Args:
      s: HTML string to transform.
      debug_out: optional list; defaults to a fresh list.  It is not
        otherwise used in this function.

    Returns:
      The transformed HTML string.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)

    parser = UlTableParser(lexer, tag_lexer)

    def _PrintAttrs(attrs):
        """Print each (name, raw_value) pair as ' name="raw_value"'."""
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw_value)

    # FindUlTable() returns -1 once there are no more tables.
    for ul_start in iter(parser.FindUlTable, -1):
        #log('UL START %d', ul_start)
        out.PrintUntil(ul_start)

        table = parser.ParseTable()
        #log('UL END %d', ul_end)

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Write the header
        header = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if header:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Attributes inherited from the header cell of this column
                inherited = col_attrs.get(col)

                out.Print(' <td')
                _PrintAttrs(MergeAttrs(inherited, row_td_attrs))
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
505
506
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, write the converted HTML to
    # stdout
    html_in = sys.stdin.read()
    sys.stdout.write(ReplaceTables(html_in))