doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oilshell.org

502 lines, 250 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	try:
5	from cStringIO import StringIO
6	except ImportError:
7	from io import StringIO
8	import re
9	import sys
10
11	from doctools.util import log
12	from lazylex import html
13
# Matches a (possibly empty) run of whitespace.  Because the pattern is r'\s*',
# .match() succeeds at ANY position (with an empty match), so a truthy match
# object alone does not prove that the text is whitespace.
_WHITESPACE_RE = re.compile(r'\s*')
15
16
class UlTableParser(object):
    """Parser for "ul-tables": HTML tables written as nested bulleted lists.

    Tokens are pulled one at a time from `lexer` (via lexer.Read()), and the
    current token is kept in (tok_id, start_pos, end_pos).  `tag_lexer` is
    pointed at the current token whenever a tag name or its attributes have
    to be inspected.

    Typical use (see ReplaceTables): call FindUlTable() to position the
    parser on a <ul> that directly follows a <table>, then ParseTable().
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: token source; must provide Read(), LookAhead() and the
                 input string as .s
          tag_lexer: must provide Reset(), TagName(), AllAttrsRaw() and the
                     input string as .s
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # Current token: its ID and its [start_pos, end_pos) span in the input
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _CurrentTagName(self):
        """Point the tag lexer at the current token and return its tag name.

        Only call this when the current token is a tag.  The tag lexer is
        left positioned on the token, so AllAttrsRaw() may be called next.
        """
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        return self.tag_lexer.TagName()

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        # The new token starts where the previous one ended
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it only has to match a
        PREFIX of the text.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
                           doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different ID or a
                           different tag name
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        tag_name = self._CurrentTagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Skips the current token only if it is RawData consisting ENTIRELY of
        whitespace.  The pattern r'\\s*' matches the empty string anywhere,
        so the match must be checked to span the whole token; otherwise
        arbitrary text would be skipped (and text following the table's
        closing </ul> would end up inside the replaced region).
        """
        if self.tok_id != html.RawData:
            return
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Return the START position of the <ul>, or -1 if the end of the input
        is reached first.  On success, the <ul> is the current token.

        Similar algorithm as html.ReadUntilStartTag()
        """
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            if not (self.tok_id == html.StartTag and
                    self._CurrentTagName() == 'table'):
                continue

            # Found <table>: skip the text between it and the next tag
            while True:
                self._Next()
                if self.tok_id != html.RawData:
                    break

            if (self.tok_id == html.StartTag and
                    self._CurrentTagName() == 'ul'):
                return self.start_pos
            # Otherwise it's an ordinary table; keep scanning

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) isn't a start tag.

        Raises:
          html.ParseError: on an unexpected tag, or if the input ends before
                           the closing </li>

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          [StartEndTag 'cell-attrs']?
          ANY*               # NOT context-free - anything that's not the end
                             # This is what we should capture in CELLS
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            tag_name = self._CurrentTagName()
            # TODO: remove td-attrs backward compat
            if tag_name not in ('td-attrs', 'cell-attrs'):
                raise html.ParseError('Expected <cell-attrs />, got %r',
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, unbalanced input would loop forever
                raise html.ParseError('Expected closing </li>, got %s',
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                if self._CurrentTagName() == 'li':
                    balance += 1
            elif self.tok_id == html.EndTag:
                if self._CurrentTagName() == 'li':
                    balance -= 1
                    if balance < 0:  # this </li> closes OUR <li>
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        self._Next()  # move past the closing </li>

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Parse the <li>thead item and its nested <ul> of column headers.

        The enclosing [StartTag 'ul'] has already been consumed by
        ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s+]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # Defined above; zero items are accepted
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.  Returns (None, None) if the current
          token (after optional whitespace) isn't a start tag, e.g. when
          we're at the </ul> that ends the table.

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag row-attrs] [RawData \s*]? )?
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # Defined above; zero items are accepted
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            tag_name = self._CurrentTagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r', tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        r"""
        Parse a whole ul-table.  The current token must be its opening <ul>.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None if there's no thead item
            'tr': [  # one (tr_attrs, cells) pair per row; the cell HTML is
                     # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': position of the opening <ul>,
            'ul_end': position after the closing </ul> and any whitespace
                      that follows it
          }

        Raises:
          html.ParseError: if the list doesn't have the expected shape

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD?   # this returns the number of cells, so it's NOT context
                   # free
          TR*
          [EndTag 'ul']
          [RawData \s*]?
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is NOT validated against the thead,
            # because of colspan
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
382
383
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's attributes (from thead) with a cell's own.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs, or None
      row_td_attrs: list of (name, raw_value) pairs, or None

    Returns:
      A new list of (name, raw_value) pairs.  The thead attributes come
      first; when the cell has an attribute with the same name, its value is
      appended after a space.  The cell's remaining attributes follow, in
      their original order.
    """
    cell_values = dict(row_td_attrs) if row_td_attrs is not None else {}

    result = []
    shared_names = set()  # names already emitted via the thead attributes

    for attr_name, value in thead_td_attrs or ():
        extra = cell_values.get(attr_name)
        if extra is not None:
            value = value + ' %s' % extra
            shared_names.add(attr_name)
        result.append((attr_name, value))

    for attr_name, value in row_td_attrs or ():
        if attr_name not in shared_names:
            result.append((attr_name, value))

    return result
409
410
def _PrintAttrs(out, attrs):
    """Print ' name="value"' for each (name, raw_value) pair in attrs.

    attrs may be None or empty, in which case nothing is printed.
    """
    for name, raw_value in attrs or ():
        out.Print(' ')
        out.Print(name)
        # The value is printed raw, without escaping; per the original
        # author's note it can't contain quotes.
        out.Print('="%s"' % raw_value)


def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> that directly follows a <table> start tag is parsed with
    UlTableParser and replaced by <thead>/<tr>/<th>/<td> markup.  All other
    text is copied through unchanged.

    Args:
      s: HTML string
      debug_out: optional list; not used by the code visible here

    Returns:
      The rewritten HTML string
    """
    if debug_out is None:
        debug_out = []

    lexer = html.Lexer(s)
    tag_lexer = html.TagLexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    buf = StringIO()
    out = html.Output(s, buf)

    # FindUlTable() returns -1 once there are no more ul-tables
    for ul_start in iter(parser.FindUlTable, -1):
        # Copy everything before the <ul> as is
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the original list (up to and including its closing
        # </ul>), but write everything after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering each column's attributes
        col_attrs = {}  # integer -> td_attrs
        header_cells = table['thead']
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:
            out.Print('<tr')
            _PrintAttrs(out, tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Attributes inherited from the header, plus the cell's own
                cell_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(out, cell_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
498
499
if __name__ == '__main__':
    # Simple CLI filter: read HTML from stdin, write the HTML with ul-tables
    # replaced to stdout
    html_in = sys.stdin.read()
    html_out = ReplaceTables(html_in)
    sys.stdout.write(html_out)