doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oilshell.org

496 lines, 247 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	try:
5	from cStringIO import StringIO
6	except ImportError:
7	from io import StringIO
8	import re
9
10	from doctools.util import log
11	from lazylex import html
12
# Matches a run of zero or more whitespace characters.  Note that it also
# matches the empty string, so a successful match() alone does not prove that
# any whitespace was present.
_WHITESPACE_RE = re.compile(r'\s*')
14
15
class UlTableParser(object):
    """Parser for tables that are written as nested <ul> lists.

    It pulls tokens from `lexer` one at a time, and uses `tag_lexer` to look
    inside the current tag token (tag name, raw attributes).

    State:
        tok_id: ID of the current token (html.Invalid before the first read)
        start_pos, end_pos: span of the current token in lexer.s
    """

    def __init__(self, lexer, tag_lexer):
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the text of the current token."""
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        # The new token starts where the previous one ended
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if 0:  # flip to 1 to trace tokens
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

        #self.tok_id = html.EndOfStream
        # Don't change self.end_pos

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is only anchored at the START of the token (re.match), so
        trailing text after the match is accepted.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
            doesn't match.
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different ID or a
            different tag name.
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Skip the current token only if it is RawData consisting ENTIRELY of
        whitespace.  Anything else is left in place for the caller to handle.
        """
        if self.tok_id != html.RawData:
            return

        # r'\s*' matches the empty string, so "did it match?" is always true.
        # What we need to know is whether the match covers the whole token.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Return the START position of the <ul>, or -1 if the end of the stream
        is reached first.  On success, the <ul> start tag is the current
        token.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip text between <table> and the next non-text token
                while True:
                    self._Next()
                    if self.tok_id != html.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos
        return -1

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a StartTag.

          td_attrs is the result of tag_lexer.AllAttrsRaw() on the
          <cell-attrs /> tag, or None if there is no such tag.
          inner_html is the raw text between that point and the matching
          </li>.

        Raises:
          html.ParseError: if the start tag isn't <li>, if a self-closing tag
            other than <cell-attrs /> immediately follows <li>, or if the
            stream ends before the matching </li>.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'cell-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            # TODO: remove td-attrs backward compat
            if tag_name not in ('td-attrs', 'cell-attrs'):
                raise html.ParseError('Expected <cell-attrs />, got %r',
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, an unterminated <li> would make us call
                # _Next() forever.
                raise html.ParseError('Expected closing </li>, got %s',
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            if self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    # Negative means this </li> closes OUR <li>, which was
                    # consumed before the count started
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]
        #log('RAW inner html %r', inner_html)

        # Consume the </li>; the loop above already checked what it is
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Parse the <li>thead item and its nested <ul>.  The outer <ul> start
        tag has already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # zero items are accepted by this parser
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul>.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the current token
          (after optional whitespace) is not a StartTag, e.g. it's the </ul>
          that ends the table.

          tr_attrs is the result of tag_lexer.AllAttrsRaw() on the
          <row-attrs /> tag, or None if there is no such tag.
          cells is a list of (td_attrs, inner_html) pairs.

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag row-attrs] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above; zero items are accepted
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r', tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse a whole ul-table.  The current token must be the <ul> start
        tag, e.g. right after FindUlTable() succeeded.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None if there's no thead item
            'tr': [  # one (tr_attrs, cells) pair per row
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': position of the opening <ul>,
            'ul_end': position after the closing </ul> and any whitespace-only
                      text that follows it,
          }

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s',
                                          len(thead), len(tr), tr)

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
381
382
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine the attributes a cell inherits from its column with its own.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell,
        or None
      row_td_attrs: list of (name, raw_value) pairs from the row cell, or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      order; when the row cell has an attribute of the same name, its value is
      appended to the header's value after a space.  Row attributes that
      weren't combined this way follow, in order.
    """
    if row_td_attrs is None:
        row_values = {}
    else:
        row_values = dict((k, v) for k, v in row_td_attrs)

    combined = []
    consumed = set()  # row attribute names already folded into a header attr

    for attr_name, header_value in thead_td_attrs or []:
        extra = row_values.get(attr_name)
        if extra is None:
            combined.append((attr_name, header_value))
            continue
        consumed.add(attr_name)
        combined.append((attr_name, header_value + ' %s' % extra))

    if row_td_attrs:
        combined.extend([(attr_name, row_value)
                         for attr_name, row_value in row_td_attrs
                         if attr_name not in consumed])

    return combined
408
409
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <table> that is directly followed by a <ul> has that list parsed
    with UlTableParser and replaced by <thead> / <tr> / <td> markup.  All
    other text is copied through unchanged.

    Args:
      s: the HTML string to transform
      debug_out: optional list; not used by the code below

    Returns:
      The transformed HTML string
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    parser = UlTableParser(html.Lexer(s), tag_lexer)

    def _PrintAttrs(attrs):
        """Write each (name, raw_value) pair as  name="raw_value"."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    ul_start = parser.FindUlTable()
    while ul_start != -1:
        # Copy everything before the list through unchanged
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering the attributes each column declares
        header_cells = table['thead']
        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Cells inherit attributes from the header cell of the same
                # column
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

        ul_start = parser.FindUlTable()

    out.PrintTheRest()

    return buf.getvalue()