doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oilshell.org

493 lines, 244 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	import cStringIO
5	import re
6
7	from doctools.util import log
8	from lazylex import html
9
10	_WHITESPACE_RE = re.compile(r'\s*')
11
12
class UlTableParser(object):
    r"""Parser for a bulleted list (<ul>) that directly follows a <table> tag.

    Tokens are pulled from `lexer` one at a time.  The current token is
    described by (self.tok_id, self.start_pos, self.end_pos), where the two
    positions index into lexer.s.  `tag_lexer` is pointed at the current token
    whenever a tag name or its attributes are needed.

    Typical use, as done by ReplaceTables():

        p = UlTableParser(lexer, tag_lexer)
        ul_start = p.FindUlTable()   # -1 when there are no more tables
        table = p.ParseTable()
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: token source; this class uses lexer.s, lexer.Read() and
                 lexer.LookAhead()
          tag_lexer: tag inspector; this class uses Reset(), TagName(),
                     AllAttrsRaw() and tag_lexer.s
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # No token has been read yet
        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        # The new token starts where the previous one ended
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if 0:  # change to 1 to trace every token
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _CurrentTagName(self):
        """Point the tag lexer at the current token and return its tag name.

        Callers check self.tok_id first, so this is only used on tag tokens.
        The tag lexer stays positioned on this token, so AllAttrsRaw() may be
        called right afterward.
        """
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        return self.tag_lexer.TagName()

    def _AtStartTag(self, tag_name):
        """Return True if the current token is a start tag named tag_name."""
        return (self.tok_id == html.StartTag and
                self._CurrentTagName() == tag_name)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it only has to match a
        PREFIX of the text.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
                           doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different ID or tag name
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        tag_name = self._CurrentTagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Skips the current token only if it is RawData made up ENTIRELY of
        whitespace.  Any other token is left in place for the caller to
        handle.
        """
        if self.tok_id != html.RawData:
            return

        # _WHITESPACE_RE is r'\s*', which matches the empty string anywhere.
        # So a successful match alone says nothing; the match has to span the
        # whole token for it to be whitespace.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.

        Return the START position of the <ul>, or -1 if the input ends first.
        On success the <ul> is the current token, which is what ParseTable()
        expects.

        Similar algorithm as html.ReadUntilStartTag()
        """
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            # 'while' rather than 'if': the token that ends the RawData run
            # below may itself be another <table>, and must not be skipped.
            while self._AtStartTag('table'):
                self._Next()
                while self.tok_id == html.RawData:
                    self._Next()

                if self._AtStartTag('ul'):
                    return self.start_pos
                if self.tok_id == html.EndOfStream:
                    return -1

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if the current token
          (after optional whitespace) is not a start tag.

          td_attrs is the result of AllAttrsRaw() on <cell-attrs />, or None
          if that tag is absent.  inner_html is the raw text between the
          optional <cell-attrs /> and the matching </li>.

        Raises:
          html.ParseError: on an unexpected tag, or if the input ends before
                           the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'cell-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

          - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

          - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            tag_name = self._CurrentTagName()
            # TODO: remove td-attrs backward compat
            if tag_name not in ('td-attrs', 'cell-attrs'):
                raise html.ParseError('Expected <cell-attrs />, got %r',
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check an unclosed <li> would loop forever,
                # because the token never changes after the end of input.
                raise html.ParseError('Expected </li> to close cell, got %s',
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                if self._CurrentTagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                if self._CurrentTagName() == 'li':
                    balance -= 1
                    if balance < 0:  # this </li> closes the cell itself
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        # Skip the closing </li>, which was already checked above
        self._Next()

        return td_attrs, inner_html

    def _ListItems(self):
        """Parse consecutive LIST_ITEMs, plus optional trailing whitespace.

        Returns:
          A list of (td_attrs, inner_html) pairs, possibly empty
        """
        cells = []
        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()
        return cells

    def _ParseTHead(self):
        r"""
        Parse the <li>thead item and its nested <ul>.  The enclosing <ul> has
        already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
              [StartTag 'ul']   # Indented bullet that starts -
                LIST_ITEM*
              [RawData \s*]?
              [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

          - thead
            - name [link][]
              - colgroup=foo align=left
            - age
              - colgroup=foo align=right
        """
        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')
        cells = self._ListItems()
        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul>.

        Returns:
          A pair (tr_attrs, cells), or (None, None) if the current token
          (after optional whitespace) is not a start tag, e.g. it's the </ul>
          that ends the table.

          tr_attrs is the result of AllAttrsRaw() on <row-attrs />, or None if
          that tag is absent.  cells is a list of (td_attrs, inner_html).

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag row-attrs] [RawData \s*] )?
              [StartTag 'ul']   # Indented bullet that starts -
                LIST_ITEM*      # Defined above
              [RawData \s*]?
              [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            tag_name = self._CurrentTagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r', tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')
        cells = self._ListItems()
        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse the whole list.  The current token must be the <ul> start tag,
        which is the state FindUlTable() leaves behind.

        Returns a structure like this

          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': 10,  # position where the <ul> starts
            'ul_end': 99,    # position after </ul> and trailing whitespace
          }

        'thead' is None when the list has no thead item.  Any of the attrs
        may be None.

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        rows = []
        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is not validated against thead, because of
            # colspan
            rows.append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        # Start of the first token that's not part of the table
        ul_end = self.start_pos

        return {
            'thead': thead,
            'tr': rows,
            'ul_start': ul_start,
            'ul_end': ul_end,
        }
378
379
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with one cell's own attributes.

    Args:
      thead_td_attrs: sequence of (name, raw_value) pairs from the header
                      cell, or None
      row_td_attrs: sequence of (name, raw_value) pairs from the row's cell,
                    or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      order.  When the cell has an attribute with the same name, its value is
      appended to the header's value after a space.  The cell's remaining
      attributes follow, in order.
    """
    if row_td_attrs is None:
        from_row = {}
    else:
        from_row = dict(row_td_attrs)

    result = []
    already_merged = set()  # names that were folded into a header attribute

    for attr_name, value in (thead_td_attrs or ()):
        extra = from_row.get(attr_name)
        if extra is not None:
            value = value + ' %s' % extra
            already_merged.add(attr_name)
        result.append((attr_name, value))

    for attr_name, value in (row_td_attrs or ()):
        if attr_name not in already_merged:
            result.append((attr_name, value))

    return result
405
406
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is parsed and replaced
    with <thead> / <tr> / <th> / <td> markup.  All other text is copied
    through unchanged.

    Args:
      s: the HTML string to transform
      debug_out: optional list; not written to by this function

    Returns:
      The transformed HTML string
    """
    if debug_out is None:
        debug_out = []

    buf = cStringIO.StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)

    parser = UlTableParser(lexer, tag_lexer)

    def PrintAttrs(attrs):
        """Write each (name, raw_value) pair as ' name="raw_value"'."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    # FindUlTable() returns -1 when there are no more tables
    for ul_start in iter(parser.FindUlTable, -1):
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header
        header = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if header:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                PrintAttrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Inherited from header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                PrintAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()