doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oilshell.org

491 lines, 241 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	import cStringIO
5	import re
6
7	from doctools.util import log
8	from lazylex import html
9
# Matches a (possibly empty) run of whitespace.  Because r'\s*' also matches
# the empty string, a successful match() on its own does not show that the
# text is whitespace; compare the match's end() with the expected end
# position when that matters.
_WHITESPACE_RE = re.compile(r'\s*')
11
12
class UlTableParser(object):
    r"""Parser for a <ul> that encodes table rows (a "ul-table").

    Tokens are pulled one at a time from `lexer`; `tag_lexer` is pointed at the
    current token whenever its tag name or attributes are needed.  The usual
    call sequence is FindUlTable() followed by ParseTable().

    State:
      tok_id:    ID of the current token (html.Invalid before the first read)
      start_pos: offset in lexer.s where the current token starts
      end_pos:   offset in lexer.s just past the current token
    """

    def __init__(self, lexer, tag_lexer):
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _CurrentTagName(self):
        """Point the tag lexer at the current token and return its tag name.

        The tag lexer stays positioned on the current token afterwards, so the
        caller may go on to read attributes from it.
        """
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        return self.tag_lexer.TagName()

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        # The new token starts where the previous one ended.
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if 0:  # flip to 1 to trace tokens while debugging
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it only has to match at the
        START of the text; trailing text is not checked.

        Raises:
          html.ParseError: the current token is not RawData, or its text does
            not match.
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: the current token has a different ID or tag name.
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(expected_id),
                                  html.TokenName(self.tok_id))
        tag_name = self._CurrentTagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r', expected_tag,
                                  tag_name)

        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace

        Advance past the current token only if it is RawData made up ENTIRELY
        of whitespace.  Text that contains anything else is left in place, so
        that the caller either reports it or preserves it.
        """
        if self.tok_id != html.RawData:
            return

        # r'\s*' also matches the empty string, so the match object alone
        # proves nothing.  Bound the match to the token and require it to reach
        # the token's end.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only text (RawData) may appear between the <table> start tag and the
        <ul> start tag.

        Returns:
          The START position of the <ul>, with that tag as the current token,
          or -1 if the end of the stream is reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            if (self.tok_id == html.StartTag and
                    self._CurrentTagName() == 'table'):
                # Skip text between <table> and whatever tag comes next
                while True:
                    self._Next()
                    if self.tok_id != html.RawData:
                        break

                # Don't ask the lexer for more tokens once it is exhausted
                if self.tok_id == html.EndOfStream:
                    return -1

                if (self.tok_id == html.StartTag and
                        self._CurrentTagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) when the current
          token (after optional whitespace) is not a start tag.  td_attrs is
          None when the item has no <td-attrs />.

        Raises:
          html.ParseError: the item starts with a self-closing tag other than
            <td-attrs />, or the stream ends before the closing </li>.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'td-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <td-attrs class=foo /> hi there ==>
          <li><td-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            tag_name = self._CurrentTagName()
            if tag_name != 'td-attrs':
                raise html.ParseError('Expected <td-attrs />, got %r',
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>.  Cells can contain bulleted lists, e.g.
        #   <li> <li>foo</li> </li>
        # so count nested <li> ... </li> pairs and stop at the first </li> that
        # has no matching <li> inside this item.
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, an unterminated <li> would never exit
                # the loop.
                raise html.ParseError('Expected </li>, got %s',
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                if self._CurrentTagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                if self._CurrentTagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break

            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]
        #log('RAW inner html %r', inner_html)

        # The loop above already verified that this token is </li>
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Parse the <li>thead item and its nested <ul>.  The enclosing <ul> start
        tag has already been consumed by the caller.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # zero items is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul>.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs and tr_attrs is None when the row has no
          <tr-attrs />.  Returns (None, None) when the current token (after
          optional whitespace) is not a start tag, e.g. at the closing </ul>.

        Raises:
          html.ParseError: the row contains a self-closing tag other than
            <tr-attrs />, or otherwise doesn't follow the grammar.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'tr-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above; zero items is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            # Same validation that _ListItem applies to <td-attrs />
            tag_name = self._CurrentTagName()
            if tag_name != 'tr-attrs':
                raise html.ParseError('Expected <tr-attrs />, got %r',
                                      tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse a whole ul-table.  The current token must be the <ul> start tag,
        which is where FindUlTable() leaves the parser.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
                     # or None when the list has no thead item
            'tr': [  # one (tr_attrs, cells) pair per row
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': position of the opening <ul>,
            'ul_end': position just after the closing </ul> and any
                      whitespace-only text that follows it,
          }

        The cell HTML is raw; the caller surrounds it with <td> or <th>.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?   # optional: detected by looking ahead for <li>thead
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # The number of cells is NOT validated against thead, because a
            # row may use colspan.

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
376
377
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with a cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs inherited from the
        header cell of this column, or None
      row_td_attrs: list of (name, raw_value) pairs given on the cell itself,
        or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      their original order; when the cell has an attribute of the same name,
      its value is appended after a single space.  The cell's remaining
      attributes follow, in their original order.
    """
    # Index the cell's attributes by name
    cell_value_by_name = {}
    if row_td_attrs is not None:
        for attr_name, attr_value in row_td_attrs:
            cell_value_by_name[attr_name] = attr_value

    combined = []
    already_merged = set()  # cell attributes folded into a header attribute

    for attr_name, attr_value in thead_td_attrs or ():
        extra = cell_value_by_name.get(attr_name)
        if extra is not None:
            attr_value = attr_value + ' %s' % extra
            already_merged.add(attr_name)
        combined.append((attr_name, attr_value))

    # Whatever the cell has that the header did not mention
    if row_td_attrs:
        combined.extend([(attr_name, attr_value)
                         for attr_name, attr_value in row_td_attrs
                         if attr_name not in already_merged])

    return combined
403
404
def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is parsed and replaced, in
    the output, by <thead> / <tr> / <th> / <td> markup.  Everything else in `s`
    is copied through.

    Args:
      s: HTML text
      debug_out: optional list; defaults to a new empty list and is not
        otherwise used here

    Returns:
      The rewritten HTML, as a string.
    """
    if debug_out is None:
        debug_out = []

    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    emit = out.Print

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    # FindUlTable() returns -1 once there are no more tables
    for ul_start in iter(parser.FindUlTable, -1):
        # Copy everything before the <ul> unchanged
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Header row.  While printing it, remember which columns carry
        # attributes, so that body cells can inherit them.
        header_cells = table['thead']
        inherited = {}  # column index -> td_attrs
        if header_cells:
            emit('<thead>\n')
            emit('<tr>\n')

            for col, (header_attrs, header_html) in enumerate(header_cells):
                if header_attrs:
                    inherited[col] = header_attrs
                # <th> tag is more semantic, and styled bold by default
                emit(' <th>')
                emit(header_html)
                emit('</th>\n')

            emit('</tr>\n')
            emit('</thead>\n')

        # Body rows
        for row_attrs, cells in table['tr']:
            emit('<tr')
            if row_attrs:
                for attr_name, attr_value in row_attrs:
                    emit(' ')
                    emit(attr_name)
                    # The value is raw, so it is printed without escaping.  It
                    # can't contain quotes.
                    emit('="%s"' % attr_value)
            emit('>\n')

            for col, (cell_attrs, cell_html) in enumerate(cells):
                # Attributes from the header of this column, plus the cell's
                merged = MergeAttrs(inherited.get(col), cell_attrs)

                emit(' <td')
                for attr_name, attr_value in merged:
                    emit(' ')
                    emit(attr_name)
                    # Raw value again: no escaping, and no quotes inside.
                    emit('="%s"' % attr_value)
                emit('>')

                emit(cell_html)
                emit('</td>\n')
            emit('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()