doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oilshell.org

481 lines, 239 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	import cStringIO
5	import re
6
7	from doctools.util import log
8	from lazylex import html
9
10
class UlTableParser(object):
    """Recursive-descent parser for tables written as nested <ul> lists.

    The parser pulls (tok_id, end_pos) pairs from `lexer` and uses `tag_lexer`
    to look at the tag name and attributes of the current token.  The current
    token always spans [self.start_pos, self.end_pos) of `tag_lexer.s`.
    """

    def __init__(self, lexer, tag_lexer):
        """
        Args:
          lexer: iterator that yields (tok_id, end_pos) pairs
          tag_lexer: object providing Reset(start, end), TagName(),
            AllAttrsRaw(), and the source string as attribute `s`
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        """Return the source text covered by the current token."""
        return self.tag_lexer.s[self.start_pos:self.end_pos]

    def _Next(self):
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        Raises:
          StopIteration: if the token iterator is already exhausted.  Loops
            that scan ahead must therefore stop when they see
            html.EndOfStream instead of calling this again.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = next(self.lexer)
        if 0:  # flip to 1 to trace every token
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        NOTE: re.match() only anchors at the start of the text, so any
        trailing characters after the matched prefix are accepted.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
            doesn't match `regex`
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s',
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r', regex,
                                  actual)
        self._Next()

    def _Eat(self, tok_id, s):
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          tok_id: the token type we expect to be looking at
          s: expected tag name when tok_id is StartTag or EndTag; must be None
            for any other token type

        Raises:
          html.ParseError: on a token type or tag name mismatch
          AssertionError: if `s` is given for a token that has no tag name
        """
        if self.tok_id != tok_id:
            raise html.ParseError('Expected token %s, got %s',
                                  html.TokenName(tok_id),
                                  html.TokenName(self.tok_id))
        if tok_id in (html.StartTag, html.EndTag):
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if s != tag_name:
                raise html.ParseError('Expected tag %r, got %r', s, tag_name)
        else:
            if s is not None:
                raise AssertionError("Don't know what to do with %r" % s)
        self._Next()

    def _WhitespaceOk(self):
        """
        Optional whitespace: skip the current token if it is all-whitespace
        RawData, otherwise do nothing.
        """
        if self.tok_id == html.RawData and self._CurrentString().isspace():
            self._Next()

    def FindUlTable(self):
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.

        Returns:
          The START position of the <ul>, or -1 if the end of the stream is
          reached first.  Once the end has been seen, further calls keep
          returning -1.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # ParseTable() or an earlier call may have left us at the end of the
        # stream.  Advancing again would exhaust the iterator and leak
        # StopIteration to the caller.
        if self.tok_id == html.EndOfStream:
            return -1

        # Find first table
        while True:
            self._Next()
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip any text between <table> and the next tag
                while True:
                    self._Next()
                    if self.tok_id != html.RawData:
                        break

                # The input may end right after <table>; don't let the outer
                # loop read past EndOfStream.
                if self.tok_id == html.EndOfStream:
                    return -1

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if, after optional
          whitespace, the current token is not a start tag.  inner_html is
          the raw source between the opening <li> (and optional <td-attrs />)
          and the matching </li>.

        Raises:
          html.ParseError: on an unexpected tag, or if the stream ends before
            the matching </li>

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            [StartEndTag 'td-attrs']?
            ANY*               # NOT context-free - anything that's not the end
                               # This is what we should capture in CELLS
            [EndTag 'li']

        Example of attribute borrowing:

          - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

          - <td-attrs class=foo /> hi there ==>
          <li><td-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?

        self._Eat(html.StartTag, 'li')

        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'td-attrs':
                raise html.ParseError('Expected <td-attrs />, got %r' %
                                      tag_name)
            td_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()

        left = self.start_pos

        # Find the closing </li>.  Cells can contain bulleted lists, e.g.
        #   <li> <li>foo</li> </li>
        # so count nested <li> ... </li> pairs and stop at the first </li>
        # that has no matching <li> inside the cell.
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Without this check, _Next() would run off the end of the
                # token stream and leak StopIteration.
                raise html.ParseError('Expected closing </li>, got %s',
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            if self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        inner_html = self.tag_lexer.s[left:right]

        # Skip the closing </li>, which the loop above already identified
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        r"""
        Parse the <li>thead item and its nested <ul> of header cells.  The
        enclosing <ul> has already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s*]
            [StartTag 'ul']    # Indented bullet that starts -
            LIST_ITEM+         # intended; an empty list is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

          - thead
            - name [link][]
              - colgroup=foo align=left
            - age
              - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s*'.
        self._EatRawData(r'thead\s*')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return cells

    def _ParseTr(self):
        r"""
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.  Returns (None, None) if, after
          optional whitespace, the current token is not a start tag, e.g. the
          </ul> that ends the table.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'tr-attrs'] [RawData \s*]? )?
            [StartTag 'ul']    # Indented bullet that starts -
            LIST_ITEM+         # Defined above; an empty list is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            # NOTE(review): unlike <td-attrs /> in _ListItem(), the tag name
            # is not checked here, so any self-closing tag is accepted as the
            # row attributes.  Confirm against existing docs before
            # tightening this.
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        """
        Parse a whole ul-table.  The current token must be the <ul> start tag
        whose position FindUlTable() returned.

        Returns a structure like this

          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), (td_attrs, 'cell2 html') ]),
            ],
            'ul_start': position where the outer <ul> starts,
            'ul_end': start of the first token after the outer </ul> and any
                      whitespace that follows it,
          }

        Any td_attrs / tr_attrs may be None.

        Raises:
          html.ParseError: if the list doesn't have the shape of a ul-table

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD   # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        rows = []

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        thead = self._ParseTHead()

        # The number of cells per row is deliberately NOT validated against
        # the header, because cells can use colspan.
        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            rows.append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        return {
            'thead': thead,
            'tr': rows,
            'ul_start': ul_start,
            'ul_end': ul_end,
        }
368
369
def MergeAttrs(thead_td_attrs, row_td_attrs):
    """Combine a column's header attributes with a cell's own attributes.

    Args:
      thead_td_attrs: list of (name, raw_value) pairs from the header cell,
        or None
      row_td_attrs: list of (name, raw_value) pairs from the row cell, or None

    Returns:
      A new list of (name, raw_value) pairs.  Header attributes come first, in
      header order; when the cell has an attribute with the same name, its
      value is appended to the header value after a single space.  Attributes
      that only the cell has follow, in cell order.
    """
    cell_values = {}
    if row_td_attrs is not None:
        cell_values = dict((key, val) for key, val in row_td_attrs)

    result = []
    inherited = set()

    for name, raw_value in (thead_td_attrs or ()):
        extra = cell_values.get(name)
        combined = raw_value if extra is None else raw_value + ' %s' % extra
        inherited.add(name)
        result.append((name, combined))

    # Whatever the header didn't mention is passed through unchanged
    result.extend((name, raw_value)
                  for name, raw_value in (row_td_attrs or ())
                  if name not in inherited)

    return result
395
396
def _PrintAttrs(out, attrs):
    """Print each (name, raw_value) pair in `attrs` as ' name="raw_value"'."""
    for name, raw_value in attrs:
        out.Print(' ')
        out.Print(name)
        # No escaping because it's raw.  It can't contain quotes.
        out.Print('="%s"' % raw_value)


def ReplaceTables(s, debug_out=None):
    """
    ul-table: Write tables using bulleted list

    Every <ul> that FindUlTable() locates after a <table> start tag is parsed
    with ParseTable() and replaced by <thead> / <tr> / <th> / <td> markup.
    Attributes given on a header cell are merged into every cell of the same
    column (see MergeAttrs); the <th> itself is printed without attributes.

    Args:
      s: the HTML document, as a string
      debug_out: currently unused; kept so existing callers keep working

    Returns:
      The rewritten document, as a string

    Raises:
      html.ParseError: if a <ul> that follows <table> doesn't have the shape
        of a ul-table
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    it = html.ValidTokens(s)

    p = UlTableParser(it, tag_lexer)

    while True:
        ul_start = p.FindUlTable()
        if ul_start == -1:
            break

        out.PrintUntil(ul_start)

        table = p.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header
        out.Print('<thead>\n')
        out.Print('<tr>\n')

        col_attrs = {}  # integer -> td_attrs

        for i, (td_attrs, raw_html) in enumerate(table['thead']):
            if td_attrs:
                col_attrs[i] = td_attrs
            # <th> tag is more semantic, and styled bold by default
            out.Print(' <th>')
            out.Print(raw_html)
            out.Print('</th>\n')

        out.Print('</tr>\n')
        out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(out, tr_attrs)
            out.Print('>\n')

            # Print cells
            for i, (row_td_attrs, raw_html) in enumerate(row):
                # Inherited from header
                thead_td_attrs = col_attrs.get(i)
                merged_attrs = MergeAttrs(thead_td_attrs, row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(out, merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()