doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

563 lines, 281 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	try:
5	from cStringIO import StringIO
6	except ImportError:
7	from io import StringIO # type: ignore
8	import re
9	import sys
10
11	from doctools.util import log
12	from lazylex import html
13	from typing import List
14	from typing import Optional
15	from typing import Tuple
16	from typing import Union
17	from typing import Any
18	from typing import Dict
19
20
def RemoveComments(s):
    # type: (str) -> str
    """Remove <!-- comments -->

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, e.g. the
    <!-- REPLACE_WITH_DATE --> markers in doc/release-index.md.

    Args:
      s: HTML text

    Returns:
      The HTML text with all other comments removed.
    """
    f = StringIO()
    out = html.Output(s, f)

    # Start of the current token, which is the end of the previous one
    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == html.Comment:
            value = s[pos:end_pos]
            # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
            if 'REPLACE' not in value:
                # Copy everything before the comment, then jump past it
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
45
46
# Matches a (possibly empty) run of whitespace.  Because \s* can match the
# empty string, match() never returns None: callers must check how far the
# match extends rather than testing the match object for truthiness.
_WHITESPACE_RE = re.compile(r'\s*')
48
49
class UlTableParser(object):
    """Parser for a table written as nested <ul> lists.

    It pulls tokens one at a time from an html.Lexer, and uses an
    html.TagLexer to read the tag name and attributes of the current token.

    State, updated by _Next():
      tok_id:    ID of the current token; html.Invalid until the first read
      start_pos: offset in lexer.s where the current token starts
      end_pos:   offset in lexer.s where the current token ends
    """

    def __init__(self, lexer, tag_lexer):
        # type: (html.Lexer, html.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = html.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        """Return the source text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """Advance and set self.tok_id, self.start_pos, self.end_pos

        Args:
          comment_ok: if False, landing on an html.Comment token is an error

        Raises:
          html.ParseError: on a comment token when comment_ok is False
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == html.Comment:
            raise html.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance

        Args:
          regex: pattern matched against the START of the current token's text

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
          doesn't match
        """
        if self.tok_id != html.RawData:
            raise html.ParseError('Expected RawData, got %s' %
                                  html.TokenName(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (int, str) -> None
        """Assert that we got a start or end tag, with the given name, and
        advance

        Args:
          expected_id: html.StartTag or html.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different ID or a
          different tag name
        """
        assert expected_id in (html.StartTag,
                               html.EndTag), html.TokenName(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError(
                'Expected token %s, got %s' %
                (html.TokenName(expected_id), html.TokenName(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """Optional whitespace

        Skip the current token only if it is RawData consisting entirely of
        whitespace.

        _WHITESPACE_RE can match the empty string, so merely testing the match
        object would accept ANY RawData token, and real text would be
        skipped.  Require the match to cover the whole token instead.
        """
        if self.tok_id != html.RawData:
            return
        # endpos confines the match to the current token
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.
        Comments are allowed while searching.

        Returns:
          The START position of the <ul>, or -1 if the end of the stream is
          reached first.  On success, the <ul> is the current token.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == html.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == html.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip the text between <table> and the next non-text token
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != html.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == html.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos
        return -1

    def _ListItem(self):
        # type: () -> Tuple[Optional[List[Tuple[str, str]]], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html).  It is (None, None) when the current
          token, after optional whitespace, is not a start tag -- i.e. there
          are no more list items.  td_attrs is None when the item has no
          <cell-attrs />.

        Raises:
          html.ParseError: if the start tag is not <li>, or the input ends
          before the matching </li>

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*               # NOT context-free:
                             # - we MATCH <li> and </li> with a counter
                             # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != html.StartTag:
            return None, None

        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(html.StartTag, 'li')

        left = self.start_pos  # start of the first token after <li>

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == html.EndOfStream:
                # Unterminated <li>.  Fail here rather than reading past the
                # end; presumably Read() would keep returning EndOfStream, so
                # this loop would otherwise never exit.
                raise html.ParseError('Expected </li>, got %s' %
                                      html.TokenName(self.tok_id))

            if self.tok_id == html.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    # If there are several, the last one wins
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == html.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == html.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        # This </li> closes the <li> we ate above
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        # The loop above already checked that this is </li>, so just skip it
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[List[Tuple[str, str]]], str]]
        r"""
        Assume the first <ul> tag was already consumed (ParseTable() does
        that).  Now we want to find <li>thead and the nested <ul>

        Returns:
          A list of cells, each a pair (td_attrs, inner_html)

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
              LIST_ITEM+      # Defined in _ListItem(); '+' is not enforced
              [RawData \s*]?
            [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(html.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[List[Tuple[str, str]]], Optional[List[Tuple[Optional[List[Tuple[str, str]]], str]]]]
        r"""
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells).  It is (None, None) when the current
          token, after optional whitespace, is not a start tag -- i.e. there
          are no more rows.  tr_attrs is None when the row has no
          <row-attrs />.  Each cell is a pair (td_attrs, inner_html).

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
              LIST_ITEM+      # Defined in _ListItem(); '+' is not enforced
              [RawData \s*]?
            [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != html.StartTag:
            return None, None

        self._Eat(html.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == html.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(html.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        # TODO: assert

        self._WhitespaceOk()

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(html.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse a whole ul-table.  The current token must be the outer <ul>,
        which is where FindUlTable() leaves the parser.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': 10,  # position where the outer <ul> starts
            'ul_end': 99,    # position after </ul> and optional whitespace
          }

        'thead' is None when the list doesn't start with a thead item.  Any
        td_attrs / tr_attrs may be None.

        Raises:
          html.ParseError: if the tokens don't follow the grammar

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
          [EndTag 'ul']
          [RawData \\s*]?
        """
        table = {'tr': []}

        ul_start = self.start_pos
        self._Eat(html.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(html.EndTag, 'ul')

        self._WhitespaceOk()

        # Start of the first token that is not part of the table
        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
435
436
def MergeAttrs(
        thead_td_attrs,  # type: Optional[List[Tuple[str, str]]]
        row_td_attrs,  # type: Optional[List[Tuple[str, str]]]
):
    # type: (...) -> List[Tuple[str, str]]
    """Combine a column's attributes from thead with a cell's own attributes.

    Header attributes come first, in header order.  When the cell has an
    attribute of the same name, its value is appended to the header's value
    after a space.  The cell's remaining attributes follow, in cell order.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header cell, or None
      row_td_attrs: (name, raw value) pairs from the row cell, or None

    Returns:
      A new list of (name, raw value) pairs
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # For duplicate names within the cell, the last value wins
    cell_values = dict(cell_pairs)

    combined = []
    consumed = set()  # cell attribute names already folded into the header's
    for attr_name, header_value in header_pairs:
        extra = cell_values.get(attr_name)
        if extra is None:
            combined.append((attr_name, header_value))
        else:
            consumed.add(attr_name)
            combined.append((attr_name, header_value + ' %s' % extra))

    combined.extend((attr_name, cell_value)
                    for attr_name, cell_value in cell_pairs
                    if attr_name not in consumed)
    return combined
466
467
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is parsed and replaced
    with <thead> / <tr> / <th> / <td> markup.  All other text is copied
    through unchanged.

    Args:
      s: HTML text; comments should already be removed by RemoveComments()
      debug_out: not used by the code below

    Returns:
      The rewritten HTML text
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    parser = UlTableParser(html.Lexer(s), tag_lexer)

    def _PrintAttrs(attrs):
        # type: (List[Tuple[str, str]]) -> None
        """Print each pair as  name="value"  with a leading space."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    ul_start = parser.FindUlTable()
    while ul_start != -1:
        # Copy everything before the <ul>
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Drop the original <ul> ... </ul>; what follows it is still written
        out.SkipTo(table['ul_end'])

        # Header row.  Attributes on header cells are not printed on <th>;
        # they are remembered per column and merged into each <td> below.
        header_cells = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, row in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Inherited from header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

        ul_start = parser.FindUlTable()

    out.PrintTheRest()

    return buf.getvalue()
556
557
if __name__ == '__main__':
    # Simple CLI filter: HTML on stdin, HTML with ul-tables replaced on
    # stdout.  Comments are removed first because that is a required
    # preprocessing step for ul-table.
    without_comments = RemoveComments(sys.stdin.read())
    sys.stdout.write(ReplaceTables(without_comments))