#!/usr/bin/env python2
"""ul_table.py: Markdown Tables Without New Syntax."""

from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO  # type: ignore
import re
import sys

from doctools.util import log
from lazylex import html
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
from typing import Any
from typing import Dict
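
# Sketch of the input this module rewrites (illustrative example, based on the
# grammar in the docstrings below).  In Markdown, a ul-table is a bulleted
# list wrapped in literal <table> tags:
#
#     <table>
#
#     - thead
#       - Name
#       - Age
#     - tr
#       - Alice
#       - 42
#
#     </table>
#
# CommonMark turns the list into a <ul> nested inside the <table>, and
# ReplaceTables() below rewrites that <ul> into <thead>, <tr>, and <td> tags.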


def RemoveComments(s):
    # type: (str) -> str
    """Remove <!-- comments -->

    This is a required preprocessing step for ul-table.
    """
    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == h8_id.Comment:
            value = s[pos:end_pos]
            # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
            if 'REPLACE' not in value:
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
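
# Illustrative example: RemoveComments('<p>hi</p> <!-- note -->') returns
# '<p>hi</p> ', while comments containing REPLACE, like
# <!-- REPLACE_WITH_DATE -->, are left in place.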


_WHITESPACE_RE = re.compile(r'\s*')


class UlTableParser(object):

    def __init__(self, lexer, tag_lexer):
        # type: (html.Lexer, html.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        part = self.lexer.s[self.start_pos:self.end_pos]
        return part

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand. That can still leave
        # some REPLACE comments.
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise html.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance
        """
        if self.tok_id != h8_id.RawData:
            raise html.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace
        """
        if (self.tok_id == h8_id.RawData and
                _WHITESPACE_RE.match(self.lexer.s, self.start_pos)):
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Return the START position of the <ul>
        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == h8_id.StartTag and
                    tag_lexer.TagName() == 'table'):
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == h8_id.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos
        return -1

    def _ListItem(self):
        # type: () -> Tuple[Optional[List[Tuple[str, str]]], Optional[str]]
        """Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html)

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*              # NOT context-free:
                            # - we MATCH <li> and </li> with a stack
                            # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        #self._Eat(h8_id.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html
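
    # Illustrative example of _ListItem() above: for the list item
    #   <li><cell-attrs class=foo /> hi there </li>
    # it returns roughly ([('class', 'foo')], ' hi there '), i.e. the raw
    # attribute pairs plus the inner HTML with the <cell-attrs /> span removed.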

    def _ParseTHead(self):
        # type: () -> Union[List[Tuple[List[Tuple[str, str]], str]], List[Tuple[Optional[List[Tuple[str, str]]], str]]]
        """
        Assume we're looking at the first <ul> tag. Now we want to find
        <li>thead and the nested <ul>

        Grammar:

        THEAD =
          [StartTag 'ul']
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s*]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM+
          [RawData \s*]?
          [EndTag 'ul']
          [RawData thead\s+]
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute extensions
        to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace. I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[None, Union[List[Tuple[List[Tuple[str, str]], str]], List[Tuple[None, str]], None]]
        """
        Assume we're looking at the first <ul> tag. Now we want to find
        <li>tr and the nested <ul>

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          [StartTag 'ul']   # Indented bullet that starts -
          ( [StartEndTag row-attrs] [RawData \s*] )?
          LIST_ITEM+        # Defined above
          [RawData \s*]?
          [EndTag 'ul']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Returns a structure like this
        { 'thead': [ 'col1', 'col2' ],  # TODO: columns can have CSS attributes
          'tr': [                       # raw HTML that you surround with <td>
             [ 'cell1 html', 'cell2 html' ],
             [ 'cell1 html', 'cell2 html' ],
          ]
        }

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD   # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
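
    # For a hypothetical 2-column table, ParseTable() returns roughly:
    #
    #   {'thead': [(None, 'Name'), (None, 'Age')],
    #    'tr': [(None, [(None, 'Alice'), (None, '42')])],
    #    'ul_start': 123, 'ul_end': 456}
    #
    # where each cell is a (cell_attrs, inner_html) pair, each row is a
    # (tr_attrs, cells) pair, and 123 / 456 stand in for byte offsets.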


def MergeAttrs(
        thead_td_attrs,  # type: Optional[List[Tuple[str, str]]]
        row_td_attrs,  # type: Optional[List[Tuple[str, str]]]
):
    # type: (...) -> List[Tuple[str, str]]
    merged_attrs = []

    if row_td_attrs is None:
        row_lookup = {}
    else:
        row_lookup = {n: v for n, v in row_td_attrs}

    done_for_row = set()

    if thead_td_attrs:
        for name, raw_value in thead_td_attrs:
            more_values = row_lookup.get(name)
            if more_values is not None:
                raw_value += ' %s' % more_values
                done_for_row.add(name)
            merged_attrs.append((name, raw_value))

    if row_td_attrs:
        for name, raw_value in row_td_attrs:
            if name in done_for_row:
                continue
            merged_attrs.append((name, raw_value))

    return merged_attrs
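
# Illustrative example of MergeAttrs(): if a thead cell declared
# [('class', 'num')] and the row cell declares [('class', 'hi'), ('align', 'left')],
# the merged result is [('class', 'num hi'), ('align', 'left')]: shared names
# have their values joined with a space, and row-only attributes are appended.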


def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted lists
    """
    if debug_out is None:
        debug_out = []

    f = StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)

    p = UlTableParser(lexer, tag_lexer)

    while True:
        ul_start = p.FindUlTable()
        if ul_start == -1:
            break

        #log('UL START %d', ul_start)
        out.PrintUntil(ul_start)

        table = p.ParseTable()
        #log('UL END %d', ul_end)

        # Don't write the matching </ul> of the LAST row, but write everything
        # after that
        out.SkipTo(table['ul_end'])

        # Write the header
        thead = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if thead:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            i = 0
            for td_attrs, raw_html in thead:
                if td_attrs:
                    col_attrs[i] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')
                i += 1

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                for name, raw_value in tr_attrs:
                    out.Print(' ')
                    out.Print(name)
                    # No escaping because it's raw. It can't contain quotes.
                    out.Print('="%s"' % raw_value)
            out.Print('>\n')

            # Print cells
            i = 0
            for row_td_attrs, raw_html in row:
                # Inherited from header
                thead_td_attrs = col_attrs.get(i)
                merged_attrs = MergeAttrs(thead_td_attrs, row_td_attrs)

                out.Print(' <td')
                for name, raw_value in merged_attrs:
                    out.Print(' ')
                    out.Print(name)
                    # No escaping because it's raw. It can't contain quotes.
                    out.Print('="%s"' % raw_value)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
                i += 1
            out.Print('</tr>\n')

    out.PrintTheRest()

    return f.getvalue()
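
# Roughly, for the example table sketched near the top of this file, the inner
# <ul> is replaced with output like:
#
#   <thead>
#   <tr>
#    <th>Name</th>
#    <th>Age</th>
#   </tr>
#   </thead>
#   <tr>
#    <td>Alice</td>
#    <td>42</td>
#   </tr>
#
# while the surrounding <table> and </table> tags from the source document are
# left untouched.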


if __name__ == '__main__':
    # Simple CLI filter
    h = sys.stdin.read()
    h = RemoveComments(h)
    h = ReplaceTables(h)
    sys.stdout.write(h)