doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

566 lines, 282 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6	try:
7	from cStringIO import StringIO
8	except ImportError:
9	from io import StringIO # type: ignore
10	import re
11	import sys
12
13	from doctools.util import log
14	from lazylex import html
15	from typing import List
16	from typing import Optional
17	from typing import Tuple
18	from typing import Any
19	from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Remove <!-- comments --> from an HTML string.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Args:
      s: The HTML document.

    Returns:
      A copy of s with all other comments removed.
    """
    f = StringIO()
    out = html.Output(s, f)

    # html.ValidTokens() yields (token id, end position) pairs, so the start
    # of each token is the end position of the previous one.
    pos = 0

    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == h8_id.Comment:
            value = s[pos:end_pos]
            if 'REPLACE' not in value:
                # Copy everything up to the comment, then jump over it
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
46
47
# Matches a run of whitespace.  Note that \s* also matches the EMPTY string,
# so a successful match() by itself does not prove that any text is
# whitespace; callers must look at where the match ends.
_WHITESPACE_RE = re.compile(r'\s*')

# Raw (name, value) attribute pairs for a table cell or row, in the form
# returned by TagLexer.AllAttrsRaw()
TdAttrs = List[Tuple[str, str]]
51
52
class UlTableParser(object):
    """Recursive descent parser for a table written as nested <ul> lists.

    The parser pulls tokens from `lexer` one at a time and keeps the current
    token in three fields:

      tok_id:    the h8_id of the current token
      start_pos: where the current token starts in the input string
      end_pos:   where the current token ends in the input string

    `tag_lexer` is a reusable object that is Reset() to the span of the
    current token whenever we need a tag name or its attributes.
    """

    def __init__(self, lexer, tag_lexer):
        # type: (html.Lexer, html.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        """Return the text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """
        Advance and set self.tok_id, self.start_pos, self.end_pos

        Args:
          comment_ok: If False, an HTML comment token is a parse error.

        Raises:
          html.ParseError: on a comment token, unless comment_ok is set.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise html.ParseError('Unexpected HTML comment')

        if 0:  # flip to trace every token
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """
        Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it is anchored at the start
        of the token only; trailing text is not checked.

        Raises:
          html.ParseError: if the current token isn't RawData, or its text
            doesn't match.
        """
        if self.tok_id != h8_id.RawData:
            raise html.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise html.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """
        Assert that we got a start or end tag, with the given name, and advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          html.ParseError: if the current token has a different id or a
            different tag name.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise html.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.TagName()
        if expected_tag != tag_name:
            raise html.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """
        Optional whitespace

        Skip the current token only if it is RawData made up ENTIRELY of
        whitespace.  Any other token is left in place, so the caller's next
        _Eat() reports it.
        """
        if self.tok_id != h8_id.RawData:
            return

        # _WHITESPACE_RE is \s*, which matches the empty string at any
        # position, so testing the match object for truth would accept every
        # RawData token.  Require the whitespace to span the whole token.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only RawData tokens may appear between the <table> and the <ul>.
        Comments are allowed while searching.

        Returns:
          The START position of the <ul>, or -1 if the stream ends first.
          On success, the <ul> start tag is the current token.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Every exit from this loop is a return
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == h8_id.StartTag and
                    tag_lexer.TagName() == 'table'):
                # Skip the text between <table> and whatever follows it
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == h8_id.StartTag and
                        tag_lexer.TagName() == 'ul'):
                    return self.start_pos
                # Not a ul-table: keep looking for the next <table>

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if, after optional
          whitespace, the current token is not a start tag.

        Raises:
          html.ParseError: if the start tag isn't <li>, or the stream ends
            before the matching </li>.

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*               # NOT context-free:
                             # - we MATCH <li> and </li> with a counter
                             # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        If more than one <cell-attrs /> appears, the last one is used.

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check, an unterminated <li> would loop
                # forever if the lexer keeps returning EndOfStream.
                raise html.ParseError(
                    'Expected closing </li>, got end of stream')

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.TagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.TagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        # This </li> closes the <li> we ate above
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span is not None:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        # The loop above already checked that this token is </li>
        #self._Eat(h8_id.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""
        Parse the <li>thead item and its nested <ul> of header cells.

        Assumes the outer <ul> start tag was already consumed.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Raises:
          html.ParseError: if the input doesn't match the grammar.

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s+]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""
        Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.  Returns (None, None) if, after
          optional whitespace, the current token is not a start tag, e.g.
          the </ul> that ends the table.

        Raises:
          html.ParseError: if the input doesn't match the grammar.

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # Defined above
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.TagName()
            if tag_name != 'row-attrs':
                raise html.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """
        Parse a whole ul-table.  The current token must be its <ul>.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), (td_attrs, 'col2 html') ],
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': 42,   # start position of the <ul>
            'ul_end': 99,     # start of the first token after </ul> and
                              # optional whitespace
          }

        'thead' is None when the list has no thead item.  Any td_attrs or
        tr_attrs may be None.

        Raises:
          html.ParseError: if the input doesn't match the grammar.

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD?  # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise html.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:  # flip to dump the parsed table
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
438
439
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's header attributes with one cell's own attributes.

    Header attributes come first, in their original order.  When the cell
    has an attribute with the same name, its value is appended to the header
    value after a single space.  The remaining cell attributes follow, in
    their original order.

    Args:
      thead_td_attrs: (name, value) pairs inherited from the header, or None
      row_td_attrs: (name, value) pairs on the cell itself, or None

    Returns:
      A new list of (name, value) pairs.
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # For a repeated name, the last value wins in this lookup
    cell_values = dict(cell_pairs)

    combined_names = set()
    result = []

    for attr_name, header_value in header_pairs:
        extra = cell_values.get(attr_name)
        if extra is None:
            result.append((attr_name, header_value))
        else:
            combined_names.add(attr_name)
            result.append((attr_name, header_value + ' %s' % extra))

    # Cell attributes that were not folded into a header attribute
    result.extend((attr_name, cell_value)
                  for attr_name, cell_value in cell_pairs
                  if attr_name not in combined_names)

    return result
469
470
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """
    ul-table: Write tables using bulleted list

    Each <table> that is directly followed by a <ul> has that list replaced
    by <thead> / <tr> / <td> markup.  Everything else is copied through.

    Args:
      s: The HTML document.
      debug_out: Accepted for callers that pass it; this function does not
        write to it.

    Returns:
      The rewritten HTML.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = html.Output(s, buf)

    tag_lexer = html.TagLexer(s)
    lexer = html.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    def _PrintAttrs(attrs):
        # type: (TdAttrs) -> None
        """Write each pair as ' name="value"'."""
        for attr_name, attr_value in attrs:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        #log('UL START %d', ul_start)
        out.PrintUntil(ul_start)

        table = parser.ParseTable()
        #log('UL END %d', ul_end)

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Write the header
        header_cells = table['thead']

        col_attrs = {}  # integer -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, row in table['tr']:

            # Print tr tag and attrs
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            # Print cells
            for col, (row_td_attrs, raw_html) in enumerate(row):
                # Inherited from header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
559
560
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, remove comments, replace the
    # ul-tables, and write the result to stdout
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))