doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

568 lines, 283 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6	try:
7	from cStringIO import StringIO
8	except ImportError:
9	from io import StringIO # type: ignore
10	import re
11	import sys
12
13	from doctools.util import log
14	from data_lang import htm8
15	from typing import List
16	from typing import Optional
17	from typing import Tuple
18	from typing import Any
19	from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Strip <!-- HTML comments --> out of a document.

    ul-table requires this as a preprocessing step.

    Comments whose text contains 'REPLACE' are not skipped, because
    doc/release-index.md has markers like <!-- REPLACE_WITH_DATE -->.

    Raises:
      htm8.LexError: when the lexer reports an Invalid token.
    """
    buf = StringIO()
    out = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # 'start' is where the token being examined begins; Read() gives its end.
    start = 0
    tok_id, end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError('RemoveComments() got invalid token', s, start)

        is_comment = (tok_id == h8_id.Comment)
        if is_comment and 'REPLACE' not in s[start:end]:
            # Emit everything up to the comment, then jump over it.
            out.PrintUntil(start)
            out.SkipTo(end)

        start = end
        tok_id, end = lexer.Read()

    out.PrintTheRest()
    return buf.getvalue()
51
52
# Matches a (possibly empty) run of whitespace.  Because \s* also matches the
# empty string, a successful match() by itself does not prove that any
# whitespace was present -- check the span of the match if that matters.
_WHITESPACE_RE = re.compile(r'\s*')

# Attributes of a table cell or row: a list of (name, raw value) pairs.
TdAttrs = List[Tuple[str, str]]
56
57
class UlTableParser(object):
    """Parser for a table that is written as nested <ul> lists.

    Tokens are pulled from an htm8.Lexer one at a time.  The token we are
    "looking at" is described by self.tok_id, self.start_pos, self.end_pos
    and self.tag_name.

    Usage: call FindUlTable() to position the parser on a <ul> that follows a
    <table> start tag, then ParseTable() to consume that list.
    """

    def __init__(self, lexer):
        # type: (htm8.Lexer) -> None
        self.lexer = lexer
        self.attr_lexer = htm8.AttrLexer(lexer.s)

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0
        # The tag name is only populated when we are "looking at"
        # h8_id.{StartTag,EndTag,StartEndTag}
        self.tag_name = None  # type: Optional[str]

    def _CurrentString(self):
        # type: () -> str
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """Advance one token, setting tok_id, start_pos, end_pos, tag_name.

        Args:
          comment_ok: if False, a Comment token is a parse error.

        Raises:
          htm8.ParseError: on a Comment token when comment_ok is False.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if self.tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
            self.tag_name = self.lexer.CanonicalTagName()
        else:
            self.tag_name = None

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments.
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

        if 0:  # debug tracing, disabled
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        The regex is applied with re.match(), so it is anchored at the start
        of the token only; trailing text after the match is accepted.

        Raises:
          htm8.ParseError: if the current token is not RawData, or its text
            does not match.
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """Assert that we got a start or end tag with the given name; advance.

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the token kind or the tag name differs.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        if expected_tag != self.tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, self.tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        r"""Skip the current token if it is RawData made only of whitespace.

        _WHITESPACE_RE is \s*, which also matches the empty string, so a bare
        match() succeeds on ANY text.  We require the match to span the whole
        token; otherwise arbitrary text would be skipped silently (and, in
        ParseTable, dropped from the output).
        """
        if self.tok_id != h8_id.RawData:
            return
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Scans forward (comments are allowed here) for a 'table' start tag
        whose next non-RawData token is a 'ul' start tag.

        Returns:
          The START position of the <ul>, or -1 if the end of the stream is
          reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        # Find first table
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            if self.tok_id == h8_id.StartTag and self.tag_name == 'table':
                # Skip text between <table> and whatever follows it
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                if self.tok_id == h8_id.StartTag and self.tag_name == 'ul':
                    return self.start_pos
        return -1  # not reached: the loop above only exits via return

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if, after optional
          whitespace, we are not looking at a start tag.

        Raises:
          htm8.ParseError: if the start tag is not <li>, or the input ends
            before the matching </li>.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check an unterminated <li> would never leave
                # the loop (assuming the lexer keeps reporting EndOfStream).
                raise htm8.ParseError('Expected </li>, got EndOfStream')

            if self.tok_id == h8_id.StartEndTag:
                self.attr_lexer.Init(self.tok_id, self.lexer.TagNamePos(),
                                     self.end_pos)
                # TODO: remove td-attrs backward compat
                if self.tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = htm8.AllAttrsRaw(self.attr_lexer)
                    #log('CELL ATTRS %r', self._CurrentString())

            elif self.tok_id == h8_id.StartTag:
                if self.tag_name == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                if self.tag_name == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
            #log('LEFT %r', s[left:td_attrs_span[0]])
            #log('RIGHT %r', s[td_attrs_span[1]:right])
        else:
            inner_html = s[left:right]
        #log('RAW inner html %r', inner_html)

        # We are looking at the closing </li>; step over it.
        #self._Eat(h8_id.EndTag, 'li')
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        The outer [StartTag 'ul'] has already been consumed by ParseTable().

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # zero items is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        #log('*** _ParseTHead')
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.  Returns (None, None) if, after
          optional whitespace, we are not looking at a start tag -- e.g. the
          </ul> that closes the table.

        Raises:
          htm8.ParseError: if the structure doesn't match the grammar, or a
            self-closing tag other than row-attrs follows 'tr'.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above; zero items is not rejected
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        #log('*** _ParseTr')

        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.attr_lexer.Init(self.tok_id, self.lexer.TagNamePos(),
                                 self.end_pos)
            if self.tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' %
                                      self.tag_name)
            tr_attrs = htm8.AllAttrsRaw(self.attr_lexer)
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        #log('_ParseTr %s ', html.TOKEN_NAMES[self.tok_id])
        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """Parse a whole ul-table; we must be looking at its <ul> start tag.

        Returns a structure like this
          { 'thead': [ (td_attrs, 'col1 html'), ... ],  # None if no thead
            'tr': [  # one (tr_attrs, cells) pair per row
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': ...,  # start position of the <ul>
            'ul_end': ...,    # start of the token after </ul> and any
                              # whitespace-only text that follows it
          }
        The cell html is raw HTML that you surround with <td>.

        Raises:
          htm8.ParseError: if the list doesn't match the grammar.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?   # this returns the number of cells, so it's NOT context
                     # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None
        #log('___ THEAD %s', thead)

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise htm8.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            #log('___ TR %s', tr)
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:  # debug dump, disabled
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
441
442
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's header attributes with one cell's own attributes.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header cell, or None
      row_td_attrs: (name, raw value) pairs from the body cell, or None

    Returns:
      A new list of (name, raw value) pairs.  Header attributes come first,
      in header order; when the cell has an attribute of the same name, its
      value is appended after a single space.  Cell attributes that the
      header doesn't have follow, in cell order.
    """
    if row_td_attrs is None:
        cell_values = {}
    else:
        cell_values = dict(row_td_attrs)

    result = []
    from_header = set()  # names already emitted via the header

    for name, header_value in (thead_td_attrs or []):
        cell_value = cell_values.get(name)
        if cell_value is None:
            combined = header_value
        else:
            combined = header_value + ' %s' % cell_value
        from_header.add(name)
        result.append((name, combined))

    result.extend((name, value)
                  for name, value in (row_td_attrs or [])
                  if name not in from_header)

    return result
472
473
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """ul-table: Write tables using bulleted lists.

    Every <ul> that directly follows a <table> start tag is parsed with
    UlTableParser and replaced by <thead> / <tr> / <td> markup.  Text outside
    those lists is passed through.

    Args:
      s: the HTML document
      debug_out: accepted, but not otherwise used by this function

    Returns:
      The rewritten HTML document.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = htm8.Output(s, buf)
    parser = UlTableParser(htm8.Lexer(s))

    def _PrintAttrs(attrs):
        # type: (TdAttrs) -> None
        # No escaping because the values are raw.  They can't contain quotes.
        for name, raw_value in attrs:
            out.Print(' ')
            out.Print(name)
            out.Print('="%s"' % raw_value)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        #log('UL START %d', ul_start)
        out.PrintUntil(ul_start)

        table = parser.ParseTable()
        #log('UL END %d', ul_end)

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Header row.  Attributes on header cells aren't printed on <th>;
        # they are remembered per column and merged into the body cells.
        header_cells = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if header_cells:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header_cells):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Body rows
        for tr_attrs, cells in table['tr']:
            out.Print('<tr')
            if tr_attrs:
                _PrintAttrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Attributes inherited from the header come first
                merged = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                _PrintAttrs(merged)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
561
562
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, strip comments, expand ul-tables,
    # and write the result to stdout.
    html_in = sys.stdin.read()
    html_out = ReplaceTables(RemoveComments(html_in))
    sys.stdout.write(html_out)