doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

567 lines, 282 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6	try:
7	from cStringIO import StringIO
8	except ImportError:
9	from io import StringIO # type: ignore
10	import re
11	import sys
12
13	from doctools.util import log
14	from data_lang import htm8
15	from typing import List
16	from typing import Optional
17	from typing import Tuple
18	from typing import Any
19	from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Return `s` with HTML <!-- comments --> stripped out.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Raises:
      htm8.LexError: if the lexer reports an invalid token.
    """
    buf = StringIO()
    writer = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # `start` is where the token being examined begins; each Read() returns
    # the position where it ends.
    start = 0
    tok_id, end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError('RemoveComments() got invalid token', s, start)

        if tok_id == h8_id.Comment and 'REPLACE' not in s[start:end]:
            # Copy everything up to the comment, then jump past it.
            writer.PrintUntil(start)
            writer.SkipTo(end)

        start = end
        tok_id, end = lexer.Read()

    writer.PrintTheRest()
    return buf.getvalue()
51
52
# Matches a (possibly empty) run of whitespace.  Because r'\s*' can match the
# empty string, a successful match by itself does not prove that any
# whitespace is present; inspect the match's end position if that matters.
_WHITESPACE_RE = re.compile(r'\s*')

# Attributes of one table cell or row: a list of (name, raw_value) pairs.
TdAttrs = List[Tuple[str, str]]
56
57
class UlTableParser(object):
    r"""Parser for a "ul-table": a table written as nested <ul> lists.

    The parser pulls tokens from an htm8.Lexer one at a time.  After each
    _Next() call, these fields describe the token we are "looking at":

      tok_id     token kind (an h8_id value)
      start_pos  offset in lexer.s where the token starts
      end_pos    offset in lexer.s where the token ends
      tag_name   tag name for StartTag / EndTag / StartEndTag, else None

    Public entry points are FindUlTable() and ParseTable().
    """

    def __init__(self, lexer, tag_lexer):
        # type: (htm8.Lexer, htm8.TagLexer) -> None
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0
        # The tag name is only populated when we are "looking at"
        # h8_id.{StartTag,EndTag,StartEndTag}
        self.tag_name = None  # type: Optional[str]

    def _CurrentString(self):
        # type: () -> str
        """Return the source text of the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """Advance and set self.tok_id, self.start_pos, self.end_pos.

        Also sets self.tag_name (None unless the new token is a tag).

        Args:
          comment_ok: if False, an HTML comment token is a parse error.

        Raises:
          htm8.ParseError: on a comment token when comment_ok is False.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()
        if self.tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
            self.tag_name = self.lexer.CanonicalTagName()
        else:
            self.tag_name = None

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments.
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

        if 0:
            part = self._CurrentString()
            log('[%3d - %3d] %r', self.start_pos, self.end_pos, part)

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance.

        Args:
          regex: pattern that must match at the START of the current token's
                 text (re.match semantics).

        Raises:
          htm8.ParseError: if the token isn't RawData or doesn't match.
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """Assert that we got a start or end tag with the given name; advance.

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the current token has a different kind or name.
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        if expected_tag != self.tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, self.tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """Skip the current token if it is RawData made only of whitespace.

        Any other token, including RawData with non-whitespace text, is left
        in place so the caller's next _Eat() reports it instead of silently
        dropping it.
        """
        if self.tok_id != h8_id.RawData:
            return

        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos)
        # r'\s*' also matches the empty string, so a successful match proves
        # nothing by itself.  The token is all whitespace only if the matched
        # run reaches the end of the token.
        if m is not None and m.end() >= self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Scans forward for a <table> start tag that is followed, after
        optional RawData tokens, by a <ul> start tag.  HTML comments are
        tolerated while scanning.

        Returns:
          The START position of the <ul>, or -1 if the end of the input is
          reached first.  On success, the parser is looking at the <ul>.

        Similar algorithm as html.ReadUntilStartTag()
        """
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            if self.tok_id == h8_id.StartTag and self.tag_name == 'table':
                # Skip text between <table> and whatever comes next
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                if self.tok_id == h8_id.StartTag and self.tag_name == 'ul':
                    return self.start_pos
        return -1

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html), or (None, None) if, after optional
          whitespace, the current token is not a start tag.

        Raises:
          htm8.ParseError: if the start tag isn't <li>, or if the input ends
            before the matching </li>.

        Grammar:

          LIST_ITEM =
            [RawData \s*]?
            [StartTag 'li']
            ANY*               # NOT context-free:
                               # - we MATCH <li> and </li> with a counter
                               # - We search for [StartEndTag 'cell-attrs']?
            [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
          <li>hi there</li> ==>
          <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
          <li><cell-attrs class=foo /> hi there </li> ==>
          <td class=foo> hi there </td>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        inner_html = None
        td_attrs = None  # Can we also have col-attrs?
        td_attrs_span = None

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos  # first byte after the opening <li>

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Without this check, an unterminated <li> would keep
                # reading at the end of the input forever.
                raise htm8.ParseError(
                    'Unexpected end of input: <li> without matching </li>')

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                # TODO: remove td-attrs backward compat
                if self.tag_name in ('td-attrs', 'cell-attrs'):
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()

            elif self.tok_id == h8_id.StartTag:
                if self.tag_name == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                if self.tag_name == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # We are looking at the matching </li>; step past it.
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        Called after the outer <ul> start tag has been consumed.

        Returns:
          A list of (td_attrs, inner_html) pairs, one per header cell.

        Raises:
          htm8.ParseError: if the tokens don't follow the grammar.

        Grammar:

          THEAD =
            [RawData \s*]?
            [StartTag 'li']
            [RawData thead\s+]
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells), where cells is a list of
          (td_attrs, inner_html) pairs.  Returns (None, None) if, after
          optional whitespace, the current token is not a start tag (e.g. it
          is the </ul> that ends the table).

        Raises:
          htm8.ParseError: if the tokens don't follow the grammar.

        Grammar:

          TR =
            [RawData \s*]?
            [StartTag 'li']
            [RawData tr\s*]
            ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
            [StartTag 'ul']   # Indented bullet that starts -
            LIST_ITEM*        # Defined above
            [RawData \s*]?
            [EndTag 'ul']
            [RawData \s*]?
            [EndTag 'li']
        """
        cells = []

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            if self.tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' %
                                      self.tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """Parse a whole ul-table; the parser must be looking at its <ul>.

        Returns:
          A dict with these keys:

          'thead'     list of (td_attrs, inner_html) header cells, or None
                      if the table has no thead item
          'tr'        list of (tr_attrs, cells) rows, where cells is a list
                      of (td_attrs, inner_html); inner_html is raw HTML that
                      you surround with <td>
          'ul_start'  position where the outer <ul> starts
          'ul_end'    position after the outer </ul> and any whitespace-only
                      text that follows it

        Raises:
          htm8.ParseError: if the tokens don't follow the grammar.

        Grammar:

          UL_TABLE =
            [StartTag 'ul']
            THEAD?  # this returns the number of cells, so it's NOT context
                    # free
            TR*
            [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()
        else:
            thead = None

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating because of colspan
            if 0:
                if thead and len(tr) != len(thead):
                    raise htm8.ParseError('Expected %d cells, got %d: %s' %
                                          (len(thead), len(tr), tr))

            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        if 0:
            log('table %s', table)
            from pprint import pprint
            pprint(table)

        return table
439
440
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine a column's header attributes with one cell's own attributes.

    Header attributes come first, in their original order.  When the cell
    has an attribute with the same name, its value is appended to the header
    value, separated by a space.  Cell attributes that were not merged this
    way follow, in their original order.

    Args:
      thead_td_attrs: (name, raw_value) pairs from the header cell, or None
      row_td_attrs: (name, raw_value) pairs from the row cell, or None

    Returns:
      A new list of (name, raw_value) pairs.
    """
    row_pairs = row_td_attrs or []
    # For a repeated name, the last value wins in this lookup.
    value_in_row = dict(row_pairs)

    consumed = set()
    result = []
    for attr_name, header_value in thead_td_attrs or []:
        extra = value_in_row.get(attr_name)
        if extra is None:
            result.append((attr_name, header_value))
        else:
            consumed.add(attr_name)
            result.append((attr_name, header_value + ' %s' % extra))

    result.extend(
        (attr_name, value) for attr_name, value in row_pairs
        if attr_name not in consumed)
    return result
470
471
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """ul-table: Write tables using bulleted lists.

    Each ul-table found by UlTableParser.FindUlTable() is replaced with
    <thead> / <tr> / <th> / <td> markup; all other text is copied through.

    Args:
      s: HTML document
      debug_out: accepted for callers that pass it; not used by this function

    Returns:
      The rewritten HTML document.
    """
    if debug_out is None:
        debug_out = []

    buf = StringIO()
    out = htm8.Output(s, buf)

    tag_lexer = htm8.TagLexer(s)
    lexer = htm8.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    def emit_attrs(attrs):
        # type: (Optional[TdAttrs]) -> None
        """Print each (name, raw_value) pair as ` name="raw_value"`."""
        for attr_name, raw in attrs or []:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % raw)

    while True:
        ul_start = parser.FindUlTable()
        if ul_start == -1:
            break

        # Copy everything before the <ul> verbatim
        out.PrintUntil(ul_start)

        table = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(table['ul_end'])

        # Write the header, remembering per-column attributes so that the
        # cells below can inherit them.
        header = table['thead']
        col_attrs = {}  # column index -> td_attrs
        if header:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (td_attrs, raw_html) in enumerate(header):
                if td_attrs:
                    col_attrs[col] = td_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(raw_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for tr_attrs, cells in table['tr']:
            out.Print('<tr')
            emit_attrs(tr_attrs)
            out.Print('>\n')

            for col, (row_td_attrs, raw_html) in enumerate(cells):
                # Inherited from header
                merged_attrs = MergeAttrs(col_attrs.get(col), row_td_attrs)

                out.Print(' <td')
                emit_attrs(merged_attrs)
                out.Print('>')

                out.Print(raw_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
560
561
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, strip comments, expand ul-tables,
    # and write the result to stdout.
    html_in = sys.stdin.read()
    sys.stdout.write(ReplaceTables(RemoveComments(html_in)))