doctools/ul_table.py

OILS / doctools / ul_table.py View on Github | oils.pub

571 lines, 287 significant

1	#!/usr/bin/env python2
2	"""ul_table.py: Markdown Tables Without New Syntax."""
3
4	from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
5
6	try:
7	from cStringIO import StringIO
8	except ImportError:
9	from io import StringIO # type: ignore
10	import re
11	import sys
12
13	from doctools.util import log
14	from data_lang import htm8
15	from typing import List
16	from typing import Optional
17	from typing import Tuple
18	from typing import Any
19	from typing import Dict
20
21
def RemoveComments(s):
    # type: (str) -> str
    """Return s with its <!-- comments --> stripped.

    This is a required preprocessing step for ul-table.

    Comments whose text contains 'REPLACE' are kept, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Raises:
      htm8.LexError: if the lexer reports an Invalid token.
    """
    buf = StringIO()
    out = htm8.Output(s, buf)
    lexer = htm8.Lexer(s)

    # tok_start is where the token just read begins, i.e. where the previous
    # token ended.
    tok_start = 0
    tok_id, tok_end = lexer.Read()
    while tok_id != h8_id.EndOfStream:
        if tok_id == h8_id.Invalid:
            raise htm8.LexError(s, tok_start)

        if tok_id == h8_id.Comment and 'REPLACE' not in s[tok_start:tok_end]:
            # Emit everything before the comment, then jump over it
            out.PrintUntil(tok_start)
            out.SkipTo(tok_end)

        tok_start = tok_end
        tok_id, tok_end = lexer.Read()

    out.PrintTheRest()
    return buf.getvalue()
51
52
# Matches a run of zero or more whitespace characters.  NOTE: the pattern can
# match the empty string, so match() succeeds at every position.  A caller
# that needs to know "this span is all whitespace" must also check where the
# match ends.
_WHITESPACE_RE = re.compile(r'\s*')

# Attributes of a table cell or row, as a list of (name, raw value) pairs.
TdAttrs = List[Tuple[str, str]]
56
57
class UlTableParser(object):
    """Parse the nested <ul> markup of a ul-table into a plain structure.

    Tokens are pulled from an htm8.Lexer one at a time; the current token is
    kept in (tok_id, start_pos, end_pos).  The htm8.TagLexer is pointed at the
    current token whenever a tag name or its attributes are needed.

    Usage: call FindUlTable() to position the parser on a <ul> that follows a
    <table>, then call ParseTable().
    """

    def __init__(self, lexer, tag_lexer):
        # type: (htm8.Lexer, htm8.TagLexer) -> None
        """
        Args:
          lexer: token source
          tag_lexer: used to read tag names and attributes.  Presumably built
            over the same string as lexer -- _ListItem slices tag_lexer.s
            with positions that came from lexer.
        """
        self.lexer = lexer
        self.tag_lexer = tag_lexer

        # Current token.  Invalid until the first _Next().
        self.tok_id = h8_id.Invalid
        self.start_pos = 0
        self.end_pos = 0

    def _CurrentString(self):
        # type: () -> str
        """Return the source text spanned by the current token."""
        return self.lexer.s[self.start_pos:self.end_pos]

    def _Next(self, comment_ok=False):
        # type: (bool) -> None
        """Advance and set self.tok_id, self.start_pos, self.end_pos

        Args:
          comment_ok: if false, landing on an HTML comment is an error

        Raises:
          htm8.ParseError: on a Comment token, unless comment_ok is true
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos = self.lexer.Read()

        # Should have called RemoveComments() beforehand.  That can still
        # leave some REPLACE comments
        if not comment_ok and self.tok_id == h8_id.Comment:
            raise htm8.ParseError('Unexpected HTML comment')

    def _EatRawData(self, regex):
        # type: (str) -> None
        """Assert that we got text data matching a regex, and advance

        The regex is applied with re.match(), so it is anchored only at the
        START of the token text; trailing text is not rejected.

        Raises:
          htm8.ParseError: if the current token isn't RawData, or its text
            doesn't match
        """
        if self.tok_id != h8_id.RawData:
            raise htm8.ParseError('Expected RawData, got %s' %
                                  h8_id_str(self.tok_id))
        actual = self._CurrentString()
        m = re.match(regex, actual)  # could compile this
        if m is None:
            raise htm8.ParseError('Expected to match %r, got %r' %
                                  (regex, actual))
        self._Next()

    def _Eat(self, expected_id, expected_tag):
        # type: (h8_id_t, str) -> None
        """Assert that we got a start or end tag, with the given name, and
        advance

        Args:
          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.

        Raises:
          htm8.ParseError: if the token kind or the tag name differs
        """
        assert expected_id in (h8_id.StartTag,
                               h8_id.EndTag), h8_id_str(expected_id)

        if self.tok_id != expected_id:
            raise htm8.ParseError(
                'Expected token %s, got %s' %
                (h8_id_str(expected_id), h8_id_str(self.tok_id)))
        self.tag_lexer.Reset(self.start_pos, self.end_pos)
        tag_name = self.tag_lexer.GetTagName()
        if expected_tag != tag_name:
            raise htm8.ParseError('Expected tag %r, got %r' %
                                  (expected_tag, tag_name))

        self._Next()

    def _WhitespaceOk(self):
        # type: () -> None
        """Skip the current token if it is RawData made only of whitespace."""
        if self.tok_id != h8_id.RawData:
            return

        # _WHITESPACE_RE can match the empty string, so a bare match() is
        # always truthy.  Bound the match to the token and require it to reach
        # the token's end; otherwise arbitrary text would be skipped (and
        # dropped from the output) as if it were whitespace.
        m = _WHITESPACE_RE.match(self.lexer.s, self.start_pos, self.end_pos)
        if m is not None and m.end() == self.end_pos:
            self._Next()

    def FindUlTable(self):
        # type: () -> int
        """Find <table ...> <ul>

        Only RawData tokens may sit between the <table> start tag and the
        <ul> start tag.  Comments are tolerated while searching.

        Returns:
          The START position of the <ul>, or -1 if the end of the input is
          reached first.

        Similar algorithm as html.ReadUntilStartTag()
        """
        tag_lexer = self.tag_lexer

        # Find first table.  This loop is only left by the returns below.
        while True:
            self._Next(comment_ok=True)
            if self.tok_id == h8_id.EndOfStream:
                return -1

            tag_lexer.Reset(self.start_pos, self.end_pos)
            if (self.tok_id == h8_id.StartTag and
                    tag_lexer.GetTagName() == 'table'):
                # Skip text after <table>, stopping on the first other token
                while True:
                    self._Next(comment_ok=True)
                    if self.tok_id != h8_id.RawData:
                        break

                tag_lexer.Reset(self.start_pos, self.end_pos)
                if (self.tok_id == h8_id.StartTag and
                        tag_lexer.GetTagName() == 'ul'):
                    return self.start_pos

    def _ListItem(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[str]]
        r"""Parse a list item nested below thead or tr.

        Returns:
          A pair (td_attrs, inner_html).  It is (None, None) when, after
          optional whitespace, the current token is not a start tag -- that
          is how callers detect the end of the cells.  td_attrs is None when
          the item has no <cell-attrs />.

        Raises:
          htm8.ParseError: if the start tag isn't <li>, or the input ends
            before the matching </li>

        Grammar:

        LIST_ITEM =
          [RawData \s*]?
          [StartTag 'li']
          ANY*               # NOT context-free:
                             # - we MATCH <li> and </li> with a counter
                             # - We search for [StartEndTag 'cell-attrs']?
          [EndTag 'li']

        Example of attribute borrowing:

        - hi there ==>
        <li>hi there</li> ==>
        <td>hi there</td>

        - <cell-attrs class=foo /> hi there ==>
        <li><cell-attrs class=foo /> hi there </li> ==>
        <td class=foo> hi there </td> ==>
        """
        self._WhitespaceOk()

        if self.tok_id != h8_id.StartTag:
            return None, None

        td_attrs = None  # type: Optional[TdAttrs]
        # Can we also have col-attrs?
        td_attrs_span = None  # type: Optional[Tuple[int, int]]

        self._Eat(h8_id.StartTag, 'li')

        left = self.start_pos  # start of the first token after <li>

        # Find the closing </li>, taking into account NESTED tags:
        #    <li> <li>foo</li> </li>
        # because cells can have bulleted lists
        balance = 0
        while True:
            if self.tok_id == h8_id.EndOfStream:
                # Nothing else in this loop stops at the end of the input, so
                # without this an unterminated <li> would presumably spin
                # forever.
                raise htm8.ParseError('Expected </li>, got EndOfStream')

            if self.tok_id == h8_id.StartEndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                tag_name = self.tag_lexer.GetTagName()
                # TODO: remove td-attrs backward compat
                if tag_name in ('td-attrs', 'cell-attrs'):
                    # If there are several, the last one wins
                    td_attrs_span = self.start_pos, self.end_pos
                    td_attrs = self.tag_lexer.AllAttrsRaw()

            elif self.tok_id == h8_id.StartTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.GetTagName() == 'li':
                    balance += 1

            elif self.tok_id == h8_id.EndTag:
                self.tag_lexer.Reset(self.start_pos, self.end_pos)
                if self.tag_lexer.GetTagName() == 'li':
                    balance -= 1
                    if balance < 0:
                        break
            self._Next()

        right = self.start_pos  # start of the end tag

        s = self.tag_lexer.s
        if td_attrs_span:
            # everything except the <cell-attrs />
            inner_html = s[left:td_attrs_span[0]] + s[td_attrs_span[1]:right]
        else:
            inner_html = s[left:right]

        # Move past the </li> that ended the loop
        self._Next()

        return td_attrs, inner_html

    def _ParseTHead(self):
        # type: () -> List[Tuple[Optional[TdAttrs], str]]
        r"""Parse the <li>thead item and its nested <ul> of header cells.

        Assumes the outer <ul> start tag has already been consumed.

        Returns:
          A list of (td_attrs, inner_html), one per header cell

        Grammar:

        THEAD =
          [RawData \s*]?
          [StartTag 'li']
          [RawData thead\s+]
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # this code accepts zero cells
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']

        Two Algorithms:

        1. Replacement:
           - skip over the first ul 'thead' li, and ul 'tr' li
           - then replace the next ul -> tr, and li -> td
        2. Parsing and Rendering:
           - parse them into a structure
           - skip all the text
           - print your own HTML

        I think the second one is better, because it allows attribute
        extensions to thead

        - thead
          - name [link][]
            - colgroup=foo align=left
          - age
            - colgroup=foo align=right
        """
        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()
        self._Eat(h8_id.StartTag, 'li')

        # In CommonMark, r'thead\n' is enough, because it strips trailing
        # whitespace.  I'm not sure if other Markdown processors do that, so
        # use r'thead\s+'.
        self._EatRawData(r'thead\s+')

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return cells

    def _ParseTr(self):
        # type: () -> Tuple[Optional[TdAttrs], Optional[List[Tuple[Optional[TdAttrs], str]]]]
        r"""Parse one <li>tr item and its nested <ul> of cells.

        Returns:
          A pair (tr_attrs, cells).  It is (None, None) when, after optional
          whitespace, the current token is not a start tag -- e.g. the </ul>
          that closes the table.  tr_attrs is None when there is no
          <row-attrs />.

        Raises:
          htm8.ParseError: if the markup doesn't follow the grammar

        Grammar:

        TR =
          [RawData \s*]?
          [StartTag 'li']
          [RawData tr\s*]
          ( [StartEndTag 'row-attrs'] [RawData \s*]? )?
          [StartTag 'ul']   # Indented bullet that starts -
          LIST_ITEM*        # Defined above; this code accepts zero cells
          [RawData \s*]?
          [EndTag 'ul']
          [RawData \s*]?
          [EndTag 'li']
        """
        cells = []  # type: List[Tuple[Optional[TdAttrs], str]]

        self._WhitespaceOk()

        # Could be a </ul>
        if self.tok_id != h8_id.StartTag:
            return None, None

        self._Eat(h8_id.StartTag, 'li')

        self._EatRawData(r'tr\s*')

        tr_attrs = None  # type: Optional[TdAttrs]
        if self.tok_id == h8_id.StartEndTag:
            self.tag_lexer.Reset(self.start_pos, self.end_pos)
            tag_name = self.tag_lexer.GetTagName()
            if tag_name != 'row-attrs':
                raise htm8.ParseError('Expected row-attrs, got %r' % tag_name)
            tr_attrs = self.tag_lexer.AllAttrsRaw()
            self._Next()
            self._WhitespaceOk()

        # This is the row data
        self._Eat(h8_id.StartTag, 'ul')

        while True:
            td_attrs, inner_html = self._ListItem()
            if inner_html is None:
                break
            cells.append((td_attrs, inner_html))
            # TODO: assert

        self._WhitespaceOk()

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()
        self._Eat(h8_id.EndTag, 'li')

        return tr_attrs, cells

    def ParseTable(self):
        # type: () -> Dict[str, Any]
        """Parse a whole ul-table, starting at its outer <ul> start tag.

        Returns:
          A dict like this

          { 'thead': [ (td_attrs, 'col1 html'), ... ],  # or None if absent
            'tr': [  # raw HTML that you surround with <td>
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
              (tr_attrs, [ (td_attrs, 'cell1 html'), ... ]),
            ],
            'ul_start': 10,  # start position of the outer <ul>
            'ul_end': 99,    # position after the outer </ul> and any
                             # whitespace-only text that follows it
          }

        Raises:
          htm8.ParseError: if the markup doesn't follow the grammar

        Grammar:

        UL_TABLE =
          [StartTag 'ul']
          THEAD?  # this returns the number of cells, so it's NOT context
                  # free
          TR*
          [EndTag 'ul']
        """
        table = {'tr': []}  # type: Dict[str, Any]

        ul_start = self.start_pos
        self._Eat(h8_id.StartTag, 'ul')

        thead = None  # type: Optional[List[Tuple[Optional[TdAttrs], str]]]
        # Look ahead 2 or 3 tokens:
        if self.lexer.LookAhead(r'\s*<li>thead\s+'):
            thead = self._ParseTHead()

        while True:
            tr_attrs, tr = self._ParseTr()
            if tr is None:
                break
            # Not validating len(tr) against len(thead), because of colspan
            table['tr'].append((tr_attrs, tr))

        self._Eat(h8_id.EndTag, 'ul')

        self._WhitespaceOk()

        ul_end = self.start_pos

        table['thead'] = thead
        table['ul_start'] = ul_start
        table['ul_end'] = ul_end

        return table
443
444
def MergeAttrs(
        thead_td_attrs,  # type: Optional[TdAttrs]
        row_td_attrs,  # type: Optional[TdAttrs]
):
    # type: (...) -> TdAttrs
    """Combine the attributes a cell inherits from its column with its own.

    Args:
      thead_td_attrs: (name, raw value) pairs from the header cell, or None
      row_td_attrs: (name, raw value) pairs from the cell itself, or None

    Returns:
      A new list.  Header attributes come first, in order; when the cell has
      an attribute of the same name, its value is appended after a space.
      The cell's remaining attributes follow, in order.
    """
    header_pairs = thead_td_attrs or []
    cell_pairs = row_td_attrs or []

    # For a name repeated in the cell's attributes, the last value is used
    cell_values = dict(cell_pairs)

    combined = []
    consumed = set()  # names already folded into a header attribute
    for attr_name, header_value in header_pairs:
        extra = cell_values.get(attr_name)
        if extra is None:
            combined.append((attr_name, header_value))
        else:
            consumed.add(attr_name)
            combined.append((attr_name, header_value + ' %s' % extra))

    combined.extend(pair for pair in cell_pairs if pair[0] not in consumed)
    return combined
474
475
def ReplaceTables(s, debug_out=None):
    # type: (str, Optional[Any]) -> str
    """ul-table: Write tables using bulleted list

    Every <ul> found by UlTableParser.FindUlTable() is replaced with
    <thead> / <tr> / <th> / <td> markup; the rest of s is copied unchanged.

    Args:
      s: HTML text
      debug_out: accepted, but not written to by this function

    Returns:
      The rewritten HTML
    """
    debug_out = [] if debug_out is None else debug_out

    buf = StringIO()
    out = htm8.Output(s, buf)

    tag_lexer = htm8.TagLexer(s)
    lexer = htm8.Lexer(s)
    parser = UlTableParser(lexer, tag_lexer)

    def _PrintAttrs(pairs):
        # type: (Optional[TdAttrs]) -> None
        """Print name="value" for each pair, each preceded by a space."""
        for attr_name, attr_value in pairs or []:
            out.Print(' ')
            out.Print(attr_name)
            # No escaping because it's raw.  It can't contain quotes.
            out.Print('="%s"' % attr_value)

    while True:
        table_start = parser.FindUlTable()
        if table_start == -1:
            break

        out.PrintUntil(table_start)

        parsed = parser.ParseTable()

        # Don't write the matching </ul> of the LAST row, but write
        # everything after that
        out.SkipTo(parsed['ul_end'])

        # Write the header
        header = parsed['thead']

        inherited = {}  # column index -> td_attrs
        if header:
            out.Print('<thead>\n')
            out.Print('<tr>\n')

            for col, (th_attrs, th_html) in enumerate(header):
                if th_attrs:
                    inherited[col] = th_attrs
                # <th> tag is more semantic, and styled bold by default
                out.Print(' <th>')
                out.Print(th_html)
                out.Print('</th>\n')

            out.Print('</tr>\n')
            out.Print('</thead>\n')

        # Write each row
        for row_attrs, cells in parsed['tr']:
            # Print tr tag and attrs
            out.Print('<tr')
            _PrintAttrs(row_attrs)
            out.Print('>\n')

            # Print cells
            for col, (cell_attrs, cell_html) in enumerate(cells):
                # Attributes inherited from the header come first
                out.Print(' <td')
                _PrintAttrs(MergeAttrs(inherited.get(col), cell_attrs))
                out.Print('>')

                out.Print(cell_html)
                out.Print('</td>\n')
            out.Print('</tr>\n')

    out.PrintTheRest()

    return buf.getvalue()
564
565
if __name__ == '__main__':
    # Simple CLI filter: read HTML on stdin, strip comments, expand ul-tables,
    # and write the result to stdout
    sys.stdout.write(ReplaceTables(RemoveComments(sys.stdin.read())))