doctools/html

OILS / doctools / html_old.py View on Github | oils.pub

377 lines, 172 significant

1	#!/usr/bin/env python2
2	"""
3	doctools/html_old.py - APIs that should be replaced by data_lang/htm8.py
4
5	See doc/lazylex.md for details.
6	"""
7	from __future__ import print_function
8
9	import re
10
11	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
12	h8_tag_id_str)
13	from data_lang.htm8 import (Lexer, LexError, ParseError, Output, _NAME_RE)
14	from doctools.util import log
15
16	_ = log
17
18	try:
19	from cStringIO import StringIO
20	except ImportError:
21	# for python3
22	from io import StringIO # type: ignore
23	import sys
24
25	if sys.version_info.major == 2:
26	from typing import List, Tuple, Iterator, Optional
27
28
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Yield (token id, end position) pairs produced by a Lexer over s.

    Args:
      s: string to parse
      left_pos, right_pos: span boundaries, handed to Lexer unchanged.

    The h8_id.EndOfStream token is yielded too; it is always the last pair.
    Invalid tokens are NOT filtered here - see ValidTokens().
    """
    lexer = Lexer(s, left_pos, right_pos)
    tok_id = None  # nothing read yet
    while tok_id != h8_id.EndOfStream:
        tok_id, end_pos = lexer.Read()
        yield tok_id, end_pos
42
43
def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Like _Tokens(), but raise instead of yielding an Invalid token.

    Callers therefore never have to handle h8_id.Invalid themselves.

    Raises:
      LexError: on the first Invalid token, reported at the position where
        that token starts (the end of the previous token, or left_pos).

    This stays separate from _Tokens() because a 'yield' transformation on
    the token stream may be wanted later, and exceptions might complicate
    that.
    """
    tok_start = left_pos
    for pair in _Tokens(s, left_pos, right_pos):
        tok_id, end_pos = pair
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, tok_start)
        # The next token begins where this one ended
        tok_start = end_pos
        yield pair
58
59
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Consume tokens from `it` up to and including the next <tag_name>.

    Returns:
      The (start, end) positions of the matching start tag.

    Raises:
      ParseError: if `it` runs out before such a tag is seen.

    tag_lexer is RESET for every token that is read, so on return it points
    at the matching tag.

    NOTE: the first token read is assumed to begin at position 0; later
    starts are the previous token's end position.
    """
    start = 0
    for tok_id, end_pos in it:
        tag_lexer.Reset(start, end_pos)
        is_start_tag = (tok_id == h8_id.StartTag)
        # Only ask for the tag name when the token really is a start tag
        if is_start_tag and tag_lexer.GetTagName() == tag_name:
            return start, end_pos
        start = end_pos

    raise ParseError('No start tag %r' % tag_name)
81
82
def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Consume tokens from `it` up to and including the next </tag_name>.

    Returns:
      The (start, end) positions of the matching end tag.

    Raises:
      ParseError: if `it` runs out before such a tag is seen.

    tag_lexer is RESET for every token that is read, so on return it points
    at the matching tag.

    NOTE: the first token read is assumed to begin at position 0; later
    starts are the previous token's end position.
    """
    start = 0
    for tok_id, end_pos in it:
        tag_lexer.Reset(start, end_pos)
        is_end_tag = (tok_id == h8_id.EndTag)
        # Only ask for the tag name when the token really is an end tag
        if is_end_tag and tag_lexer.GetTagName() == tag_name:
            return start, end_pos
        start = end_pos

    raise ParseError('No end tag %r' % tag_name)
104
105
# Named character entities, keyed by the name that appears between '&' and
# ';', mapped to the character each one stands for.
CHAR_ENTITY = dict(
    amp='&',
    lt='<',
    gt='>',
    quot='"',
    apos="'",
)
113
114
def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Convert HTML to text: drop markup and decode entities like &gt; &lt;

    Raw data and stray & < > characters are copied through, named character
    entities are replaced using CHAR_ENTITY, and every other token (tags,
    comments, ...) is skipped.

    Raises:
      AssertionError: on hex or decimal character references, which are not
        handled yet.
      KeyError: on a named entity that is not in CHAR_ENTITY.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.
    """
    f = StringIO()
    out = Output(s, f, left_pos, right_pos)

    # Tokens whose source text is emitted verbatim
    verbatim_ids = (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                    h8_id.BadLessThan)

    tok_start = left_pos
    for tok_id, tok_end in ValidTokens(s, left_pos, right_pos):
        # Not handling numeric character references yet
        if tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[tok_start:tok_start + 20])
        if tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[tok_start:tok_start + 20])

        if tok_id in verbatim_ids:
            out.SkipTo(tok_start)
            out.PrintUntil(tok_end)

        elif tok_id == h8_id.CharEntity:
            # Strip the leading '&' and the trailing ';' to get the name
            entity_name = s[tok_start + 1:tok_end - 1]

            out.SkipTo(tok_start)
            out.Print(CHAR_ENTITY[entity_name])
            out.SkipTo(tok_end)

        else:
            # Skip everything else
            out.SkipTo(tok_end)

        tok_start = tok_end

    out.PrintTheRest()
    return f.getvalue()
163
164
165	#
166	# OLD TagLexer API - REMOVE THIS
167	#
168	# HTML 5 doesn't restrict tag names at all
169	# https://html.spec.whatwg.org/#toc-syntax
170	#
171	# XML allows : - .
172	# https://www.w3.org/TR/xml/#NT-NameChar
173
174	# Namespaces for MathML, SVG
175	# XLink, XML, XMLNS
176	#
177	# https://infra.spec.whatwg.org/#namespaces
178	#
179	# Allow - for td-attrs
180
181	# Similar to _UNQUOTED_VALUE in data_lang/htm8.py
# Pattern text for an unquoted attribute value: any run of characters other
# than space, tab, CR, LF, < > & " ' and NUL.  It can match the empty string.
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''

# Start of a tag, applied just after the opening '<': an optional '/',
# optional whitespace, then the tag name in group 1.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME_RE, re.VERBOSE)

# End of a tag: optional whitespace, an optional '/', then '>'.
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
187
188	# To match href="foo"
189
190	# <button disabled> is standard usage
191
# NOTE: _ATTR_RE below still accepts whitespace around =
#   <a foo = "bar">   makes sense in XML
# But then you also have
#   <a foo= bar>      which is TWO attributes, in HTML5
# So the space is problematic
197
# Matches ONE attribute inside a tag: required leading whitespace, the
# attribute name (group 1), and an optional '= value' part.
#
# When a value is present, exactly one of these groups participates:
#   group 2 - contents of a double quoted value
#   group 3 - contents of a single quoted value
#   group 4 - an unquoted value (possibly empty)
# When there is no '=' at all (<button disabled>), groups 2-4 are all None.
#
# The alternatives must be separated by a bare '|'.  An escaped '\|' is a
# literal pipe character in Python's re syntax, which would turn the three
# alternatives into one sequence that no real attribute value can match.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME_RE, _UNQUOTED_VALUE_OLD), re.VERBOSE)
211
212
class TagLexer(object):
    """
    Lexes the inside of a single tag, like <a href="..."> or
    <link type="..." />.

    Select a tag with Reset(start_pos, end_pos); the lexer can then answer:

    - What is the tag name?
    - What are the attributes, as (name, value_start_pos, value_end_pos)?
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        # -1 is an invalid position: no tag has been selected yet
        self.start_pos = -1
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Point this (reusable) lexer at the tag in s[start_pos:end_pos]."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the text of the current tag, e.g. <a href='foo'>"""
        whole = slice(self.start_pos, self.end_pos)
        return self.s[whole]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is the first token of the tag."""
        _, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """
        Return the (start, end) span of the value of the first attribute
        called attr_name, or (-1, -1) when the tag has no such attribute.

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                if self.s[start:end] != attr_name:
                    continue

                # The value should come next
                tok_id, start, end = next(events)
                assert tok_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                return start, end
        except StopIteration:
            pass
        return (-1, -1)

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """
        Return the text of the attribute's value, which may be UNESCAPED,
        or None when the attribute is absent.
        """
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start:end]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of (name, value_start, value_end) for every attribute,
        e.g. [('class', 3, 5), ('href', 9, 12)]
        """
        found = []
        events = self.Tokens()
        try:
            for tok_id, start, end in events:
                if tok_id != h8_tag_id.AttrName:
                    continue
                attr_name = self.s[start:end]

                # The value should come next
                tok_id, start, end = next(events)
                assert tok_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                found.append((attr_name, start, end))
        except StopIteration:
            pass
        return found

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of (name, raw value) pairs, e.g.
        [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some
        unwanted characters.

        Raises:
          RuntimeError: if no tag name follows the first character.
          LexError: if something other than the closing '>' follows the
            attributes.
        """
        # Skip the first character of the span (the '<')
        tag_match = _TAG_RE.match(self.s, self.start_pos + 1)
        if not tag_match:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_match.start(1), tag_match.end(1)

        # (regex group, token type) for each way a value can be written;
        # single quoted values share the token type of double quoted ones
        value_groups = (
            (2, h8_tag_id.QuotedValue),
            (3, h8_tag_id.QuotedValue),
            (4, h8_tag_id.UnquotedValue),
        )

        pos = tag_match.end(0)
        while True:
            # don't search past the end
            attr_match = _ATTR_RE.match(self.s, pos, self.end_pos)
            if not attr_match:
                break

            yield h8_tag_id.AttrName, attr_match.start(1), attr_match.end(1)

            for group_num, value_id in value_groups:
                if attr_match.group(group_num) is not None:
                    yield (value_id, attr_match.start(group_num),
                           attr_match.end(group_num))
                    break
            else:
                # <button disabled> - empty span right after the name
                empty_at = attr_match.end(0)
                yield h8_tag_id.MissingValue, empty_at, empty_at

            # Continue after the whole attribute, including a closing quote
            pos = attr_match.end(0)

        if not _TAG_LAST_RE.match(self.s, pos):
            raise LexError('Extra data at end of tag', self.s, pos)