lazylex/html.py

OILS / lazylex / html.py View on Github | oils.pub

378 lines, 172 significant

1	#!/usr/bin/env python2
2	"""
3	lazylex/html.py - Wrapper around HTM8
4
5	See doc/lazylex.md for details.
6
7	"""
8	from __future__ import print_function
9
10	import re
11
12	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_tag_id, h8_tag_id_t,
13	h8_tag_id_str)
14	from data_lang.htm8 import (Lexer, LexError, ParseError, Output, _NAME_RE)
15	from doctools.util import log
16
17	_ = log
18
19	try:
20	from cStringIO import StringIO
21	except ImportError:
22	# for python3
23	from io import StringIO # type: ignore
24	import sys
25
26	if sys.version_info.major == 2:
27	from typing import List, Tuple, Iterator, Optional
28
29
def _Tokens(s, left_pos, right_pos):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Yield every (token id, position) pair that a Lexer reads from s.

    Args:
      s: string to parse
      left_pos, right_pos: span boundaries, handed to Lexer unchanged.

    The h8_id.EndOfStream pair is yielded as well, and it is always the last
    one.  Nothing is filtered here, so h8_id.Invalid can come out too.
    """
    lexer = Lexer(s, left_pos, right_pos)
    done = False
    while not done:
        tok_id, tok_pos = lexer.Read()
        # Decide before yielding, so the end-of-stream pair is still emitted
        done = (tok_id == h8_id.EndOfStream)
        yield tok_id, tok_pos
43
44
def ValidTokens(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
    """Like _Tokens(), but callers never see an Invalid token.

    Yields the same (token id, end position) pairs as _Tokens().

    Raises:
      LexError: when an h8_id.Invalid token comes up.  The error carries the
        position where that token began, i.e. the end of the previous token
        (or left_pos for the first one).

    This is kept separate from _Tokens() on purpose: a 'yield' transformation
    of Tokens() may be wanted later, and exceptions could complicate that.
    """
    tok_start = left_pos
    for pair in _Tokens(s, left_pos, right_pos):
        tok_id, tok_end = pair
        if tok_id == h8_id.Invalid:
            raise LexError('ValidTokens() got invalid token', s, tok_start)
        yield pair
        # The next token begins where this one ended
        tok_start = tok_end
59
60
def ReadUntilStartTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Advance `it` to the next <tag_name> and return its (start, end) span.

    Args:
      it: iterator of (token id, end position) pairs; it is consumed up to and
        including the matching tag.
      tag_lexer: RESET on every token read.  On success it is left pointing at
        the tag that was found.
      tag_name: name to compare against tag_lexer.GetTagName()

    Raises:
      ParseError: when `it` runs out before a matching start tag shows up.

    Note: position tracking starts at 0, so the start reported for the very
    first token is only meaningful when `it` began lexing at offset 0.
    """
    tok_start = 0
    for tok_id, tok_end in it:
        tag_lexer.Reset(tok_start, tok_end)
        # GetTagName() is only consulted for StartTag tokens (short circuit)
        if tok_id == h8_id.StartTag and tag_lexer.GetTagName() == tag_name:
            return tok_start, tok_end
        tok_start = tok_end

    raise ParseError('No start tag %r' % tag_name)
82
83
def ReadUntilEndTag(it, tag_lexer, tag_name):
    # type: (Iterator[Tuple[h8_id_t, int]], TagLexer, str) -> Tuple[int, int]
    """Advance `it` to the next </tag_name> and return its (start, end) span.

    Args:
      it: iterator of (token id, end position) pairs; it is consumed up to and
        including the matching tag.
      tag_lexer: RESET on every token read.  On success it is left pointing at
        the tag that was found.
      tag_name: name to compare against tag_lexer.GetTagName()

    Raises:
      ParseError: when `it` runs out before a matching end tag shows up.

    Note: position tracking starts at 0, so the start reported for the very
    first token is only meaningful when `it` began lexing at offset 0.
    """
    tok_start = 0
    for tok_id, tok_end in it:
        tag_lexer.Reset(tok_start, tok_end)
        # GetTagName() is only consulted for EndTag tokens (short circuit)
        if tok_id == h8_id.EndTag and tag_lexer.GetTagName() == tag_name:
            return tok_start, tok_end
        tok_start = tok_end

    raise ParseError('No end tag %r' % tag_name)
105
106
# Entity name -> replacement character.  ToText() looks names up here; any
# name that is missing makes that lookup raise KeyError.
CHAR_ENTITY = dict(
    amp='&',
    lt='<',
    gt='>',
    quot='"',
    apos="'",
)
114
115
def ToText(s, left_pos=0, right_pos=-1):
    # type: (str, int, int) -> str
    """Given HTML, return text by unquoting &gt; and &lt; etc.

    Raw data and the "bad" & < > tokens are copied through as they are,
    character entities found in CHAR_ENTITY are replaced, and every other
    token is skipped.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctools/help_gen.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Raises:
      AssertionError: on hex or decimal character references, which are not
        handled yet.
      KeyError: on an entity name that is not in CHAR_ENTITY.
      LexError: from ValidTokens(), on an invalid token.
    """
    buf = StringIO()
    out = Output(s, buf, left_pos, right_pos)

    # Token kinds whose source text is emitted unchanged
    verbatim_ids = (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
                    h8_id.BadLessThan)

    tok_start = left_pos
    for tok_id, tok_end in ValidTokens(s, left_pos, right_pos):
        if tok_id in verbatim_ids:
            out.SkipTo(tok_start)
            out.PrintUntil(tok_end)

        elif tok_id == h8_id.CharEntity:
            # Drop the first and last character of the token (the '&' and
            # ';' of e.g. '&amp;') to get the bare entity name
            name = s[tok_start + 1:tok_end - 1]

            out.SkipTo(tok_start)
            out.Print(CHAR_ENTITY[name])
            out.SkipTo(tok_end)

        # Not handling these yet
        elif tok_id == h8_id.HexChar:
            raise AssertionError('Hex Char %r' % s[tok_start:tok_start + 20])

        elif tok_id == h8_id.DecChar:
            raise AssertionError('Dec Char %r' % s[tok_start:tok_start + 20])

        else:
            # Everything else is left out of the output
            out.SkipTo(tok_end)

        tok_start = tok_end

    out.PrintTheRest()
    return buf.getvalue()
164
165
166	#
167	# OLD TagLexer API - REMOVE THIS
168	#
169	# HTML 5 doesn't restrict tag names at all
170	# https://html.spec.whatwg.org/#toc-syntax
171	#
172	# XML allows : - .
173	# https://www.w3.org/TR/xml/#NT-NameChar
174
175	# Namespaces for MathML, SVG
176	# XLink, XML, XMLNS
177	#
178	# https://infra.spec.whatwg.org/#namespaces
179	#
180	# Allow - for td-attrs
181
# Similar to _UNQUOTED_VALUE in data_lang/htm8.py
#
# An unquoted attribute value: zero or more characters, none of which is a
# space, tab, CR, LF, <, >, &, double quote, single quote, or NUL.
_UNQUOTED_VALUE_OLD = r'''[^ \t\r\n<>&"'\x00]*'''

# The tag name, matched just past the opening '<' (see TagLexer.Tokens): an
# optional '/', optional whitespace, then the name captured as group 1.
# Under re.VERBOSE the spaces in the pattern are not significant.
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME_RE, re.VERBOSE)

# The end of a tag: optional whitespace, an optional '/', then '>'.
_TAG_LAST_RE = re.compile(r'\s* /? >', re.VERBOSE)
188
189	# To match href="foo"
190
191	# <button disabled> is standard usage
192
193	# NOTE: This used to allow whitespace around =
194	# <a foo = "bar"> makes sense in XML
195	# But then you also have
196	# <a foo= bar> - which is TWO attributes, in HTML5
197	# So the space is problematic
198
# Matches one attribute inside a tag, e.g. ` href="foo"`, ` class=x` or
# ` disabled`.  Capture groups:
#
#   1 - attribute name
#   2 - double quoted value, without the quotes
#   3 - single quoted value, without the quotes
#   4 - unquoted value
#
# Groups 2-4 are all None when the attribute has no value at all.
#
# The three value forms are ALTERNATIVES, so they must be separated by a bare
# '|'.  Writing '\|' would match a literal pipe character instead, and fold
# the three forms into one sequence that no real attribute matches.
_ATTR_RE = re.compile(
    r'''
\s+                     # Leading whitespace is required
(%s)                    # Attribute name
(?:                     # Optional attribute value
  \s* = \s*             # Spaces allowed around =
  (?:
    " ([^>"\x00]*) "    # double quoted value
  | ' ([^>'\x00]*) '    # single quoted value
  | (%s)                # Attribute value
  )
)?
''' % (_NAME_RE, _UNQUOTED_VALUE_OLD), re.VERBOSE)
212
213
class TagLexer(object):
    """Lexes the inside of one tag, like <a href="..."> or <link type="..." />.

    An instance holds the whole document string; Reset() selects the span of
    the tag to look at, so one instance can be reused for many tags.

    It answers two questions:

    - What is the tag?
    - What are the attributes, as (name, value_start_pos, value_end_pos)?
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s
        # -1 is Invalid: no tag has been selected with Reset() yet
        self.start_pos = -1
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        # type: (int, int) -> None
        """Select the tag at s[start_pos:end_pos], so instances can be reused."""
        assert start_pos >= 0, start_pos
        assert end_pos >= 0, end_pos

        self.start_pos, self.end_pos = start_pos, end_pos

    def WholeTagString(self):
        # type: () -> str
        """Return the entire tag string, e.g. <a href='foo'>"""
        begin, finish = self.start_pos, self.end_pos
        return self.s[begin:finish]

    def GetTagName(self):
        # type: () -> str
        """Return the tag name, which is the first token from Tokens()."""
        _tok_id, name_start, name_end = next(self.Tokens())
        return self.s[name_start:name_end]

    def GetSpanForAttrValue(self, attr_name):
        # type: (str) -> Tuple[int, int]
        """Return the (start, end) span of the value of the first attr_name.

        Returns (-1, -1) when the tag has no such attribute.

        Used by oils_doc.py, for href shortcuts
        """
        # Algorithm: search for the value token right after a matching AttrName
        # TODO: Could also cache these
        tokens = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(tokens)
                if tok_id != h8_tag_id.AttrName:
                    continue
                if self.s[start:end] != attr_name:
                    continue

                # The value should come next
                val_id, val_start, val_end = next(tokens)
                assert val_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                return val_start, val_end
        except StopIteration:
            # Ran out of tokens without finding the attribute
            return -1, -1

    def GetAttrRaw(self, attr_name):
        # type: (str) -> Optional[str]
        """Return the value of attr_name, or None if the attribute is absent.

        The value is a raw slice of the input, so it may be UNESCAPED.
        """
        span = self.GetSpanForAttrValue(attr_name)
        if span[0] == -1:
            return None
        return self.s[span[0]:span[1]]

    def AllAttrsRawSlice(self):
        # type: () -> List[Tuple[str, int, int]]
        """
        Get a list of triples [('class', 3, 5), ('href', 9, 12)]

        Each one is (attribute name, value start, value end).
        """
        result = []
        tokens = self.Tokens()
        try:
            while True:
                tok_id, start, end = next(tokens)
                if tok_id != h8_tag_id.AttrName:
                    continue
                name = self.s[start:end]

                # The value should come next
                val_id, val_start, val_end = next(tokens)
                assert val_id in (
                    h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
                    h8_tag_id.MissingValue), h8_tag_id_str(val_id)
                # Note: quoted values may have &amp;
                # We would need ANOTHER lexer to unescape them, but we
                # don't need that for ul-table
                result.append((name, val_start, val_end))
        except StopIteration:
            pass
        return result

    def AllAttrsRaw(self):
        # type: () -> List[Tuple[str, str]]
        """
        Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]

        The quoted values may be escaped.  We would need another lexer to
        unescape them.
        """
        return [(name, self.s[start:end])
                for name, start, end in self.AllAttrsRawSlice()]

    def Tokens(self):
        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
        """
        Yields a sequence of tokens: TagName (AttrName Value)*

        Where each Token is (Type, start_pos, end_pos), and Value is one of
        QuotedValue, UnquotedValue or MissingValue.  A MissingValue has an
        empty span.

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        This is a generator, so the errors below happen during iteration:

        Raises:
          RuntimeError: no tag name right after the first character of the tag
          LexError: something other than the closing '>' or '/>' follows the
            last attribute
        """
        tag_m = _TAG_RE.match(self.s, self.start_pos + 1)
        if tag_m is None:
            raise RuntimeError("Couldn't find HTML tag in %r" %
                               self.WholeTagString())
        yield h8_tag_id.TagName, tag_m.start(1), tag_m.end(1)

        # (capture group, token type) for each way of writing a value:
        # double quoted, single quoted, unquoted.
        # TODO: the two quoted forms could have different token types
        value_groups = (
            (2, h8_tag_id.QuotedValue),
            (3, h8_tag_id.QuotedValue),
            (4, h8_tag_id.UnquotedValue),
        )

        pos = tag_m.end(0)
        while True:
            # don't search past the end
            attr_m = _ATTR_RE.match(self.s, pos, self.end_pos)
            if attr_m is None:
                break

            yield h8_tag_id.AttrName, attr_m.start(1), attr_m.end(1)

            for group_num, value_id in value_groups:
                if attr_m.group(group_num) is not None:
                    yield (value_id, attr_m.start(group_num),
                           attr_m.end(group_num))
                    break
            else:
                # <button disabled>
                after = attr_m.end(0)
                yield h8_tag_id.MissingValue, after, after

            # Skip past the whole attribute, including any closing quote
            pos = attr_m.end(0)

        if _TAG_LAST_RE.match(self.s, pos) is None:
            raise LexError('Extra data at end of tag', self.s, pos)