data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

234 lines, 79 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	# - Move code into data_lang/htm8.py
8	# - iterators stay in lazylex/html.py?
9	#
10	# - statically type it
11	# - revive pyannotate
12	# - translate to C++
13	# - how to handle the regexes in the lexer? Port to re2c directly?
14	# - for find(), do we need a C++ primitive for it?
15	# - no allocation for TagName()
16	# - ASDL file for Tok.Foo?
17	# - remove TagName() from TagLexer(), it is on the Htm8Lexer
18	#
19	# re2c considerations:
20	# - We need to use CAPTURES, so we can't use frontend/match directly
21	# - Could we STREAM the lexer?
22	# - Instead of sentinel model, use something else!
23	# - default is sentinel with padding, and there is YYFILL with padding
24	# - there is also the separate --storable-state option
# - because this can be used for queries that don't allocate
26	# - I may also want to do this with JSON
27	#
28	# Not working yet:
29	# - understanding all entities &zz;
30	# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
31	# - capital letters <TR/> - I guess we can normalize the case
32	#
33	# Leniency:
34	# - foo=1&bar=2 is extremely common
# - well then does that mean you allow <p>a & b</p> too?
36	# - and then it's not far from that to <p id="value >"> - the quotes help
37	# - I guess you can have a rule for unescaped &, just like unescaped backslash
38	# - you can warn about it, but it doesn't cause much problem?
39	# We are already firmly in HTML territory, not in XML ...
40	#
41	# Features:
42	# - Are there special rules for <svg> and <math>?
43	# - Do we need to know about <textarea> <pre>? Those don't have the same
44	# whitespace rules
45
46
# Absolute path of the repository root, i.e. the parent of the directory that
# holds this script.  `&&` (not `;`) keeps a failed `cd` from silently
# reporting the caller's working directory instead.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd) || {
  echo "$0: cannot determine REPO_ROOT" >&2
  exit 1
}

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; otherwise it defaults to the in-repo
# stdlib directory.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
53
54	# parse with lazylex/html.py, or data_lang/htm8.py
55
# Print the paths of all git-tracked files whose names end in `.html`, one per
# line, as reported by `git ls-files` for the current directory.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  # (git ls-files is used instead of find; presumably the _ files are
  # untracked -- TODO confirm)
  git ls-files | grep '\.html$'
}
62
63	# Issues with lazylex/html.py
64	#
65	# - Token ID is annoying to express in Python
66	# - re.DOTALL for newlines
67	# - can we change that with [.\n]*?
68	# - nongreedy match for --> and ?>
69
# Run lazylex/html.py from the repo with PYTHONPATH set to the repo root and
# its vendor/ directory.
#
# Arguments: passed through unchanged to html.py (e.g. an action name).
# Stdin/stdout: inherited by html.py.
htm8-tool() {
  # Quote the command path so a checkout under a directory containing spaces
  # still works.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
74
# Write a small malformed document (an unterminated comment followed by a bare
# `&&`) to _tmp/bad.html, then send that path on stdin to
# `htm8-tool well-formed`.
test-well-formed() {
  # Don't rely on _tmp/ already existing in the current directory.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
82
83	# site errors
84	#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
86	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
87	# 5833374 tokens in 4710 files
88	#
89	# The second is the "Woboq" browser, which has CDATA
90	# Ah I wonder if we need that.
91
92	# Takes ~13 seconds
# Run the htm8 tool over every tracked .html file in a deployed-site checkout.
#
# Arguments:
#   $1 - optional.  When non-empty, use ../oils.pub__deploy with the
#        `parse-htm8` action; when empty or omitted, use
#        ../../oilshell/oilshell.org__deploy with the `lex-htm8` action.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if [[ -n "$new_site" ]]; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs: the file list is piped on stdin to a fresh invocation of
  # this script (presumably $0 is a path relative to $REPO_ROOT -- TODO
  # confirm).
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
122
# Job number used in the remote github-jobs/ path, and the local directory
# (under _tmp/) that mirrors that job's files.
SOIL_ID=8924
WWZ_DIR="_tmp/${SOIL_ID}"
readonly SOIL_ID WWZ_DIR
125
# Mirror the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync,
# creating $WWZ_DIR first if needed.
sync-wwz() {
  local remote="op.oilshell.org:op.oilshell.org/uuu/github-jobs/${SOIL_ID}/"

  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose "$remote" "${WWZ_DIR}/"
}
131
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each directory name as it goes.
extract-wwz() {
  # Keep the loop variables out of the global scope.
  local z name

  pushd "$WWZ_DIR" || return
  for z in *.wwz; do
    # With no matches the glob stays as the literal '*.wwz'; skip it rather
    # than creating a directory called '*'.
    [[ -e "$z" ]] || continue

    # Glob results here have no directory part, so stripping the suffix is
    # all that basename was doing.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
147
# Display the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  local dir=$WWZ_DIR
  tree "$dir"
}
151
# Parse every .html file found under $WWZ_DIR with `htm8-tool parse-htm8`.
# The file list is sent on stdin to a fresh invocation of this script.
test-wwz() {
  pushd "$WWZ_DIR"

  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
159
# List every *.xml file (case-insensitive) under the home directory, printing
# the paths and also saving them to _tmp/xml-files.txt.
find-xml() {
  # Don't rely on _tmp/ already existing in the current directory.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
163
# Parse, as XML, every path listed in _tmp/xml-files.txt (the list is fed on
# stdin to a fresh invocation of this script).
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
168
# Parse, as XML, every *.xml file under the current directory, without
# descending into anything named _chroot.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
174
# Parse every *.html file under the current directory with
# `htm8-tool parse-htm8` (file list on stdin).
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
178
# Parse every *.html file under _release/VERSION with
# `htm8-tool parse-htm8` (file list on stdin).
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
182
# Presumably the entry point invoked by the CI job -- TODO confirm.  It runs
# the docs check and nothing else.
soil-run() {
  test-docs
}
186
187	# OK we have to skip the <script> tag! And <style>
188	#
189	# document.location = '#' + params.join('&');
190	# gUrlHash = new UrlHash(location.hash);
191	#
# I think we don't need to skip <textarea>, though?
193
194
# Hand the command line to task-five (from the sourced task-five.sh;
# presumably it runs the function named by $1 -- TODO confirm), then stop
# here with its status so nothing after this point is executed.
task-five "$@"
exit "$?"
197
198
# Design notes on <script> / <style> / <textarea> escaping.  This `echo`
# follows the script's unconditional `exit`, so it is never executed; the
# quotes are kept balanced so that `bash -n` and linters can still parse the
# whole file.
echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt;  <!-- This will show as: <p> -->
  &amp;      <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt;  <!-- This will show literally as: &lt;p&gt; -->
  &amp;      <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag.
'
234