data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

232 lines, 86 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	#
8	# - htm8.py should use one-pass algorithm
9	# - micro-syntax should check all errors
10	# - with tests
11	# - and then download CommonCrawl data set?
12	#
13	# - translate to C++
14	# - how to handle the regexes in the lexer? Port to re2c directly?
15	# - for find(), do we need a C++ primitive for it?
16	# - no allocation for TagName()
17	#
18	# re2c considerations:
19	# - We need to use CAPTURES, so we can't use frontend/match directly
20	# - Could we STREAM the lexer?
21	# - Instead of sentinel model, use something else!
22	# - default is sentinel with padding, and there is YYFILL with padding
23	# - there is also the separate --storable-state option
24	# - because this can be used for queries that don't allocate
25	# - I may also want to do this with JSON
26	#
27	# Features:
28	# - Are there special rules for <svg> and <math>?
29	# - Do we need to know about <textarea> <pre>? Those don't have the same
30	# whitespace rules
31
32
# Absolute path of the directory one level above this script's directory.
# `&&` (rather than `;`) keeps a failed `cd` from silently yielding the
# caller's working directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be overridden from the environment; the default is only applied
# when it is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
39
# Print the git-tracked files whose names end in .html, one per line.
#
# Outputs: matching paths on stdout
# Returns: the status of the pipeline (grep exits non-zero when nothing
#          matches)
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
46
47	# Issues with lazylex/html.py
48	#
49	# - Token ID is annoying to express in Python
50	# - re.DOTALL for newlines
51	# - can we change that with [.\n]*?
52	# - nongreedy match for --> and ?>
53
# Run data_lang/htm8_util.py with the repo root and its vendor/ directory on
# PYTHONPATH.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to htm8_util.py
# Returns:   the exit status of htm8_util.py
htm8-tool() {
  # Quoted so that a repo path containing spaces is still a single word.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
58
# Write two deliberately malformed HTML files under _tmp/ and feed each file
# name, on stdin, to `htm8-tool quick-scan` and `htm8-tool lex-htm8`.
#
# Outputs: whatever htm8-tool prints, plus '*** ...' separator lines
# Note:    turns errexit off (and leaves it off) so that a failing htm8-tool
#          run does not stop the remaining runs.
test-quick-scan() {
  mkdir -p _tmp

  # An unterminated comment and bare '&' characters at the top level.
  cat >_tmp/bad-top.html <<'EOF'
unfinished <!--
hi && bye
EOF

  set +o errexit
  echo '_tmp/bad-top.html' | htm8-tool quick-scan

  echo '_tmp/bad-top.html' | htm8-tool lex-htm8

  # A stray '!' inside a start tag.
  cat >_tmp/bad-attr.html <<'EOF'
hi <a href !>
EOF

  echo '*** bad-attr quick-scan'
  echo '_tmp/bad-attr.html' | htm8-tool quick-scan

  echo '*** bad-attr lex-htm8'
  echo '_tmp/bad-attr.html' | htm8-tool lex-htm8
}
80
81	# site errors
82	#
83	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
84	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
85	# 5833374 tokens in 4710 files
86	#
87	# The second is the "Woboq" browser, which has CDATA
88	# Ah I wonder if we need that.
89
# Run htm8-tool over every git-tracked .html file of a deployed site
# checkout that sits next to this repo.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional; when non-empty, use ../oils.pub__deploy with
#                 parse-htm8, otherwise ../../oilshell/oilshell.org__deploy
#                 with lex-htm8
#
# Takes ~13 seconds
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file list goes to a fresh invocation of this script on stdin.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
120
# Job ID whose files sync-wwz fetches from op.oilshell.org.
readonly SOIL_ID=8924
# Local directory that holds the files for that job.
readonly WWZ_DIR="_tmp/$SOIL_ID"
123
# Copy the files of job $SOIL_ID from op.oilshell.org into $WWZ_DIR,
# creating the directory first.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
129
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/).
#
# Globals: WWZ_DIR (read)
# Outputs: the name of each archive being extracted, plus unzip's own output
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no match the glob stays literal; skip it rather than creating a
    # directory called '*'.
    [[ -e "$z" ]] || continue

    # The glob yields bare file names, so stripping the suffix is enough.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
145
# Show the contents of $WWZ_DIR with tree(1).
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
149
# Run `htm8-tool parse-htm8` on every *.html file below $WWZ_DIR.
#
# Globals: REPO_ROOT (read), WWZ_DIR (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # File names go on stdin to a fresh invocation of this script.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
157
# List every *.xml file (case-insensitive) under the home directory, printing
# the list and also saving it to _tmp/xml-files.txt.
find-xml() {
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
161
# Run `htm8-tool parse-xml` on the file names listed in _tmp/xml-files.txt
# (the list that find-xml writes).
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
166
# Run `htm8-tool parse-xml` on every *.xml file below the current directory,
# without descending into anything named _chroot.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
172
# Run `htm8-tool parse-htm8` on every *.html file below the current
# directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
176
# Run `htm8-tool parse-htm8` on every *.html file below _release/VERSION.
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
180
# Entry point that runs test-docs and nothing else.
# NOTE(review): the name suggests this is what the Soil CI invokes; confirm.
soil-run() {
  test-docs
}
184
185	# OK we have to skip the <script> tag! And <style>
186	#
187	# document.location = '#' + params.join('&');
188	# gUrlHash = new UrlHash(location.hash);
189	#
190	# I think textarea we don't though?
191
192
# Hand the command-line arguments to task-five (sourced from task-five.sh).
# NOTE(review): presumably it runs the function named by $1 -- the tests above
# invoke this script as `$0 htm8-tool <action>` -- confirm in task-five.sh.
task-five "$@"

# Stop here: everything below this line is free-form notes that are never
# executed.
exit
195
196
197	echo '
198	In HTML5, instead of
199	<script>
200	<![CDATA[
201	if (x < y) { ... }
202	]]>
203	</script>
204
205	You can write
206
207	<script>
208	if (x < y) { ... }
209	</script>
210
211	<script> <style> <textarea>
212
213	These have special escaping rules. I guess we just do NOT lex them at all?
214	We can totally SKIP them.
215
216	CDATA vs. RCDATA
217
218	<textarea>
219	&lt;p&gt; <!-- This will show as: <p> -->
220	&amp; <!-- This will show as: & -->
221	</textarea>
222
223	<script>
224	&lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
225	&amp; <!-- This will show literally as: &amp; -->
226	</script>
227
228	The main practical difference is that RCDATA processes HTML entities while
229	CDATA treats them as literal text. Both modes ignore HTML tags (treating them
230	as plain text) except for their own closing tag. '
231	'
232