data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

227 lines, 86 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	#
8	# - translate to C++
9	# - how to handle the regexes in the lexer? Port to re2c directly?
10	# - for find(), do we need a C++ primitive for it?
11	# - no allocation for TagName()
12	#
13	# re2c considerations:
14	# - We need to use CAPTURES, so we can't use frontend/match directly
15	# - Could we STREAM the lexer?
16	# - Instead of sentinel model, use something else!
17	# - default is sentinel with padding, and there is YYFILL with padding
18	# - there is also the separate --storable-state option
19	# - because this can be used for queries that don't allocate
20	# - I may also want to do this with JSON
21	#
22	# Features:
23	# - Are there special rules for <svg> and <math>?
24	# - Do we need to know about <textarea> <pre>? Those don't have the same
25	# whitespace rules
26
27
# Absolute path of the repository root: the parent of this script's directory.
# "$0" is quoted so a checkout path containing spaces still works, and '&&'
# keeps pwd from reporting the wrong directory when the cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH can be overridden from the environment; the default below is only
# assigned when the variable is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
34
# Print the paths of the *.html files that git tracks under the current
# directory, one per line.
#
# Outputs: matching paths on stdout
# Returns: grep's status (non-zero when no tracked file ends in .html)
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
41
42	# Issues with lazylex/html.py
43	#
44	# - Token ID is annoying to express in Python
45	# - re.DOTALL for newlines
46	# - can we change that with [.\n]*?
47	# - nongreedy match for --> and ?>
48
# Run data_lang/htm8_util.py from the repo with PYTHONPATH set to the repo
# root and its vendor/ directory.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to htm8_util.py
# Returns:   the exit status of htm8_util.py
htm8-tool() {
  # The path is quoted so a repo checkout under a directory with spaces works.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
53
# Feed two deliberately malformed HTML files to the 'quick-scan' and
# 'lex-htm8' actions of htm8-tool, so their error output can be inspected.
#
# Each action receives the file NAME on stdin, not the file contents.
# Writes _tmp/bad-top.html and _tmp/bad-attr.html under the current directory.
test-quick-scan() {
  mkdir -p _tmp

  # Unterminated comment followed by bare '&&'
  cat >_tmp/bad-top.html <<'EOF'
unfinished <!--
hi && bye
EOF

  # Keep going after a failing tool invocation so every case below still runs.
  set +o errexit
  echo '_tmp/bad-top.html' | htm8-tool quick-scan

  echo '_tmp/bad-top.html' | htm8-tool lex-htm8

  # Stray '!' where an attribute is expected
  cat >_tmp/bad-attr.html <<'EOF'
hi <a href !>
EOF

  echo '*** bad-attr quick-scan'
  echo '_tmp/bad-attr.html' | htm8-tool quick-scan

  echo '*** bad-attr lex-htm8'
  echo '_tmp/bad-attr.html' | htm8-tool lex-htm8
}
75
76	# site errors
77	#
78	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
79	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
80	# 5833374 tokens in 4710 files
81	#
82	# The second is the "Woboq" browser, which has CDATA
83	# Ah I wonder if we need that.
84
# Run htm8-tool over every git-tracked HTML file of a deployed site checkout.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional; when non-empty, use ../oils.pub__deploy with the
#                 'parse-htm8' action; otherwise use
#                 ../../oilshell/oilshell.org__deploy with 'lex-htm8'
#
# Takes ~13 seconds
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file list goes to a fresh invocation of this script on stdin.
  # "$REPO_ROOT/$0" only resolves when $0 is a path relative to the repo root.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
115
# ID used in the github-jobs/<id>/ path that sync-wwz mirrors (hard-coded).
readonly SOIL_ID=8924
# Local directory the mirrored files live in, relative to the current directory.
readonly WWZ_DIR="_tmp/$SOIL_ID"
118
# Mirror the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
# Returns: rsync's exit status
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  # Trailing slashes: copy the directory's contents, not the directory itself.
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
124
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each name as it goes.
#
# Globals: WWZ_DIR (read)
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # An unmatched glob stays literal; skip it rather than creating a
    # directory called '*'.
    [[ -e "$z" ]] || continue

    # Assigned separately from 'local' so a basename failure isn't masked.
    name=$(basename "$z" .wwz)

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
140
# Show the contents of $WWZ_DIR using the external 'tree' command.
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
144
# Run the 'parse-htm8' action over every *.html file found under $WWZ_DIR.
#
# Globals: REPO_ROOT (read), WWZ_DIR (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # The file names are piped to a fresh invocation of this script.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
152
# List every *.xml file (case-insensitive) under the home directory, echoing
# the paths and saving them to _tmp/xml-files.txt in the current directory.
find-xml() {
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
156
# Run the 'parse-xml' action over the file names listed in _tmp/xml-files.txt
# (the file that find-xml writes).
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
161
# Run the 'parse-xml' action over every *.xml file under the current
# directory, without descending into anything named _chroot.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
167
# Run the 'parse-htm8' action over every *.html file under the current
# directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
171
# Run the 'parse-htm8' action over every *.html file under _release/VERSION
# (relative to the current directory).
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
175
# Entry point named soil-run: currently runs only test-docs.
soil-run() {
  test-docs
}
179
180	# OK we have to skip the <script> tag! And <style>
181	#
182	# document.location = '#' + params.join('&');
183	# gUrlHash = new UrlHash(location.hash);
184	#
185	# I think textarea we don't though?
186
187
# Hand the command-line arguments to task-five, which comes from the sourced
# $LIB_OSH/task-five.sh (presumably it runs the function named by the first
# argument -- confirm in that file).
task-five "$@"
# Bare 'exit' returns the status of the previous command and stops bash from
# reading the notes that follow.
exit
190
191
# Notes on <script>, <style> and <textarea> handling. This sits after the
# 'exit' above, so it is never executed; it is kept as a syntactically valid
# command so the file still passes 'bash -n'.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
227