data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

215 lines, 79 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	#
8	# - translate to C++
9	# - how to handle the regexes in the lexer? Port to re2c directly?
10	# - for find(), do we need a C++ primitive for it?
11	# - no allocation for TagName()
12	#
13	# re2c considerations:
14	# - We need to use CAPTURES, so we can't use frontend/match directly
15	# - Could we STREAM the lexer?
16	# - Instead of sentinel model, use something else!
17	# - default is sentinel with padding, and there is YYFILL with padding
18	# - there is also the separate --storable-state option
19	# - because this can be used for queries that don't allocate
20	# - I may also want to do this with JSON
21	#
22	# Features:
23	# - Are there special rules for <svg> and <math>?
24	# - Do we need to know about <textarea> <pre>? Those don't have the same
25	# whitespace rules
26
27
# Absolute path of the directory one level above the one containing this
# script.  '&&' keeps a failed cd from silently yielding the caller's cwd.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; the default is applied only if unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
# task-five.sh presumably defines the task-five function called at the bottom
# of this file -- TODO confirm.
source "$LIB_OSH/task-five.sh"
34
35	# parse with lazylex/html.py, or data_lang/htm8.py
36
# Print, one per line, the files tracked by git under the current directory
# whose names end in '.html'.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
43
44	# Issues with lazylex/html.py
45	#
46	# - Token ID is annoying to express in Python
47	# - re.DOTALL for newlines
48	# - can we change that with [.\n]*?
49	# - nongreedy match for --> and ?>
50
# Run $REPO_ROOT/lazylex/html.py with the repo root and its vendor/ directory
# on PYTHONPATH.
#
# Args:
#   "$@": forwarded unchanged to html.py
htm8-tool() {
  # The path is quoted so a repo checkout under a directory with spaces works.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
55
# Write a small malformed HTML fixture (an unterminated comment and a bare
# '&&') to _tmp/bad.html, then feed its path on stdin to
# 'htm8-tool well-formed'.
test-well-formed() {
  # The fixture directory may not exist in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
63
64	# site errors
65	#
66	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or \|\|</h4>\n<!-- 2')
67	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
68	# 5833374 tokens in 4710 files
69	#
70	# The second is the "Woboq" browser, which has CDATA
71	# Ah I wonder if we need that.
72
73	# Takes ~13 seconds
# Run the HTM8 tool over every git-tracked .html file in a sibling checkout of
# the deployed site.
#
# Args:
#   $1: optional.  When non-empty, check ../oils.pub__deploy with the
#       'parse-htm8' action; otherwise check
#       ../../oilshell/oilshell.org__deploy with 'lex-htm8'.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file list goes to a fresh run of this script on stdin.
  # NOTE(review): "$REPO_ROOT/$0" only resolves when $0 is a path relative to
  # the repo root -- confirm how this script is expected to be invoked.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
103
# Numeric ID used in the remote github-jobs/ path, and the local directory
# (under _tmp/) that the .wwz tasks below operate on.
declare -r SOIL_ID=8924
declare -r WWZ_DIR="_tmp/$SOIL_ID"
106
# Copy the remote github-jobs/$SOIL_ID/ directory on op.oilshell.org into
# $WWZ_DIR with rsync, creating $WWZ_DIR first if needed.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
112
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (without the .wwz suffix), printing each name as it goes.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # An unmatched glob stays literal; skip it rather than creating a
    # directory called '*' and failing in unzip.
    [[ -e "$z" ]] || continue

    # $z comes from a glob in the current directory, so stripping the suffix
    # gives the same result as basename without a subprocess.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
128
# Show the contents of $WWZ_DIR with the 'tree' command.
tree-wwz() {
  tree "$WWZ_DIR"
}
132
# Find every *.html file under $WWZ_DIR and pass the list on stdin to a fresh
# run of this script with 'htm8-tool parse-htm8'.
test-wwz() {
  pushd "$WWZ_DIR"

  # NOTE(review): "$REPO_ROOT/$0" only resolves when $0 is a path relative to
  # the repo root -- confirm how this script is expected to be invoked.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
140
# List every file under the home directory whose name matches *.xml
# (case-insensitively), printing the list and saving it to
# _tmp/xml-files.txt.
find-xml() {
  # tee needs the output directory to exist.
  mkdir -p _tmp

  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
144
# Feed the file list saved in _tmp/xml-files.txt on stdin to a fresh run of
# this script with 'htm8-tool parse-xml'.
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  # NOTE(review): "$REPO_ROOT/$0" only resolves when $0 is a path relative to
  # the repo root -- confirm how this script is expected to be invoked.
  time "$REPO_ROOT/$0" htm8-tool parse-xml < _tmp/xml-files.txt
}
149
# Find *.xml files under the current directory, without descending into any
# directory named _chroot, and pass the list on stdin to a fresh run of this
# script with 'htm8-tool parse-xml'.
test-repo-xml() {
  # OK these parse
  # NOTE(review): "$REPO_ROOT/$0" only resolves when $0 is a path relative to
  # the repo root -- confirm how this script is expected to be invoked.
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
155
# Find *.html files under the current directory and pass the list on stdin to
# a fresh run of this script with 'htm8-tool parse-htm8'.
test-repo-html() {
  # NOTE(review): "$REPO_ROOT/$0" only resolves when $0 is a path relative to
  # the repo root -- confirm how this script is expected to be invoked.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
159
# Find *.html files under _release/VERSION and pass the list on stdin to a
# fresh run of this script with 'htm8-tool parse-htm8'.
test-docs() {
  # NOTE(review): "$REPO_ROOT/$0" only resolves when $0 is a path relative to
  # the repo root -- confirm how this script is expected to be invoked.
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
163
# Runs the test-docs task and nothing else.
# NOTE(review): presumably the entry point invoked by the Soil CI -- confirm.
soil-run() { test-docs; }
167
168	# OK we have to skip the <script> tag! And <style>
169	#
170	# document.location = '#' + params.join('&');
171	# gUrlHash = new UrlHash(location.hash);
172	#
173	# I think textarea we don't though?
174
175
# Hand the command-line arguments to task-five, which is not defined in this
# file (presumably it comes from the sourced task-five.sh and runs the
# function named by the first argument -- TODO confirm).
task-five "$@"
exit


# Everything below is unreachable: the 'exit' above always ends the script
# first.  It is kept as free-form notes.  The string is terminated exactly
# once so the whole file still parses (bash -n, ShellCheck).
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag.
'
215