data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

213 lines, 79 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	#
8	# - translate to C++
9	# - how to handle the regexes in the lexer? Port to re2c directly?
10	# - for find(), do we need a C++ primitive for it?
11	# - no allocation for TagName()
12	#
13	# re2c considerations:
14	# - We need to use CAPTURES, so we can't use frontend/match directly
15	# - Could we STREAM the lexer?
16	# - Instead of sentinel model, use something else!
17	# - default is sentinel with padding, and there is YYFILL with padding
18	# - there is also the separate --storable-state option
19	# - because this can be used for queries that don't allocate
20	# - I may also want to do this with JSON
21	#
22	# Features:
23	# - Are there special rules for <svg> and <math>?
24	# - Do we need to know about <textarea> <pre>? Those don't have the same
25	# whitespace rules
26
27
# Absolute path of the repository root, i.e. the parent of the directory that
# holds this script.  "$0" is quoted so a checkout path containing spaces
# works, and '&&' keeps pwd from reporting the wrong directory if cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps a value supplied by the caller; it is only assigned the
# in-repo default when it is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
34
# Print the paths of the *.html files that 'git ls-files' reports, one per
# line.  Takes no arguments; run it from inside the checkout to be listed.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  # (presumably why git is used instead of find: untracked _* paths are
  # not listed -- confirm)
  git ls-files | grep '\.html$'
}
41
42	# Issues with lazylex/html.py
43	#
44	# - Token ID is annoying to express in Python
45	# - re.DOTALL for newlines
46	# - can we change that with [.\n]*?
47	# - nongreedy match for --> and ?>
48
# Run data_lang/htm8_util.py from the repo with PYTHONPATH set to the repo
# root plus its vendor/ directory.
#
# Arguments: forwarded unchanged to htm8_util.py.
# Globals:   REPO_ROOT (read).
htm8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
53
# Write a small deliberately malformed HTML file (unterminated comment, bare
# '&&') to _tmp/bad.html and feed its path on stdin to
# 'htm8-tool well-formed'.
test-well-formed() {
  # Don't rely on the caller having created _tmp/ already.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
61
62	# site errors
63	#
64	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
65	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
66	# 5833374 tokens in 4710 files
67	#
68	# The second is the "Woboq" browser, which has CDATA
69	# Ah I wonder if we need that.
70
71	# Takes ~13 seconds
# Run the HTM8 tool over every HTML file that site-files lists inside a
# deployed copy of the web site.
#
# Arguments:
#   $1 - optional.  Non-empty: use ../oils.pub__deploy with 'parse-htm8'.
#        Empty or missing: use ../../oilshell/oilshell.org__deploy with
#        'lex-htm8'.
# Globals: REPO_ROOT (read).
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if [[ -n "$new_site" ]]; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file names are streamed on stdin to a fresh run of this script.
  # We have changed directory, so the script is addressed as $REPO_ROOT/$0
  # (assumes $0 is a path relative to the repo root -- TODO confirm).
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
101
# Job ID that sync-wwz appends to the remote github-jobs/ path, and the local
# directory (named after that ID) where the job's files are kept.
readonly SOIL_ID=8924
readonly WWZ_DIR=_tmp/$SOIL_ID
104
# Copy the remote directory for job $SOIL_ID into $WWZ_DIR with rsync,
# creating $WWZ_DIR first if needed.
#
# Globals: SOIL_ID, WWZ_DIR (read).
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  # Trailing slashes: sync the directory's contents, not the directory itself.
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
110
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each name as it goes.
#
# Globals: WWZ_DIR (read).
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # An unmatched glob stays literal; skip it instead of creating a
    # directory called '*' and running unzip on a missing file.
    [[ -e "$z" ]] || continue

    # The glob yields bare file names, so stripping the suffix is enough.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
126
# Show the contents of $WWZ_DIR using the external 'tree' command.
#
# Globals: WWZ_DIR (read).
tree-wwz() {
  tree "$WWZ_DIR"
}
130
# Find every *.html file under $WWZ_DIR and stream the paths on stdin to
# 'htm8-tool parse-htm8', run through a fresh invocation of this script.
#
# Globals: WWZ_DIR, REPO_ROOT (read).
test-wwz() {
  pushd "$WWZ_DIR"

  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
138
# List every file under the home directory whose name matches *.xml
# (case-insensitively), printing the paths and saving them to
# _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the output file if _tmp/ is missing.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
142
# Feed the file list in _tmp/xml-files.txt on stdin to 'htm8-tool parse-xml',
# run through a fresh invocation of this script.
#
# Globals: REPO_ROOT (read).
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
147
# Find *.xml files under the current directory, without descending into
# anything named _chroot, and stream the paths on stdin to
# 'htm8-tool parse-xml'.
#
# Globals: REPO_ROOT (read).
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
153
# Find every *.html file under the current directory and stream the paths on
# stdin to 'htm8-tool parse-htm8'.
#
# Globals: REPO_ROOT (read).
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
157
# Find every *.html file under _release/VERSION and stream the paths on stdin
# to 'htm8-tool parse-htm8'.
#
# Globals: REPO_ROOT (read).
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
161
# Entry point named for the Soil job (presumably what CI invokes -- confirm);
# it currently runs only test-docs.
soil-run() {
  test-docs
}
165
166	# OK we have to skip the <script> tag! And <style>
167	#
168	# document.location = '#' + params.join('&');
169	# gUrlHash = new UrlHash(location.hash);
170	#
171	# I think textarea we don't though?
172
173
# Hand the command-line arguments to task-five (from the sourced
# task-five.sh; presumably it runs the function named by $1 -- confirm).
# The bare 'exit' then ends the script with task-five's status, so nothing
# after this point is ever executed.
task-five "$@"
exit
176
177
# Design notes kept as a string literal.  The script exits before reaching
# this point, so they are never printed.  The quoting is balanced so that
# 'bash -n' and linters can still parse the whole file.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag.
'
213