data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

225 lines, 79 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	# - Move code into data_lang/htm8.py
8	# - iterators stay in lazylex/html.py?
9	#
10	# - statically type it
11	# - revive pyannotate
12	# - translate to C++
13	# - how to handle the regexes in the lexer? Port to re2c directly?
14	# - for find(), do we need a C++ primitive for it?
15	# - no allocation for TagName()
16	# - ASDL file for Tok.Foo?
17	# - remove TagName() from TagLexer(), it is on the Htm8Lexer
18	#
19	# re2c considerations:
20	# - We need to use CAPTURES, so we can't use frontend/match directly
21	# - Could we STREAM the lexer?
22	# - Instead of sentinel model, use something else!
23	# - default is sentinel with padding, and there is YYFILL with padding
24	# - there is also the separate --storable-state option
25	# - because this can be used for queries that don't allocate
26	# - I may also want to do this with JSON
27	#
28	# Not working yet:
29	# - capital letters <TR/> - I guess we can normalize the case
30	# - islower()
31	#
32	# Features:
33	# - Are there special rules for <svg> and <math>?
34	# - Do we need to know about <textarea> <pre>? Those don't have the same
35	# whitespace rules
36
37
# Absolute path of the directory one level above this script's directory.
# "$0" is quoted so that a checkout path containing spaces still resolves.
REPO_ROOT=$(cd "$(dirname "$0")/.."; pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps any value already set by the caller; otherwise it defaults to
# stdlib/osh under $REPO_ROOT.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
44
45	# parse with lazylex/html.py, or data_lang/htm8.py
46
# Print the paths reported by `git ls-files` that end in .html, one per line.
#
# Outputs: matching paths on stdout
# Returns: status of the pipeline (grep returns non-zero when nothing matches)
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
53
54	# Issues with lazylex/html.py
55	#
56	# - Token ID is annoying to express in Python
57	# - re.DOTALL for newlines
58	# - can we change that with [.\n]*?
59	# - nongreedy match for --> and ?>
60
# Run lazylex/html.py with $REPO_ROOT and $REPO_ROOT/vendor on PYTHONPATH.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to html.py
# Returns:   exit status of html.py
htm8-tool() {
  # The command path is quoted so a repo root containing spaces still works.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
65
# Write a small HTML file containing an unterminated comment and a bare '&&',
# then feed its path on stdin to `htm8-tool well-formed`.
#
# Outputs: creates/overwrites _tmp/bad.html relative to the current directory
# Returns: exit status of the pipeline
test-well-formed() {
  # The fixture lives under _tmp/, so make sure the directory exists first.
  mkdir -p _tmp

  # Quoted delimiter: the fixture text is literal, nothing is expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
73
74	# site errors
75	#
76	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or \|\|</h4>\n<!-- 2')
77	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
78	# 5833374 tokens in 4710 files
79	#
80	# The second is the "Woboq" browser, which has CDATA
81	# Ah I wonder if we need that.
82
83	# Takes ~13 seconds
# Run htm8-tool over every file listed by site-files inside a site checkout.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional; when non-empty, use ../oils.pub__deploy with the
#                 'parse-htm8' action, otherwise
#                 ../../oilshell/oilshell.org__deploy with 'lex-htm8'
# Returns:   non-zero if the pipeline fails
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The cwd has changed, so the script is re-invoked through $REPO_ROOT.
  # NOTE(review): this assumes $0 is a path relative to $REPO_ROOT - confirm.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
113
# ID used in the github-jobs/ rsync path, and the local directory under _tmp/
# that the *-wwz tasks work in.  Both are frozen after assignment.
SOIL_ID=8924
WWZ_DIR="_tmp/${SOIL_ID}"
readonly SOIL_ID WWZ_DIR
116
# Copy the github-jobs/$SOIL_ID directory from op.oilshell.org into $WWZ_DIR
# with rsync, creating $WWZ_DIR first if needed.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
# Returns: exit status of rsync
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
122
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (without the .wwz suffix), printing each name as it goes.
#
# Globals: WWZ_DIR (read)
# Outputs: directory names on stdout, plus the output of unzip and pushd/popd
# Returns: non-zero if the final popd fails
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no archives present the glob stays literal; skip it rather than
    # creating a directory called '*'.
    [[ -e "$z" ]] || continue

    # The glob yields bare file names, so stripping the suffix is enough.
    name=${z%.wwz}

    mkdir -p -- "$name"
    pushd "$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
138
# Show the contents of $WWZ_DIR with the `tree` command.
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
142
# Run `htm8-tool parse-htm8` over every *.html file found under $WWZ_DIR.
#
# Globals: REPO_ROOT (read), WWZ_DIR (read)
# Returns: non-zero if the pipeline fails
test-wwz() {
  pushd "$WWZ_DIR"

  # The cwd has changed, so the script is re-invoked through $REPO_ROOT.
  # NOTE(review): this assumes $0 is a path relative to $REPO_ROOT - confirm.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
150
# List every file under the home directory whose name matches *.xml
# (case-insensitively), echoing the list and saving it to _tmp/xml-files.txt.
#
# Outputs: the file list on stdout and in _tmp/xml-files.txt
find-xml() {
  # tee cannot create the output file if _tmp/ is missing.
  mkdir -p _tmp

  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
154
# Run `htm8-tool parse-xml` on the file list stored in _tmp/xml-files.txt.
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  # The list is supplied on stdin by redirection; no separate `cat` is needed.
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
159
# Run `htm8-tool parse-xml` over every *.xml file below the current directory,
# without descending into anything named _chroot.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
165
# Run `htm8-tool parse-htm8` over every *.html file below the current
# directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
169
# Run `htm8-tool parse-htm8` over every *.html file under _release/VERSION.
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
173
# Runs the test-docs task with no arguments; any arguments given to soil-run
# are ignored.
soil-run() {
  test-docs
}
177
178	# OK we have to skip the <script> tag! And <style>
179	#
180	# document.location = '#' + params.join('&');
181	# gUrlHash = new UrlHash(location.hash);
182	#
183	# I think textarea we don't though?
184
185
# Hand the command-line arguments to task-five, then leave the script with
# its status so that nothing below this point is executed.
task-five "$@"
exit "$?"
188
189
# Design notes kept as a single-quoted string; the `exit` above means this
# echo is not normally reached.  The string is closed exactly once so that
# the file still parses to the end (e.g. with `bash -n`).
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
225