data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

228 lines, 75 significant

#!/usr/bin/env bash
#
# Usage:
#   data_lang/htm8-test.sh
#
# TODO:
# - Refactor Validate(): take FLAGS, return stats optionally
# - add LEX_QUOTED_VALUES
# - and then re-run all the tests
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
#
# - Soil
# - Validate all the HTML in the repo - well-formed check
# - this should go in the CI
# - Automate some more tests:
# - site oils.pub, site oilshell.org
# - XML on my machine - turn that into 'WILD' corpus for HTML/XML?
#
# - statically type it
# - revive pyannotate
# - translate to C++
# - what to do about all the regexes? Port to re2c directly?
# - for find(), do we need a C++ primitive for it?
# - no allocation for TagName()
# - ASDL file for Tok.Foo?
# - refactor TagName() API - remove it from the TagLexer?
# - that is really the AttrLexer()
#
# - build a DOM with objects in YSH?
# - rewrite ul-table in that?
#
# YSH API
# - Generating HTML/HTM8 is much more common than parsing it
# - although maybe we can do RemoveComments as a demo?
# - that is the lowest level "sed" model
# - For parsing, a minimum idea is:
# - lexer-based algorithms for query by tag, class name, and id
# - and then toTree()
# - .tag and .attrs?
# - .innerHTML() and .outerHTML() perhaps
# - and maybe you can mutate it directly

# Absolute path of the repo root: the parent of the directory that holds this
# script.
REPO_ROOT=$(cd "$(dirname "$0")/.."; pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps any value already present in the environment; otherwise it
# defaults to the stdlib/osh directory inside the repo.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"

# parse with lazylex/html.py, or data_lang/htm8.py

# Print the paths ending in .html that git tracks in the current directory,
# one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}

# Issues with lazylex/html.py
#
# - Token ID is annoying to express in Python
# - re.DOTALL for newlines
# - can we change that with [.\n]*?
# - nongreedy match for --> and ?>

# Run lazylex/html.py from the repo with all arguments passed through.
#
# PYTHONPATH is set to the repo root plus its vendor/ directory for this one
# command only.  Paths are quoted so a repo checked out under a directory
# containing spaces still works.
htm8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}

# Write a two-line document whose first line opens a comment that is never
# closed and whose second line has a bare '&&', then pipe that file's path to
# `htm8-tool well-formed`.
test-well-formed() {
  # The fixture lives under _tmp/; create it so the redirect cannot fail on a
  # fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}

# site errors
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second is the "Woboq" browser, which has CDATA
# Ah I wonder if we need that.

# Takes ~13 seconds
#
# List the tracked .html files of a deployed-site checkout and pipe the paths
# to this script's `htm8-tool validate` task.
#
# Args:
#   $1: optional.  Non-empty selects ../oils.pub__deploy; empty or missing
#       selects ../../oilshell/oilshell.org__deploy.  Both are relative to the
#       caller's current directory.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Kept local so the choice does not leak into the caller's scope.
  local dir
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
  else
    dir='../../oilshell/oilshell.org__deploy'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # "$REPO_ROOT/$0" re-runs this script from inside the site directory; it only
  # resolves when $0 is the repo-relative path shown in the Usage line.
  time site-files | "$REPO_ROOT/$0" htm8-tool validate

  popd
}

# ID used in the github-jobs/ path that sync-wwz copies from.
readonly SOIL_ID=8917
# Local directory (relative to the current directory) that the *-wwz tasks
# work in.
readonly WWZ_DIR=_tmp/$SOIL_ID

# Copy the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync,
# creating $WWZ_DIR first.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}

# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive without its .wwz suffix.  Prints each name before unzipping it.
# The caller's working directory is restored on return.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z name
  for z in *.wwz; do
    # With no archives present the glob stays a literal '*.wwz'; skip it
    # instead of creating a directory called '*'.
    [[ -e "$z" ]] || continue

    # $z has no directory part, so stripping the suffix is all basename did.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}

# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}

# Find every *.html file under $WWZ_DIR and pipe the paths to this script's
# `htm8-tool validate` task, timing the pipeline.
test-wwz() {
  pushd "$WWZ_DIR"

  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool validate

  popd
}

# List every *.xml file (case-insensitive) under the home directory, printing
# the paths and saving them to _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the list if _tmp/ is missing
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}

# Feed the paths saved in _tmp/xml-files.txt to this script's
# `htm8-tool validate` task, timing the run.
test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK
  time "$REPO_ROOT/$0" htm8-tool validate <_tmp/xml-files.txt
}

# Find *.xml files under the current directory, without descending into
# anything named _chroot, and pipe the paths to this script's
# `htm8-tool validate` task.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool validate
}

# Find every *.html file under the current directory and pipe the paths to
# this script's `htm8-tool validate` task.
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool validate
}

# Find every *.html file under _release/VERSION and pipe the paths to this
# script's `htm8-tool validate` task.
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool validate
}

# Entry point named for Soil (the CI mentioned in the header TODO); it runs
# test-docs and nothing else.
soil-run() {
  test-docs
}

# OK we have to skip the <script> tag! And <style>
#
# document.location = '#' + params.join('&');
# gUrlHash = new UrlHash(location.hash);
#
# I think textarea we don't though?


# Hand the command line to task-five.  It is not defined in this file;
# presumably it comes from $LIB_OSH/task-five.sh and runs the function named
# by the first argument, which is how the "$REPO_ROOT/$0" htm8-tool validate
# self-invocations reach htm8-tool.
task-five "$@"

# A bare exit keeps the status of task-five and stops bash before it reads
# the notes that follow.
exit


# Notes on CDATA vs. RCDATA, kept as a string literal.  Never printed, because
# the script exits before reaching this line.  The string is closed exactly
# once so the whole file passes `bash -n`.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
