data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

247 lines, 79 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	# - Validate()
8	# - add LEX_QUOTED_VALUES, along with counter for it
9	# - and then re-run all the tests - make sure they pass
10	# - site oils.pub, site oilshell.org
11	# - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
12	# - Rename to data_lang/htm8.py
13	# - it has NO_SPECIAL_TAGS mode for XML
14	# - put iterators at a higher level in doctools/ ?
15	#
16	# - statically type it
17	# - revive pyannotate
18	# - translate to C++
19	# - how to handle the regexes in the lexer? Port to re2c directly?
20	# - for find(), do we need a C++ primitive for it?
21	# - no allocation for TagName()
22	# - ASDL file for Tok.Foo?
23	# - refactor TagName() API - remove it from the TagLexer?
24	# - that is really the AttrLexer()
25	#
26	# Not working yet:
27	# - understanding all entities &zz;
28	# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
29	# - capital letters <TR/> - I guess we can normalize the case
30	#
31	# Features:
32	# - Are there special rules for <svg> and <math>?
33	# - Do we need to know about <textarea> <pre>? Those don't have the same
34	# whitespace rules
35	#
36	# YSH API
37	# - Generating HTML/HTM8 is much more common than parsing it
38	# - although maybe we can do RemoveComments as a demo?
39	# - that is the lowest level "sed" model
40	# - For parsing, a minimum idea is:
41	# - lexer-based algorithms for query by tag, class name, and id
42	# - and then toTree() - this is a DOM
43	# - .tag and .attrs?
44	# - .innerHTML() and .outerHTML() perhaps
45	# - rewrite ul-table in that?
46	# - does that mean you mutate it, or construct text?
47	# - I think you can set the innerHTML probably
48	#
49	# - Testing of html.ysh aka htm8.ysh in the stdlib
50	#
51	# Cases:
52	# html 'hello <b>world</b>'
53	# html "hello <b>$name</b>"html
54	# html ["hello <b>$name</b>"] # hm this isn't bad, it's an unevaluated expression?
55	# commonmark 'hello world'
56	# md 'hello world'
57	# md ['hello $escape'] ? We don't have a good escaping algorithm
58
59
# Absolute path of the repository root, i.e. the parent of the directory that
# contains this script.  '&&' keeps pwd from reporting an unrelated directory
# when the cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; the default applies only when it is
# unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
66
67	# parse with lazylex/html.py, or data_lang/htm8.py
68
# Print, one per line, the git-tracked files in the current repository whose
# names end in '.html'.
#
# Outputs: matching paths on stdout.
# Returns: status of grep (non-zero when nothing matches).
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
75
76	# Issues with lazylex/html.py
77	#
78	# - Token ID is annoying to express in Python
79	# - re.DOTALL for newlines
80	# - can we change that with [.\n]*?
81	# - nongreedy match for --> and ?>
82
# Run lazylex/html.py from this repository, forwarding all arguments.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to lazylex/html.py
# PYTHONPATH is set to the repo root plus vendor/ for the duration of the call.
htm8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
87
# Write a small HTML fixture containing an unterminated comment and a bare
# '&&' to _tmp/bad.html, then feed that path on stdin to
# 'htm8-tool well-formed'.
test-well-formed() {
  # The fixture lives under _tmp/, which may not exist in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, nothing is expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
95
96	# site errors
97	#
98	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
99	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
100	# 5833374 tokens in 4710 files
101	#
102	# The second is the "Woboq" browser, which has CDATA
103	# Ah I wonder if we need that.
104
105	# Takes ~13 seconds
# Run the HTM8 tool over every tracked *.html file of a deployed site
# checkout that sits next to this repository.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional; when non-empty, run 'parse-htm8' on
#                 ../oils.pub__deploy, otherwise run 'lex-htm8' on
#                 ../../oilshell/oilshell.org__deploy.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # After pushd we are outside the repo, so this script is re-invoked via
  # $REPO_ROOT/$0 (this relies on $0 being relative to the repo root, as in
  # the Usage line at the top of the file).
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
135
# ID of the job directory under github-jobs/ that sync-wwz mirrors.  It can be
# overridden from the environment; the default is the job originally used.
readonly SOIL_ID=${SOIL_ID:-8924}
# Local directory that holds the mirrored .wwz archives for that job.
readonly WWZ_DIR=_tmp/$SOIL_ID
138
# Mirror the remote job directory github-jobs/$SOIL_ID from op.oilshell.org
# into $WWZ_DIR using rsync, creating $WWZ_DIR first if needed.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
144
# Unpack every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (without the .wwz suffix), printing each name as it goes.
#
# Globals: WWZ_DIR (read)
# Outputs: directory-stack lines from the outer pushd/popd, each archive name,
#          and whatever unzip prints.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no matches the glob stays literal; skip it rather than creating a
    # directory called '*'.
    [[ -e "$z" ]] || continue

    # $z has no directory part, so stripping the suffix equals basename.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
160
# Show the contents of $WWZ_DIR with tree(1).
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
164
# Run 'htm8-tool parse-htm8' over every *.html file found under $WWZ_DIR,
# timing the whole pipeline.
#
# Globals: WWZ_DIR (read), REPO_ROOT (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # The file list goes to this script's htm8-tool task on stdin.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
172
# List every file under the home directory whose name matches '*.xml'
# (case-insensitively), echoing the list and saving it to _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the output file if _tmp/ is missing.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
176
# Run 'htm8-tool parse-xml' on the file list saved in _tmp/xml-files.txt,
# which is supplied on stdin.
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
181
# Run 'htm8-tool parse-xml' over every *.xml file under the current directory,
# without descending into anything named '_chroot'.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
187
# Run 'htm8-tool parse-htm8' over every *.html file under the current
# directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
191
# Run 'htm8-tool parse-htm8' over every *.html file under _release/VERSION.
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
195
# Task named 'soil-run': at present it only checks the built release docs by
# delegating to test-docs.
soil-run() {
  test-docs
}
199
200	# OK we have to skip the <script> tag! And <style>
201	#
202	# document.location = '#' + params.join('&');
203	# gUrlHash = new UrlHash(location.hash);
204	#
205	# I think textarea we don't though?
206
207
# Hand the command-line arguments to task-five (sourced from
# $LIB_OSH/task-five.sh; presumably it runs the function named by the first
# argument -- confirm there), then stop with its status so nothing below is
# ever executed.
task-five "$@"
exit "$?"
210
211
# NOTE: unreachable -- the script has already exited above.  The text is kept
# only as reference notes on CDATA vs. RCDATA.  It must still be a complete,
# correctly quoted command so that 'bash -n' and linters accept the file.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
247