data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

250 lines, 79 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5	#
6	# TODO:
7	# - Rename to data_lang/htm8.py
8	# - it has NO_SPECIAL_TAGS mode for XML
9	# - put iterators at a higher level in doctools/ ?
10	#
11	# - statically type it
12	# - revive pyannotate
13	# - translate to C++
14	# - how to handle the regexes in the lexer? Port to re2c directly?
15	# - for find(), do we need a C++ primitive for it?
16	# - no allocation for TagName()
17	# - ASDL file for Tok.Foo?
18	# - refactor TagName() API - remove it from the TagLexer?
19	# - that is really the AttrLexer()
20	#
21	# Not working yet:
22	# - understanding all entities &zz;
23	# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
24	# - capital letters <TR/> - I guess we can normalize the case
25	#
26	# Leniency:
27	# - foo=1&bar=2 is extremely common
28	# - well then does that mean you allow <p>a & b</b too?
29	# - and then it's not far from that to <p id="value >"> - the quotes help
30	# - I guess you can have a rule for unescaped &, just like unescaped backslash
31	# - you can warn about it, but it doesn't cause much problem?
32	# We are already firmly in HTML territory, not in XML ...
33	#
34	# Features:
35	# - Are there special rules for <svg> and <math>?
36	# - Do we need to know about <textarea> <pre>? Those don't have the same
37	# whitespace rules
38	#
39	# YSH API
40	# - Generating HTML/HTM8 is much more common than parsing it
41	# - although maybe we can do RemoveComments as a demo?
42	# - that is the lowest level "sed" model
43	# - For parsing, a minimum idea is:
44	# - lexer-based algorithms for query by tag, class name, and id
45	# - and then toTree() - this is a DOM
46	# - .tag and .attrs?
47	# - .innerHTML() and .outerHTML() perhaps
48	# - rewrite ul-table in that?
49	# - does that mean you mutate it, or construct text?
50	# - I think you can set the innerHTML probably
51	#
52	# - Testing of html.ysh aka htm8.ysh in the stdlib
53	#
54	# Cases:
55	# html 'hello <b>world</b>'
56	# html "hello <b>$name</b>"html
57	# html ["hello <b>$name</b>"] # hm this isn't bad, it's an unevaluated expression?
58	# commonmark 'hello world'
59	# md 'hello world'
60	# md ['hello $escape'] ? We don't have a good escaping algorithm
61
62
# Absolute path of the directory one level above this script's directory.
# '&&' (rather than ';') keeps a failed cd from silently yielding the caller's
# current directory as the repo root.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH is only assigned when it is unset, so a caller can point it elsewhere.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
# NOTE(review): presumably bash-strict.sh enables strict shell options and
# task-five.sh defines the task-five dispatcher called at the bottom of this
# file -- neither file is visible here; confirm.
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
69
70	# parse with lazylex/html.py, or data_lang/htm8.py
71
# Print the paths reported by 'git ls-files' (run in the current directory)
# that end in '.html', one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
78
79	# Issues with lazylex/html.py
80	#
81	# - Token ID is annoying to express in Python
82	# - re.DOTALL for newlines
83	# - can we change that with [.\n]*?
84	# - nongreedy match for --> and ?>
85
# Run $REPO_ROOT/lazylex/html.py with the repo root and its vendor/ directory
# on PYTHONPATH.
#
# Globals:   REPO_ROOT (read)
# Arguments: all arguments are forwarded unchanged to html.py
# Returns:   the exit status of html.py
htm8-tool() {
  # The paths are quoted so a checkout path containing spaces still works.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
90
# Write a small malformed document (an unterminated comment and a bare '&&')
# to _tmp/bad.html, then pass that file name on stdin to
# 'htm8-tool well-formed'.
test-well-formed() {
  # The fixture directory may not exist in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded by the shell.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
98
99	# site errors
100	#
101	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
102	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
103	# 5833374 tokens in 4710 files
104	#
105	# The second is the "Woboq" browser, which has CDATA
106	# Ah I wonder if we need that.
107
# Run htm8-tool over every git-tracked .html file of a deployed site checkout.
# Takes ~13 seconds
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional. When non-empty, use ../oils.pub__deploy with the
#                 'parse-htm8' action; otherwise use
#                 ../../oilshell/oilshell.org__deploy with 'lex-htm8'.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file list goes to this same script on stdin. "$REPO_ROOT/$0" only
  # resolves when $0 is a path relative to the repo root (see Usage).
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
138
# Name of the github-jobs/ subdirectory on op.oilshell.org that sync-wwz
# mirrors, and the local directory it is mirrored into.
readonly SOIL_ID=8924
readonly WWZ_DIR="_tmp/$SOIL_ID"
141
# Mirror op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/ into
# $WWZ_DIR with rsync, creating $WWZ_DIR first if needed.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
147
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each directory name as it goes.
#
# Globals: WWZ_DIR (read)
extract-wwz() {
  local z
  local name

  pushd "$WWZ_DIR"
  for z in *.wwz; do
    # With no archives the glob stays literal; skip it instead of creating a
    # directory named '*'.
    [[ -e "$z" ]] || continue

    # Assigned separately from 'local' so a basename failure is not masked.
    name=$(basename "$z" .wwz)

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
163
# Show the contents of $WWZ_DIR with the 'tree' command.
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
167
# Run 'htm8-tool parse-htm8' over every *.html file found under $WWZ_DIR.
#
# Globals: WWZ_DIR (read), REPO_ROOT (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # File names are piped to this same script on stdin; "$REPO_ROOT/$0" only
  # resolves when $0 is a path relative to the repo root.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
175
# List every file under the home directory whose name matches *.xml
# (case-insensitive), printing the list and saving it to _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the output file if _tmp/ is missing.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
179
# Run 'htm8-tool parse-xml' with _tmp/xml-files.txt (the list written by
# find-xml) on stdin.
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
184
# Run 'htm8-tool parse-xml' over every *.xml file under the current directory,
# without descending into anything named '_chroot'.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
190
# Run 'htm8-tool parse-htm8' over every *.html file under the current
# directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
194
# Run 'htm8-tool parse-htm8' over every *.html file under _release/VERSION.
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
198
# Task that runs only test-docs.
# NOTE(review): the name suggests this is the entry point used by the Soil CI;
# confirm against the CI configuration.
soil-run() {
  test-docs
}
202
203	# OK we have to skip the <script> tag! And <style>
204	#
205	# document.location = '#' + params.join('&');
206	# gUrlHash = new UrlHash(location.hash);
207	#
208	# I think textarea we don't though?
209
210
# Hand the command line to task-five (not defined in this file; presumably
# provided by the sourced task-five.sh -- confirm). The bare 'exit' ends the
# script with task-five's status, so nothing after this point is executed.
task-five "$@"
exit
213
214
# Design notes on <script>, <style> and <textarea> handling, kept as one
# single-quoted string. The string must stay balanced so 'bash -n' accepts
# the file.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
250