data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

194 lines, 66 significant

1	#!/usr/bin/env bash
#
# Usage:
#   data_lang/htm8-test.sh <function name>
#
# TODO:
# - Rename to DML8?  Because it can handle XML
# - CDATA in XML, which is not a script
#
# Operations / Levels:
#
# - Lexing
#   - lex-tags
#   - lex-attrs - validate all Start tags, all StartEnd tags
#   - lex-quoted-values - unescaping, etc.
#     - are there invalid entities?
# - Parsing
#   - well-formed / tag balance check
# - Schema
#   - not sure if we check the HTML schema or not - it might be too restrictive
21
# Absolute path of the directory one level above the one containing this
# script.  $0 is quoted so a checkout path with spaces still resolves, and
# 'pwd' only runs if the 'cd' succeeded.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps a value already set by the caller (even an empty one); the
# default is only used when it is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
28
29	# parse with lazylex/html.py, or data_lang/htm8.py
30
# Print the paths of the *.html files known to git in the current repository,
# one per line.
#
# Outputs: matching paths on stdout, relative to the current directory.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  # (git ls-files is used instead of find for that reason, per the note above)
  git ls-files | grep '\.html$'
}
37
# Issues with lazylex/html.py
#
# - Token ID is annoying to express in Python
# - re.DOTALL for newlines
#   - can we change that with [.\n]*?
# - nongreedy match for --> and ?>
44
# Run lazylex/html.py from the repo with PYTHONPATH pointing at the repo root
# and its vendor/ directory.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to html.py
# Returns:   the exit status of html.py
ht8-tool() {
  # The command path is quoted so a REPO_ROOT containing spaces still works.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
49
# Write a small HTML fixture (an unterminated comment followed by a raw '&&')
# to _tmp/bad.html and feed its path, on stdin, to 'ht8-tool well-formed'.
#
# Returns: the exit status of ht8-tool
test-well-formed() {
  # The fixture lives under _tmp/, so make sure that directory exists.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is written literally, with no expansion.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF

  echo '_tmp/bad.html' | ht8-tool well-formed
}
57
# site errors
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second is the "Woboq" browser, which has CDATA
# Ah I wonder if we need that.
66
# Run 'ht8-tool well-formed' over every tracked *.html file of a deployed
# site checkout.  Takes ~13 seconds.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional; any non-empty value selects ../oils.pub__deploy,
#                 otherwise ../../oilshell/oilshell.org__deploy is used.
#                 Both paths are relative to the current directory.
# Outputs:   pushd/popd directory stacks and the tool's output on stdout;
#            timing on stderr
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Local, so the chosen directory doesn't leak into the caller's globals.
  local dir
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
  else
    dir='../../oilshell/oilshell.org__deploy'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # After the pushd a relative $0 no longer resolves, so the script is
  # re-invoked through $REPO_ROOT.  This assumes $0 is relative to the repo
  # root, as in the documented usage "data_lang/htm8-test.sh".
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
93
# Job ID used as the path component under github-jobs/ on op.oilshell.org
# (see sync-wwz).
readonly SOIL_ID=8915
# Local directory the job's files are mirrored into, relative to the current
# directory.
readonly WWZ_DIR="_tmp/$SOIL_ID"
96
# Mirror the files of job $SOIL_ID from op.oilshell.org into $WWZ_DIR,
# creating the directory first.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
# Returns: the exit status of rsync
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  # Trailing slashes on both sides: copy the directory's contents, not the
  # directory itself.
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
102
# Unpack every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (without the .wwz suffix), printing each name as it goes.
#
# Globals: WWZ_DIR (read)
# Outputs: pushd/popd directory stacks, archive names and unzip's output on
#          stdout
extract-wwz() {
  pushd "$WWZ_DIR"

  # Declared separately from the assignment so a failing basename isn't
  # masked by 'local', and so the loop variable doesn't leak.
  local z name
  for z in *.wwz; do
    # With no archives the glob stays literal; skip it instead of creating a
    # directory called '*'.
    [[ -e $z ]] || continue

    name=$(basename "$z" .wwz)

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
118
# Show the contents of $WWZ_DIR with the 'tree' command.
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
122
# Run 'ht8-tool well-formed' over every *.html file found under $WWZ_DIR.
#
# Globals: REPO_ROOT (read), WWZ_DIR (read)
# Outputs: pushd/popd directory stacks and the tool's output on stdout;
#          timing on stderr
test-wwz() {
  pushd "$WWZ_DIR"

  # The script is re-invoked via $REPO_ROOT because a relative $0 no longer
  # resolves after the pushd.
  time find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
130
131	find-xml() {
132	time find ~ -iname '*.xml' \| tee _tmp/xml-files.txt
133	}
134
# Run 'ht8-tool well-formed' over the paths listed in _tmp/xml-files.txt
# (the file written by find-xml).
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # problem with &ent1;
  # CDATA support!  haha OK
  # The list is fed by redirection rather than through 'cat'.
  time "$REPO_ROOT/$0" ht8-tool well-formed < _tmp/xml-files.txt
}
140
# Run 'ht8-tool well-formed' over every *.xml file under the current
# directory, without descending into anything named _chroot.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" ht8-tool well-formed
}
146
# OK we have to skip the <script> tag!  And <style>
#
#   document.location = '#' + params.join('&');
#   gUrlHash = new UrlHash(location.hash);
#
# I think we don't have to skip <textarea>, though?
153
154
# Hand the command-line arguments to task-five, then stop.  A bare 'exit'
# returns the status of the last command, i.e. of task-five.
# NOTE(review): task-five comes from $LIB_OSH/task-five.sh, which is not
# visible here; presumably it runs the function named by $1 with the
# remaining arguments -- confirm.
task-five "$@"
exit

# Design notes.  These used to be an unreachable echo '...' after the exit,
# followed by a stray single quote that left the file with an unterminated
# string (so 'bash -n' rejected it).  As comments they cannot break parsing.
#
# In HTML5, instead of
#
#   <script>
#   <![CDATA[
#     if (x < y) { ... }
#   ]]>
#   </script>
#
# You can write
#
#   <script>
#     if (x < y) { ... }
#   </script>
#
# <script> <style> <textarea>
#
# These have special escaping rules.  I guess we just do NOT lex them at all?
# We can totally SKIP them.
#
# CDATA vs. RCDATA
#
#   <textarea>
#     <p>  <!-- This will show as: <p> -->
#     &    <!-- This will show as: & -->
#   </textarea>
#
#   <script>
#     <p>  <!-- This will show literally as: <p> -->
#     &    <!-- This will show literally as: & -->
#   </script>
#
# The main practical difference is that RCDATA processes HTML entities while
# CDATA treats them as literal text.  Both modes ignore HTML tags (treating them
# as plain text) except for their own closing tag.
194