data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

185 lines, 59 significant

#!/usr/bin/env bash
#
# Usage:
#   data_lang/htm8-test.sh <function name> [args...]
#
# TODO:
# - Rename to DML8?  Because it can handle XML
# - CDATA in XML, which is not a script
#
# Operations / Levels:
#
# - Lexing
#   - lex-tags
#   - lex-attrs - validate all Start tags, all StartEnd tags
#   - lex-quoted-values - unescaping, etc.
#     - are there invalid entities?
# - Parsing
#   - well-formed / tag balance check
# - Schema
#   - not sure if we check the HTML schema or not - it might be too restrictive
21
# Absolute path of the directory one level above this script's directory.
# Every expansion is quoted so a checkout path containing spaces still works,
# and `&&` keeps a failed `cd` from silently reporting the caller's directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; otherwise it defaults to stdlib/osh.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
28
29	# parse with lazylex/html.py, or data_lang/htm8.py
30
# Print the files known to git (per `git ls-files`, relative to the current
# directory) whose names end in .html, one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
37
# Issues with lazylex/html.py
#
# - Token ID is annoying to express in Python
# - re.DOTALL for newlines
#   - can we change that with [.\n]*?
# - nongreedy match for --> and ?>
44
# Run lazylex/html.py from the repo with PYTHONPATH covering the repo root and
# its vendor/ directory.
#
# Arguments: passed through unchanged to html.py (e.g. `well-formed`).
ht8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
49
# Write a small HTML file containing an unterminated comment and a bare `&&`
# to _tmp/bad.html, then feed its path on stdin to `ht8-tool well-formed`.
test-well-formed() {
  # The fixture lives in _tmp/, which may not exist in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded by the shell.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF

  echo '_tmp/bad.html' | ht8-tool well-formed
}
57
# site errors
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second is the "Woboq" browser, which has CDATA
# Ah I wonder if we need that.
66
# Run the well-formed check over every git-tracked .html file in the
# oilshell.org__deploy checkout, located at ../../oilshell/ relative to the
# directory this function is called from.
#
# Takes ~13 seconds
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Stop here if the checkout is missing, rather than scanning the wrong tree.
  pushd ../../oilshell/oilshell.org__deploy || return

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file list is piped to a fresh invocation of this script, so ht8-tool
  # reads the paths from stdin.
  # NOTE(review): "$REPO_ROOT/$0" only resolves if $0 is this script's
  # repo-relative path - confirm how callers invoke it.
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
85
# Local directory (relative to the current directory) holding the files
# mirrored from GitHub job 8899.
readonly WWZ_DIR=_tmp/8899

# Mirror the job's directory from op.oilshell.org into $WWZ_DIR with rsync,
# creating the local directory first.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    op.oilshell.org:op.oilshell.org/uuu/github-jobs/8899/ "$WWZ_DIR/"
}
93
# Unzip each *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (without the .wwz suffix), printing each name as it goes.
extract-wwz() {
  pushd "$WWZ_DIR" || return

  local z name
  for z in *.wwz; do
    # With no archives the glob stays literal; skip it instead of creating a
    # directory called '*'.
    [[ -e "$z" ]] || continue

    # $z comes from a glob in the current directory, so it has no directory
    # part and stripping the suffix is all that is needed.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null || return

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
109
# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}
113
# Run the well-formed check over every *.html file found under $WWZ_DIR.
test-wwz() {
  pushd "$WWZ_DIR" || return

  # The paths are piped to a fresh invocation of this script, so ht8-tool
  # reads them from stdin.
  # NOTE(review): "$REPO_ROOT/$0" only resolves if $0 is this script's
  # repo-relative path - confirm how callers invoke it.
  time find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
121
# List every file under the home directory whose name matches *.xml
# (case-insensitively), printing the list and saving it to _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the output file unless its directory exists.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
125
# Run the well-formed check over the paths listed in _tmp/xml-files.txt (the
# file written by find-xml).
test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK

  # The list is redirected to stdin directly; no separate `cat` is needed.
  time "$REPO_ROOT/$0" ht8-tool well-formed <_tmp/xml-files.txt
}
131
# Run the well-formed check over every *.xml file under the current directory,
# without descending into anything named _chroot.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" ht8-tool well-formed
}
137
# OK we have to skip the <script> tag!  And <style>
#
# document.location = '#' + params.join('&');
# gUrlHash = new UrlHash(location.hash);
#
# I don't think we have to skip <textarea>, though?
144
145
# Hand the command line to task-five, which comes from the task-five.sh
# library sourced near the top of this file.
task-five "$@"

# Stop here so bash never reaches the free-form notes below.
exit
148
149
# Design notes kept as a string literal. The script exits before reaching
# this point, so the text is never printed. The quote is closed on the last
# line of text so syntax checks such as `bash -n` and linters can still read
# the whole file.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
185