data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

169 lines, 59 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5
# Absolute path of the repository root: the parent of the directory that
# holds this script.  "$0" is quoted so a checkout path containing spaces or
# glob characters still resolves, and '&&' keeps a failed 'cd' from silently
# reporting the caller's working directory instead.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH can be overridden from the environment; when unset it defaults to
# stdlib/osh inside this checkout.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
12
13	# parse with lazylex/html.py, or data_lang/htm8.py
14
# Print the paths of the git-tracked files in the current repository whose
# names end in '.html', one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
21
22	# Issues with lazylex/html.py
23	#
24	# - Token ID is annoying to express in Python
25	# - re.DOTALL for newlines
26	# - can we change that with [.\n]*?
27	# - nongreedy match for --> and ?>
28
# Run lazylex/html.py from this checkout, forwarding all arguments.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to html.py
#
# PYTHONPATH is set for this one command only, to the repo root plus its
# vendor/ directory.
ht8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
33
# Write a small malformed document (an unterminated comment and bare '&'
# characters) to _tmp/bad.html, then send that file's path on stdin to
# 'ht8-tool well-formed'.
test-well-formed() {
  # The fixture lives under _tmp/, which may not exist in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the body is literal text, never expanded by the shell.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}
41
42	# site errors
43	#
44	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or \|\|</h4>\n<!-- 2')
45	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
46	# 5833374 tokens in 4710 files
47	#
48	# The second is the "Woboq" browser, which has CDATA
49	# Ah I wonder if we need that.
50
51	# Takes ~13 seconds
# Check every HTML file listed by site-files in the sibling
# oilshell.org__deploy checkout.
#
# Globals: REPO_ROOT (read)
#
# The file list is piped to a fresh invocation of this script
# ("$REPO_ROOT/$0"), so $0 has to be a path relative to the repo root for
# that to resolve after the directory change.
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Don't go on to list files from the wrong directory if this fails.
  pushd ../../oilshell/oilshell.org__deploy || return

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
69
# Local directory (relative to the current working directory) that holds the
# .wwz archives fetched by sync-wwz and unpacked by extract-wwz.
readonly WWZ_DIR=_tmp/8899
71
# Copy the contents of the remote github-jobs/8899/ directory on
# op.oilshell.org into $WWZ_DIR, creating that directory first.
#
# Globals: WWZ_DIR (read)
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    op.oilshell.org:op.oilshell.org/uuu/github-jobs/8899/ "$WWZ_DIR/"
}
77
# Unpack every *.wwz archive in $WWZ_DIR into a sibling directory named
# after the archive (foo.wwz -> foo/), printing each name as it goes.
#
# Globals: WWZ_DIR (read)
extract-wwz() {
  # Never create directories or unzip in the wrong place.
  pushd "$WWZ_DIR" || return

  local z name
  for z in *.wwz; do
    # With no matches the glob stays as the literal '*.wwz'; skip it.
    [[ -e "$z" ]] || continue

    # Assigned separately from 'local' so a basename failure is not masked.
    name=$(basename "$z" .wwz)

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
93
# Show the directory tree under $WWZ_DIR using tree(1).
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
97
# Check every *.html file found under $WWZ_DIR.
#
# Globals: WWZ_DIR, REPO_ROOT (read)
#
# Paths are listed relative to $WWZ_DIR and piped to a fresh invocation of
# this script ("$REPO_ROOT/$0") running 'ht8-tool well-formed'.
test-wwz() {
  # Don't search some other directory if this fails.
  pushd "$WWZ_DIR" || return

  find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
105
# List every file under the home directory whose name ends in .xml
# (case-insensitive), printing the paths and also saving them to
# _tmp/xml-files.txt.
find-xml() {
  # tee cannot create its output file unless _tmp/ already exists.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
109
# Check the files whose paths are listed in _tmp/xml-files.txt (the list
# that find-xml writes).
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK
  time "$REPO_ROOT/$0" ht8-tool well-formed <_tmp/xml-files.txt
}
115
# Check every *.xml file under the current directory, without descending
# into anything named '_chroot'.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" ht8-tool well-formed
}
121
122	# OK we have to skip the <script> tag! And <style>
123	#
124	# document.location = '#' + params.join('&');
125	# gUrlHash = new UrlHash(location.hash);
126	#
127	# I think textarea we don't though?
128
129
# Hand the command-line arguments to task-five, which comes from the sourced
# $LIB_OSH/task-five.sh (presumably it runs the function named by the first
# argument -- confirm there).
task-five "$@"

# Stop with task-five's exit status.  The notes that follow in this file are
# never executed.
exit
132
133
134	echo '
135	In HTML5, instead of
136	<script>
137	<![CDATA[
138	if (x < y) { ... }
139	]]>
140	</script>
141
142	You can write
143
144	<script>
145	if (x < y) { ... }
146	</script>
147
148	<script> <style> <textarea>
149
150	These have special escaping rules. I guess we just do NOT lex them at all?
151	We can totally SKIP them.
152
153	CDATA vs. RCDATA
154
155	<textarea>
156	&lt;p&gt; <!-- This will show as: <p> -->
157	&amp; <!-- This will show as: & -->
158	</textarea>
159
160	<script>
161	&lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
162	&amp; <!-- This will show literally as: &amp; -->
163	</script>
164
165	The main practical difference is that RCDATA processes HTML entities while
166	CDATA treats them as literal text. Both modes ignore HTML tags (treating them
167	as plain text) except for their own closing tag. '
168	'
169