data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

152 lines, 49 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5
# Absolute path of the repository root: the parent of the directory holding
# this script.  '&&' (rather than ';') keeps a failed 'cd' from silently
# yielding the caller's working directory instead.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH is only assigned when the caller has not already set it.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
12
13	# parse with lazylex/html.py, or data_lang/htm8.py
14
# Print the paths that 'git ls-files' reports for the current directory and
# that end in '.html', one per line.
#
# Outputs: matching paths on stdout.
# Returns: grep exits non-zero when no path matches.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
21
22	# Issues with lazylex/html.py
23	#
24	# - Token ID is annoying to express in Python
25	# - re.DOTALL for newlines
26	# - can we change that with [.\n]*?
27	# - nongreedy match for --> and ?>
28
# Run lazylex/html.py from the repository, forwarding all arguments unchanged.
#
# Globals:   REPO_ROOT (read)
# Arguments: "$@" - passed straight through to html.py
# Returns:   the exit status of html.py
ht8-tool() {
  # PYTHONPATH is set for this one command only: repo root plus vendor/.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
33
# Write a one-line file containing a bare '&&' to _tmp/bad.html and feed that
# path, on stdin, to 'ht8-tool well-formed'.
#
# Outputs: whatever ht8-tool prints.
# Returns: the status of the final pipeline.
test-well-formed() {
  local bad=_tmp/bad.html

  # Do not assume the scratch directory already exists.
  mkdir -p _tmp

  # Quoted delimiter: the body is literal text, never expanded by the shell.
  cat >"$bad" <<'EOF'
hi && bye
EOF

  echo "$bad" | ht8-tool well-formed
}
40
41	# site errors
42	#
43	# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
44	# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
45	# 5833374 tokens in 4710 files
46	#
47	# The second is the "Woboq" browser, which has CDATA
48	# Ah I wonder if we need that.
49
# Pipe the output of site-files, taken inside the sibling checkout
# ../../oilshell/oilshell.org__deploy (relative to the current directory),
# into this script's own 'ht8-tool well-formed' task, and time the pipeline.
#
# Takes ~13 seconds
#
# Globals: REPO_ROOT (read); $0 is appended to it to re-invoke this script,
#          so $0 has to be a path relative to the repository root.
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  pushd ../../oilshell/oilshell.org__deploy

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
68
# Local directory that receives the copy of the remote github-jobs/8899 tree.
WWZ_DIR=_tmp/8899
readonly WWZ_DIR

# Create $WWZ_DIR if needed and rsync the remote job directory into it.
#
# Globals: WWZ_DIR (read)
sync-wwz() {
  local remote=op.oilshell.org:op.oilshell.org/uuu/github-jobs/8899/

  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose "$remote" "$WWZ_DIR/"
}
76
# Unzip every *.wwz archive found directly in $WWZ_DIR into a sibling
# directory named after the archive (without the .wwz suffix).
#
# Globals: WWZ_DIR (read)
# Outputs: the name of each archive being extracted, plus unzip's own output
#          and the directory stack printed by the outer pushd/popd.
extract-wwz() {
  local z name

  pushd "$WWZ_DIR"
  for z in *.wwz; do
    # When nothing matches, the pattern is left as the literal '*.wwz';
    # skip it instead of creating a directory called '*'.
    [[ -e $z ]] || continue

    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
92
# Display the contents of $WWZ_DIR with the 'tree' command.
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
96
# Find every *.html file below $WWZ_DIR and pipe the list of paths into this
# script's own 'ht8-tool well-formed' task.
#
# Globals: WWZ_DIR (read), REPO_ROOT (read); $0 is appended to REPO_ROOT to
#          re-invoke this script, so $0 has to be relative to the repo root.
check-wwz() {
  pushd "$WWZ_DIR"

  find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
104
105	# OK we have to skip the <script> tag! And <style>
106	#
107	# document.location = '#' + params.join('&');
108	# gUrlHash = new UrlHash(location.hash);
109	#
110	# I think we don't have to skip <textarea>, though?
111
112
# Hand the command-line arguments to task-five, which comes from the
# $LIB_OSH/task-five.sh file sourced near the top of this script.
task-five "$@"

# Leave with task-five's status; nothing after this line is ever executed.
exit "$?"
115
116
# Design notes on <script>, <style> and <textarea> handling, kept as a single
# quoted string.  This command sits after the script's unconditional 'exit',
# so it is never run; the string is closed properly so that 'bash -n' and
# linters can still parse the whole file.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
152