OILS / data_lang / htm8-test.sh View on Github | oils.pub

234 lines, 79 significant
#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7# - Move code into data_lang/htm8.py
8# - iterators stay in lazylex/html.py?
9#
10# - statically type it
11# - revive pyannotate
12# - translate to C++
13# - how to handle the regexes in the lexer? Port to re2c directly?
14# - for find(), do we need a C++ primitive for it?
15# - no allocation for TagName()
16# - ASDL file for Tok.Foo?
17# - remove TagName() from TagLexer(), it is on the Htm8Lexer
18#
19# re2c considerations:
20# - We need to use CAPTURES, so we can't use frontend/match directly
21# - Could we STREAM the lexer?
22# - Instead of sentinel model, use something else!
23# - default is sentinel with padding, and there is YYFILL with padding
24# - there is also the separate --storable-state option
# - because this can be used for queries that don't allocate
26# - I may also want to do this with JSON
27#
28# Not working yet:
29# - understanding all entities &zz;
30# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
31# - capital letters <TR/> - I guess we can normalize the case
32#
33# Leniency:
34# - foo=1&bar=2 is extremely common
# - well then does that mean you allow <p>a & b</p> too?
36# - and then it's not far from that to <p id="value >"> - the quotes help
37# - I guess you can have a rule for unescaped &, just like unescaped backslash
38# - you can warn about it, but it doesn't cause much problem?
39# We are already firmly in HTML territory, not in XML ...
40#
41# Features:
42# - Are there special rules for <svg> and <math>?
43# - Do we need to know about <textarea> <pre>? Those don't have the same
44# whitespace rules
45
46
# Absolute path of the directory above the one that holds this script.
# '&&' (rather than ';') keeps a failed cd from silently reporting the current
# directory as the repo root.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH is only assigned here when it is not already set by the caller.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
53
54# parse with lazylex/html.py, or data_lang/htm8.py
55
# Print every git-tracked path ending in .html, one per line, relative to the
# current directory.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Listing tracked files only is what omits all the _ files
  git ls-files | grep -E '[.]html$'
}
62
63# Issues with lazylex/html.py
64#
65# - Token ID is annoying to express in Python
66# - re.DOTALL for newlines
67# - can we change that with [.\n]*?
68# - nongreedy match for --> and ?>
69
# Run lazylex/html.py with the repo root and its vendor/ directory on
# PYTHONPATH.
#
# Arguments: passed through unchanged to lazylex/html.py
htm8-tool() {
  # Quoted so that a checkout path containing spaces still works
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
74
# Write a small malformed HTML fixture (an unfinished comment plus bare '&&')
# to _tmp/bad.html, then feed its path on stdin to 'htm8-tool well-formed'.
test-well-formed() {
  # _tmp/ may not exist yet, e.g. in a fresh checkout
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
82
83# site errors
84#
85# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
86# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
87# 5833374 tokens in 4710 files
88#
89# The second is the "Woboq" browser, which has CDATA
90# Ah I wonder if we need that.
91
# Takes ~13 seconds
#
# Run the HTM8 tool over every tracked .html file of a site checkout.
#
# Arguments:
#   $1 - optional.  When non-empty, ../oils.pub__deploy is checked with the
#        'parse-htm8' action; otherwise ../../oilshell/oilshell.org__deploy
#        is checked with 'lex-htm8'.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if [[ -n "$new_site" ]]; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The script re-invokes itself via $REPO_ROOT/$0 because the working
  # directory has changed; this assumes $0 is a path relative to the repo root.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
122
# ID of the CI job whose .wwz archives the *-wwz tasks work on, and the local
# directory they are mirrored into.
declare -r SOIL_ID=8924
declare -r WWZ_DIR="_tmp/${SOIL_ID}"
125
# Mirror the github-jobs/$SOIL_ID directory from op.oilshell.org into
# $WWZ_DIR, creating $WWZ_DIR first if needed.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
131
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each name as it goes.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no archives the glob stays as the literal '*.wwz'; skip it rather
    # than creating a directory called '*'.
    [[ -e "$z" ]] || continue

    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
147
# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}
151
# Run 'htm8-tool parse-htm8' over every *.html file found under $WWZ_DIR.
test-wwz() {
  pushd "$WWZ_DIR"

  # Re-invoke this script by path, since the working directory has changed
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
159
# List every *.xml file (case-insensitive) under the home directory, printing
# the paths and also saving them to _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the list if _tmp/ is missing
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
163
# Run 'htm8-tool parse-xml' over the paths listed in _tmp/xml-files.txt
# (the file that find-xml writes).
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
168
# Run 'htm8-tool parse-xml' over every *.xml file under the current
# directory, without descending into anything named _chroot.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
174
# Run 'htm8-tool parse-htm8' over every *.html file under the current
# directory.
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
178
# Run 'htm8-tool parse-htm8' over every *.html file under _release/VERSION
# (relative to the current directory).
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
182
# Presumably the task the Soil CI job invokes (going by the name); it runs
# only the docs check.
soil-run() { test-docs; }
186
187# OK we have to skip the <script> tag! And <style>
188#
189# document.location = '#' + params.join('&');
190# gUrlHash = new UrlHash(location.hash);
191#
# I think we don't have to skip <textarea>, though?
193
194
# Hand the command line to task-five, which comes from the sourced
# task-five.sh (presumably it runs the function named by the first argument).
task-five "$@"

# Stop here, passing on the status of the last command, so the shell never
# reads the free-form notes that follow.
exit "$?"
197
198
# Free-form notes on <script> / <style> / <textarea> escaping, kept as one
# single-quoted string.  In the full script this sits after the top-level
# 'exit', so it is never printed.
echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt; <!-- This will show as: <p> -->
  &amp; <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
  &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
234