OILS / data_lang / htm8-test.sh View on Github | oils.pub

232 lines, 86 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7#
8# - htm8.py should use one-pass algorithm
9# - micro-syntax should check all errors
10# - with tests
11# - and then download CommonCrawl data set?
12#
13# - translate to C++
14# - how to handle the regexes in the lexer? Port to re2c directly?
15# - for find(), do we need a C++ primitive for it?
16# - no allocation for TagName()
17#
18# re2c considerations:
19# - We need to use CAPTURES, so we can't use frontend/match directly
20# - Could we STREAM the lexer?
21# - Instead of sentinel model, use something else!
22# - default is sentinel with padding, and there is YYFILL with padding
23# - there is also the separate --storable-state option
# - because this can be used for queries that don't allocate
25# - I may also want to do this with JSON
26#
27# Features:
28# - Are there special rules for <svg> and <math>?
29# - Do we need to know about <textarea> <pre>? Those don't have the same
30# whitespace rules
31
32
# Absolute path of the directory one level above the one holding this script.
# "$0" is quoted so that a checkout path containing whitespace still works.
REPO_ROOT=$(cd "$(dirname "$0")/.."; pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps any value it already has; the default below is only assigned
# when the variable is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
39
# Print the .html paths that git tracks under the current directory, one per
# line.
site-files() {
  # Disabled alternative that walks the deploy tree directly:
  #   find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Going through git omits all the _ files
  git ls-files \
    | grep '\.html$'
}
46
47# Issues with lazylex/html.py
48#
49# - Token ID is annoying to express in Python
50# - re.DOTALL for newlines
51# - can we change that with [.\n]*?
52# - nongreedy match for --> and ?>
53
# Run data_lang/htm8_util.py from the repo, forwarding every argument.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to htm8_util.py
#
# PYTHONPATH is set for this one command only, to the repo root plus vendor/.
# The script path is quoted so a repo path containing spaces still resolves.
htm8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
58
# Write two small malformed HTML fixtures into _tmp/ and run both the
# quick-scan and lex-htm8 actions of htm8-tool over each of them.  The file
# name is fed to the tool on stdin.
test-quick-scan() {
  # The fixtures are written under _tmp/, which may not exist yet.
  mkdir -p _tmp

  # Quoted delimiter: the fixture text is literal, nothing is expanded.
  cat >_tmp/bad-top.html <<'EOF'
unfinished <!--
hi && bye
EOF

  # NOTE(review): errexit is switched off from here on, presumably so that all
  # four invocations run even when the tool exits non-zero on these bad
  # inputs - confirm.
  set +o errexit
  echo '_tmp/bad-top.html' | htm8-tool quick-scan

  echo '_tmp/bad-top.html' | htm8-tool lex-htm8

  cat >_tmp/bad-attr.html <<'EOF'
hi <a href !>
EOF

  echo '*** bad-attr quick-scan'
  echo '_tmp/bad-attr.html' | htm8-tool quick-scan

  echo '*** bad-attr lex-htm8'
  echo '_tmp/bad-attr.html' | htm8-tool lex-htm8
}
80
81# site errors
82#
83# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
84# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
85# 5833374 tokens in 4710 files
86#
87# The second is the "Woboq" browser, which has CDATA
88# Ah I wonder if we need that.
89
90# Takes ~13 seconds
# Run htm8-tool over every tracked .html file of a deployed site checkout.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional.  When non-empty, ../oils.pub__deploy is checked
#                 with parse-htm8; otherwise
#                 ../../oilshell/oilshell.org__deploy is checked with lex-htm8.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if [[ -n "$new_site" ]]; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs: the file names are piped into a second invocation of this
  # script, addressed as $REPO_ROOT/$0 because the working directory has just
  # changed.  That only resolves when $0 is a path relative to the repo root,
  # as in the Usage line at the top of the file.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
120
# SOIL_ID is the ID used in the github-jobs/ path that sync-wwz fetches;
# WWZ_DIR is the local directory the *-wwz tasks work in.
SOIL_ID=8924
WWZ_DIR=_tmp/$SOIL_ID
readonly SOIL_ID WWZ_DIR
123
# Copy the github-jobs/$SOIL_ID/ directory from op.oilshell.org into
# $WWZ_DIR with rsync, creating $WWZ_DIR first if needed.
sync-wwz() {
  mkdir -p "$WWZ_DIR"

  local remote="op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/"
  rsync --archive --verbose "$remote" "$WWZ_DIR/"
}
129
# Unpack every *.wwz archive found directly in $WWZ_DIR into a sibling
# directory named after the archive (foo.wwz -> foo/), printing each name.
#
# Globals: WWZ_DIR (read)
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # When nothing matches, the pattern is left as the literal string '*.wwz'.
    # Skip it rather than creating a directory from a glob.
    [[ -e "$z" ]] || continue

    # Strip the extension; $z comes from a glob in the current directory, so
    # it has no directory part.
    name=${z%.wwz}

    mkdir -p -- "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
145
# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}
149
# Run parse-htm8 over every *.html file found under $WWZ_DIR.
#
# Globals: WWZ_DIR (read), REPO_ROOT (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # The working directory has changed, so this script is re-invoked through
  # $REPO_ROOT/$0; quoted so a repo path with spaces still resolves.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
157
# List every file under the home directory whose name matches *.xml,
# ignoring case.  The list is printed and also saved to _tmp/xml-files.txt.
find-xml() {
  time find ~ -iname '*.xml' \
    | tee _tmp/xml-files.txt
}
161
# Run parse-xml over the file names listed in _tmp/xml-files.txt.
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  time "$REPO_ROOT/$0" htm8-tool parse-xml < _tmp/xml-files.txt
}
166
# Run parse-xml over every *.xml file below the current directory, without
# descending into anything named _chroot.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
172
# Run parse-htm8 over every *.html file below the current directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
176
# Run parse-htm8 over every *.html file under _release/VERSION (relative to
# the current directory).
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
180
# Runs test-docs with no arguments and returns its status.
# NOTE(review): presumably the entry point used by the Soil CI - confirm.
soil-run() {
  test-docs
}
184
185# OK we have to skip the <script> tag! And <style>
186#
187# document.location = '#' + params.join('&');
188# gUrlHash = new UrlHash(location.hash);
189#
# I don't think we need to skip <textarea>, though?
191
192
# Hand the command line to task-five (defined outside this file's visible
# functions; presumably it runs the function named by the first argument),
# then stop.
task-five "$@"
exit

# Nothing below the 'exit' above is ever executed.  The echo only holds design
# notes.  It must still be a well-formed string so that 'bash -n' and linters
# accept the file.

echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt; <!-- This will show as: <p> -->
  &amp; <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
  &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
232