OILS / data_lang / htm8-test.sh View on Github | oils.pub

227 lines, 86 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7#
8# - translate to C++
9# - how to handle the regexes in the lexer? Port to re2c directly?
10# - for find(), do we need a C++ primitive for it?
11# - no allocation for TagName()
12#
13# re2c considerations:
14# - We need to use CAPTURES, so we can't use frontend/match directly
15# - Could we STREAM the lexer?
16# - Instead of sentinel model, use something else!
17# - default is sentinel with padding, and there is YYFILL with padding
18# - there is also the separate --storable-state option
#     - because this can be used for queries that don't allocate
20# - I may also want to do this with JSON
21#
22# Features:
23# - Are there special rules for <svg> and <math>?
24# - Do we need to know about <textarea> <pre>? Those don't have the same
25# whitespace rules
26
27
# Absolute path of the directory one level above the one holding this script.
# "$0" is quoted so a checkout path containing spaces still resolves.
REPO_ROOT=$(cd "$(dirname "$0")/.."; pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps any value already set by the caller; otherwise it defaults to
# the stdlib/osh directory under REPO_ROOT.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
34
site-files() {
  # Print the git-tracked files of the current directory whose names end in
  # ".html", one per line.
  #
  # Alternative that walks the filesystem instead of the index:
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files \
    | grep '\.html$'
}
41
42# Issues with lazylex/html.py
43#
44# - Token ID is annoying to express in Python
45# - re.DOTALL for newlines
46# - can we change that with [.\n]*?
47# - nongreedy match for --> and ?>
48
# Run data_lang/htm8_util.py from the repo, forwarding all arguments.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to htm8_util.py
# Returns:   exit status of htm8_util.py
htm8-tool() {
  # PYTHONPATH covers the repo root and its vendor/ directory for this one
  # invocation only.  The program path is quoted so a REPO_ROOT containing
  # spaces is not word-split.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
53
# Feed two deliberately malformed HTML fixtures to `htm8-tool quick-scan` and
# `htm8-tool lex-htm8`.  Each tool receives the fixture's path on stdin.
#
# Outputs: whatever the tools print, plus '*** ...' banner lines.
test-quick-scan() {
  # The fixtures live in _tmp/; create it so the redirects below cannot fail
  # on a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture text is literal, no expansion wanted.
  cat >_tmp/bad-top.html <<'EOF'
unfinished <!--
hi && bye
EOF

  # Keep going after a failing command from here on -- presumably the tools
  # exit non-zero on these bad inputs.
  set +o errexit
  echo '_tmp/bad-top.html' | htm8-tool quick-scan

  echo '_tmp/bad-top.html' | htm8-tool lex-htm8

  cat >_tmp/bad-attr.html <<'EOF'
hi <a href !>
EOF

  echo '*** bad-attr quick-scan'
  echo '_tmp/bad-attr.html' | htm8-tool quick-scan

  echo '*** bad-attr lex-htm8'
  echo '_tmp/bad-attr.html' | htm8-tool lex-htm8
}
75
76# site errors
77#
78# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
79# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
80# 5833374 tokens in 4710 files
81#
82# The second is the "Woboq" browser, which has CDATA
83# Ah I wonder if we need that.
84
# Takes ~13 seconds
#
# Run htm8-tool over every file listed by site-files in a deployed-site
# checkout that sits next to this repo.
#
# Globals:   REPO_ROOT (read)
# Arguments: $1 - optional.  Non-empty: run parse-htm8 in ../oils.pub__deploy.
#                 Empty/absent: run lex-htm8 in
#                 ../../oilshell/oilshell.org__deploy.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file list goes to a fresh run of this script on stdin.  Note that
  # "$REPO_ROOT/$0" only resolves when $0 is a path relative to REPO_ROOT.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
115
# SOIL_ID names the github-jobs/ directory that sync-wwz copies from
# op.oilshell.org; WWZ_DIR is the local directory that receives it.
SOIL_ID=8924
WWZ_DIR=_tmp/$SOIL_ID
readonly SOIL_ID WWZ_DIR
118
# Copy the github-jobs/$SOIL_ID directory from op.oilshell.org into WWZ_DIR
# with rsync, creating WWZ_DIR first.
#
# Globals: SOIL_ID (read), WWZ_DIR (read)
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  # Trailing slashes on both sides: copy the directory's contents rather
  # than the directory itself.
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
124
# Unzip every *.wwz archive in WWZ_DIR into a sibling directory named after
# the archive without its .wwz suffix.
#
# Globals: WWZ_DIR (read)
# Outputs: the name of each archive as it is extracted, plus unzip's output
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no archives present the pattern stays literal; skip it instead of
    # creating a bogus directory and unzipping a file that does not exist.
    [[ -e "$z" ]] || continue

    # $z comes from a glob in the current directory, so stripping the suffix
    # is all that is needed to get the bare name.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
140
# Show the contents of WWZ_DIR with tree(1).
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
144
# Run `htm8-tool parse-htm8` over every *.html file found under WWZ_DIR.
#
# Globals: REPO_ROOT (read), WWZ_DIR (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # The file list goes to a fresh run of this script on stdin.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
152
# List every file under the home directory whose name matches *.xml
# (case-insensitively), both on stdout and in _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the list file unless _tmp/ exists.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
156
# Run `htm8-tool parse-xml` over the paths listed in _tmp/xml-files.txt
# (the file that find-xml writes).
#
# Globals: REPO_ROOT (read)
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  # The list is redirected straight into the tool; no `cat` needed.
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
161
# Run `htm8-tool parse-xml` over every *.xml file under the current
# directory, without descending into anything named _chroot.
#
# Globals: REPO_ROOT (read)
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
167
# Run `htm8-tool parse-htm8` over every *.html file under the current
# directory.
#
# Globals: REPO_ROOT (read)
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
171
# Run `htm8-tool parse-htm8` over every *.html file under _release/VERSION.
#
# Globals: REPO_ROOT (read)
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
175
soil-run() {
  # The whole job is currently just the docs check.
  test-docs
}
179
180# OK we have to skip the <script> tag! And <style>
181#
182# document.location = '#' + params.join('&');
183# gUrlHash = new UrlHash(location.hash);
184#
185# I think textarea we don't though?
186
187
# Hand the command line to task-five, then stop: everything after `exit` is
# free-form notes that must never be executed.
task-five "$@"
exit
190
191
192echo '
193In HTML5, instead of
194<script>
195<![CDATA[
196 if (x < y) { ... }
197]]>
198</script>
199
200You can write
201
202<script>
203 if (x < y) { ... }
204</script>
205
206<script> <style> <textarea>
207
208These have special escaping rules. I guess we just do NOT lex them at all?
209We can totally SKIP them.
210
211CDATA vs. RCDATA
212
213<textarea>
214 &lt;p&gt; <!-- This will show as: <p> -->
215 &amp; <!-- This will show as: & -->
216</textarea>
217
218<script>
219 &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
220 &amp; <!-- This will show literally as: &amp; -->
221</script>
222
223The main practical difference is that RCDATA processes HTML entities while
224CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
227