OILS / data_lang / htm8-test.sh View on Github | oils.pub

225 lines, 79 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7# - Move code into data_lang/htm8.py
8# - iterators stay in lazylex/html.py?
9#
10# - statically type it
11# - revive pyannotate
12# - translate to C++
13# - how to handle the regexes in the lexer? Port to re2c directly?
14# - for find(), do we need a C++ primitive for it?
15# - no allocation for TagName()
16# - ASDL file for Tok.Foo?
17# - remove TagName() from TagLexer(), it is on the Htm8Lexer
18#
19# re2c considerations:
20# - We need to use CAPTURES, so we can't use frontend/match directly
21# - Could we STREAM the lexer?
22# - Instead of sentinel model, use something else!
23# - default is sentinel with padding, and there is YYFILL with padding
24# - there is also the separate --storable-state option
# - because this can be used for queries that don't allocate
26# - I may also want to do this with JSON
27#
28# Not working yet:
29# - capital letters <TR/> - I guess we can normalize the case
30# - islower()
31#
32# Features:
33# - Are there special rules for <svg> and <math>?
34# - Do we need to know about <textarea> <pre>? Those don't have the same
35# whitespace rules
36
37
# Absolute path of the repository root: the parent of the directory that
# contains this script.  "$0" is quoted so a checkout path with spaces works,
# and '&&' keeps a failed cd from silently yielding the caller's directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; otherwise default to the in-repo stdlib.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
44
45# parse with lazylex/html.py, or data_lang/htm8.py
46
# Print the git-tracked files under the current directory whose names end in
# .html, one per line.
site-files() {
  # Alternative that walks the filesystem instead of the git index:
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Listing tracked files omits all the _ files
  git ls-files \
    | grep '\.html$'
}
53
54# Issues with lazylex/html.py
55#
56# - Token ID is annoying to express in Python
57# - re.DOTALL for newlines
58# - can we change that with [.\n]*?
59# - nongreedy match for --> and ?>
60
# Run lazylex/html.py from the repo with PYTHONPATH covering the repo root and
# its vendor/ directory.  All arguments are forwarded unchanged; stdin is
# inherited, so callers can pipe a list of file names into it.
htm8-tool() {
  # Quote the command word so a checkout path containing spaces still works
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
65
# Write a small malformed HTML fixture (unterminated comment, bare '&&') to
# _tmp/bad.html and feed its path to 'htm8-tool well-formed' on stdin.
test-well-formed() {
  # _tmp/ may not exist yet, e.g. in a fresh checkout
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never subject to expansion
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
73
74# site errors
75#
76# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
77# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
78# 5833374 tokens in 4710 files
79#
80# The second is the "Woboq" browser, which has CDATA
81# Ah I wonder if we need that.
82
# Run htm8-tool over every tracked .html file of a deployed site checkout.
#
# Arguments:
#   $1 - optional; when non-empty, parse ../oils.pub__deploy with 'parse-htm8'.
#        Otherwise lex ../../oilshell/oilshell.org__deploy with 'lex-htm8'.
#
# Takes ~13 seconds
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs.  Call the function directly rather than re-running
  # $REPO_ROOT/$0: that only resolves when $0 is a path relative to the repo
  # root, and htm8-tool already uses absolute paths so it works after pushd.
  time site-files | htm8-tool "$action"

  popd
}
113
# Job number under github-jobs/ on op.oilshell.org whose files are mirrored
# locally, and the local directory that holds the mirror.
SOIL_ID=8924
WWZ_DIR=_tmp/$SOIL_ID
readonly SOIL_ID WWZ_DIR
116
# Mirror the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync,
# creating $WWZ_DIR first if needed.
sync-wwz() {
  local remote=op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/

  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose "$remote" "$WWZ_DIR/"
}
122
# Unpack every *.wwz archive found directly in $WWZ_DIR into a sibling
# directory named after the archive (foo.wwz -> foo/), printing each name.
# The working directory is restored before returning.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no matches the pattern stays literal; skip it rather than creating
    # a directory called '*' and running unzip on a non-existent file.
    if ! test -e "$z"; then
      continue
    fi

    # $z comes from a glob in the current directory, so it has no slashes and
    # stripping the suffix is equivalent to basename.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
138
# Show the contents of $WWZ_DIR using tree(1).
tree-wwz() { tree "$WWZ_DIR"; }
142
# Parse every *.html file under $WWZ_DIR with 'htm8-tool parse-htm8', timing
# the run.  The working directory is restored afterwards.
test-wwz() {
  pushd "$WWZ_DIR"

  # Call the function directly: re-running $REPO_ROOT/$0 only resolves when $0
  # is a path relative to the repo root, and htm8-tool itself uses absolute
  # paths, so it is unaffected by the pushd.
  time find . -name '*.html' | htm8-tool parse-htm8

  popd
}
150
# List every file under the home directory whose name matches *.xml
# (case-insensitive), echoing the list and saving it to _tmp/xml-files.txt.
find-xml() {
  time find ~ -iname '*.xml' \
    | tee _tmp/xml-files.txt
}
154
# Feed the file list in _tmp/xml-files.txt (written by find-xml) to
# 'htm8-tool parse-xml', timing the run.
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid

  # Call the function directly instead of re-running $REPO_ROOT/$0, which only
  # resolves when $0 is a path relative to the repo root.  Redirecting the
  # list also avoids the extra cat process.
  time htm8-tool parse-xml < _tmp/xml-files.txt
}
159
# Parse every *.xml file under the current directory with
# 'htm8-tool parse-xml', pruning any directory named _chroot.
test-repo-xml() {
  # OK these parse

  # Call the function directly instead of re-running $REPO_ROOT/$0, which only
  # resolves when $0 is a path relative to the repo root.
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | htm8-tool parse-xml
}
165
# Parse every *.html file under the current directory with
# 'htm8-tool parse-htm8', timing the run.
test-repo-html() {
  # Call the function directly instead of re-running $REPO_ROOT/$0, which only
  # resolves when $0 is a path relative to the repo root.
  time find . -name '*.html' | htm8-tool parse-htm8
}
169
# Parse every *.html file under _release/VERSION with
# 'htm8-tool parse-htm8', timing the run.
test-docs() {
  # Call the function directly instead of re-running $REPO_ROOT/$0, which only
  # resolves when $0 is a path relative to the repo root.
  time find _release/VERSION -name '*.html' | htm8-tool parse-htm8
}
173
# Task named soil-run: currently it only runs test-docs.
soil-run() { test-docs; }
177
178# OK we have to skip the <script> tag! And <style>
179#
180# document.location = '#' + params.join('&');
181# gUrlHash = new UrlHash(location.hash);
182#
183# I think textarea we don't though?
184
185
# Hand the command line to task-five (sourced from $LIB_OSH/task-five.sh),
# then stop.  A bare 'exit' propagates the status of that last command.
task-five "$@"
exit

# Everything below is unreachable scratch notes.  The text is kept as one
# properly terminated single-quoted string so that 'bash -n' and linters can
# still parse the whole file.

echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt; <!-- This will show as: <p> -->
  &amp; <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
  &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag.
'
225