OILS / data_lang / htm8-test.sh View on Github | oils.pub

215 lines, 79 significant
#!/usr/bin/env bash
#
# Usage:
#   data_lang/htm8-test.sh
#
# TODO:
#
# - translate to C++
#   - how to handle the regexes in the lexer?  Port to re2c directly?
#   - for find(), do we need a C++ primitive for it?
#   - no allocation for TagName()
#
# re2c considerations:
# - We need to use CAPTURES, so we can't use frontend/match directly
# - Could we STREAM the lexer?
#   - Instead of the sentinel model, use something else!
#   - default is sentinel with padding, and there is YYFILL with padding
#   - there is also the separate --storable-state option
#   - because this can be used for queries that don't allocate
# - I may also want to do this with JSON
#
# Features:
# - Are there special rules for <svg> and <math>?
# - Do we need to know about <textarea> and <pre>?  Those don't have the same
#   whitespace rules
26
27
# Absolute path of the directory one level above the one holding this script.
# "$0" is quoted so a checkout path containing whitespace works, and '&&'
# keeps 'pwd' from silently reporting the wrong directory when 'cd' fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH is only assigned when the caller has not already set it.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
34
# Parse with lazylex/html.py, or data_lang/htm8.py
36
# Print the *.html paths reported by 'git ls-files' for the current
# directory, one per line.  The exit status is grep's, so it is non-zero when
# no path matches.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
43
# Issues with lazylex/html.py
#
# - Token ID is annoying to express in Python
# - re.DOTALL for newlines
#   - can we change that with [.\n]*?
# - nongreedy match for --> and ?>

# Run lazylex/html.py with the repo root and its vendor/ directory on
# PYTHONPATH.
#
# Arguments: forwarded unchanged to html.py (callers in this file pass an
#            action name such as 'well-formed' or 'parse-htm8').
# Stdin:     passed through; callers in this file pipe file paths, one per line.
htm8-tool() {
  # The program path is quoted so that a REPO_ROOT containing whitespace is
  # not split into several words.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
55
# Write a small fixture (an unterminated '<!--' followed by a bare '&&') to
# _tmp/bad.html and pass its path on stdin to 'htm8-tool well-formed'.
test-well-formed() {
  # _tmp/ is relative to the current directory; create it so the redirect
  # below cannot fail in a fresh tree.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text and must never be subject to
  # shell expansion.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
63
# Site errors
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second is from the "Woboq" code browser, which has CDATA.
# I wonder if we need to support that.
72
# Run htm8-tool over every file listed by site-files in a deployed copy of
# the site.
#
# Arguments:
#   $1 - optional.  Any non-empty value selects ../oils.pub__deploy with the
#        'parse-htm8' action; otherwise ../../oilshell/oilshell.org__deploy is
#        used with the 'lex-htm8' action.
#
# Takes ~13 seconds
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The script re-invokes itself as $REPO_ROOT/$0, which presumes it was
  # started with a path relative to the repo root (see Usage).
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
103
# ID of the job directory under github-jobs/ that the *-wwz tasks mirror and
# test.  Defaults to 8924; set SOIL_ID in the environment to use another job
# without editing this file.
readonly SOIL_ID=${SOIL_ID:-8924}

# Local mirror of that job, relative to the current directory.
readonly WWZ_DIR=_tmp/$SOIL_ID
106
# Copy the files of job $SOIL_ID from op.oilshell.org into $WWZ_DIR with
# rsync, creating $WWZ_DIR first if needed.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
112
# Unzip every *.wwz archive found directly in $WWZ_DIR into a sibling
# directory named after the archive (foo.wwz -> foo/), printing each name.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no archives the glob stays as the literal '*.wwz'; skip it instead
    # of creating and unzipping into a directory derived from the pattern.
    test -e "$z" || continue

    # $z comes from a glob in the current directory, so it has no directory
    # part; stripping the suffix gives the same result as 'basename $z .wwz'.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
128
# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}
132
# Run 'htm8-tool parse-htm8' over every *.html file under $WWZ_DIR.
# The file list is piped on stdin to a fresh invocation of this script.
test-wwz() {
  pushd "$WWZ_DIR"

  # $REPO_ROOT/$0 presumes the script was started with a path relative to the
  # repo root (see Usage).
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
140
# List every *.xml file (case-insensitive) under the home directory, printing
# the paths and saving them to _tmp/xml-files.txt for test-other-xml.
find-xml() {
  # tee cannot create its output file unless _tmp/ already exists.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
144
# Run 'htm8-tool parse-xml' over the paths listed in _tmp/xml-files.txt
# (the file written by find-xml).
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  # The list is redirected straight into the tool; no 'cat' process is needed.
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
149
# Run 'htm8-tool parse-xml' over every *.xml file under the current
# directory, without descending into anything named '_chroot'.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
155
# Run 'htm8-tool parse-htm8' over every *.html file under the current
# directory.
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
159
# Run 'htm8-tool parse-htm8' over every *.html file under _release/VERSION
# (relative to the current directory).
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
163
# Runs test-docs and nothing else; arguments are ignored.
# NOTE(review): presumably the task name invoked by the Soil CI - confirm.
soil-run() {
  test-docs
}
167
# OK we have to skip the <script> tag!  And <style>
#
#   document.location = '#' + params.join('&');
#   gUrlHash = new UrlHash(location.hash);
#
# I don't think we need to skip <textarea>, though?

175
# Hand the command line to task-five (sourced from $LIB_OSH/task-five.sh).
# The self-invocations in this file pass a function name followed by its
# arguments, e.g. 'htm8-tool parse-htm8'.
task-five "$@"

# Stop here, with the status of the command above, so the shell never reaches
# the free-form notes that follow.
exit
178
179
# Design notes.  The 'exit' above means bash never runs this command; it is
# kept as ONE balanced single-quoted string so that the whole file still
# parses (e.g. with 'bash -n').
echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt; <!-- This will show as: <p> -->
  &amp; <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
  &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
215