OILS / data_lang / htm8-test.sh View on Github | oils.pub

213 lines, 79 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7#
8# - translate to C++
9# - how to handle the regexes in the lexer? Port to re2c directly?
10# - for find(), do we need a C++ primitive for it?
11# - no allocation for TagName()
12#
13# re2c considerations:
14# - We need to use CAPTURES, so we can't use frontend/match directly
15# - Could we STREAM the lexer?
16# - Instead of sentinel model, use something else!
17# - default is sentinel with padding, and there is YYFILL with padding
18# - there is also the separate --storable-state option
19# - because this can be used for queries that don't allocate
20# - I may also want to do this with JSON
21#
22# Features:
23# - Are there special rules for <svg> and <math>?
24# - Do we need to know about <textarea> <pre>? Those don't have the same
25# whitespace rules
26
27
# Absolute path of the parent of the directory that contains this script.
# $0 is quoted so a checkout path with spaces survives, and '&&' keeps pwd from
# reporting the caller's directory when the cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps any value already set by the caller; otherwise it defaults to
# stdlib/osh under the repo root.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
34
# Print the paths that 'git ls-files' reports in the current directory and
# that end in .html, one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  local html_suffix='\.html$'

  # omit all the _ files
  git ls-files \
    | grep "$html_suffix"
}
41
42# Issues with lazylex/html.py
43#
44# - Token ID is annoying to express in Python
45# - re.DOTALL for newlines
46# - can we change that with [.\n]*?
47# - nongreedy match for --> and ?>
48
# Run data_lang/htm8_util.py with the repo root and its vendor/ dir on
# PYTHONPATH.
#
# Args:
#   "$@": passed through unchanged, e.g. 'well-formed' or 'parse-htm8'
#
# stdin, stdout and the exit status are those of htm8_util.py.
htm8-tool() {
  # The command path is quoted so a repo path with spaces stays one word
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
53
# Write a small document containing an unterminated comment and bare
# ampersands to _tmp/bad.html, then pass that path on stdin to
# 'htm8-tool well-formed'.
test-well-formed() {
  # Create the scratch dir first so the redirect below cannot fail on a fresh
  # checkout
  mkdir -p _tmp

  # Quoted delimiter: the document is written literally, with no expansion
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
61
62# site errors
63#
64# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
65# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
66# 5833374 tokens in 4710 files
67#
68# The second is the "Woboq" browser, which has CDATA
69# Ah I wonder if we need that.
70
# Run htm8-tool over the *.html files that site-files lists in a deployed site
# checkout.
#
# Args:
#   $1: optional.  Non-empty selects ../oils.pub__deploy with 'parse-htm8';
#       empty or omitted selects ../../oilshell/oilshell.org__deploy with
#       'lex-htm8'.
#
# Takes ~13 seconds
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  #
  # The working directory has changed, so this script is re-invoked through
  # $REPO_ROOT.  Quoted so a repo path with spaces stays a single word.
  # NOTE(review): this only resolves when $0 is a path relative to $REPO_ROOT.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
101
# Name of the github-jobs/ directory on op.oilshell.org that sync-wwz copies
SOIL_ID=8924
# Local directory that receives that copy
WWZ_DIR=_tmp/$SOIL_ID
readonly SOIL_ID WWZ_DIR
104
# Copy the github-jobs/$SOIL_ID/ directory from op.oilshell.org into $WWZ_DIR
# with rsync, creating $WWZ_DIR first.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
110
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each name as it goes.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no archives the pattern stays literal; skip it rather than creating
    # a directory called '*'
    [[ -e "$z" ]] || continue

    # Assigned separately from 'local' so a basename failure isn't masked
    name=$(basename "$z" .wwz)

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
126
# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}
130
# Run 'htm8-tool parse-htm8' on every *.html file found under $WWZ_DIR.
test-wwz() {
  pushd "$WWZ_DIR"

  # The working directory has changed, so this script is re-invoked through
  # $REPO_ROOT.  Quoted so a repo path with spaces stays a single word.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
138
# List every *.xml file (case-insensitive) under the home directory, printing
# the paths and also saving them to _tmp/xml-files.txt.
find-xml() {
  local manifest=_tmp/xml-files.txt

  time find ~ -iname '*.xml' \
    | tee "$manifest"
}
142
# Run 'htm8-tool parse-xml' on the paths listed in _tmp/xml-files.txt (the
# file that find-xml writes).
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  #
  # The list is redirected in directly instead of going through cat, and the
  # script path is quoted so a repo path with spaces stays a single word.
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
147
# Run 'htm8-tool parse-xml' on every *.xml file under the current directory,
# without descending into anything named _chroot.
test-repo-xml() {
  # OK these parse
  #
  # The script path is quoted so a repo path with spaces stays a single word.
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
153
# Run 'htm8-tool parse-htm8' on every *.html file under the current directory.
test-repo-html() {
  # The script path is quoted so a repo path with spaces stays a single word.
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
157
# Run 'htm8-tool parse-htm8' on every *.html file under _release/VERSION.
test-docs() {
  # The script path is quoted so a repo path with spaces stays a single word.
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
161
# Runs test-docs and nothing else.
# NOTE(review): presumably the task that the Soil CI job invokes -- confirm.
soil-run() { test-docs; }
165
166# OK we have to skip the <script> tag! And <style>
167#
168# document.location = '#' + params.join('&');
169# gUrlHash = new UrlHash(location.hash);
170#
171# I think we don't have to skip <textarea>, though?
172
173
# Hand the command line to task-five (sourced from $LIB_OSH/task-five.sh).
# The test-* functions above rely on this when they re-invoke the script as
# '$0 htm8-tool ACTION'.
task-five "$@"

# Stop here with task-five's status; the notes below must never be executed.
exit "$?"
176
177
# Design notes only.  The 'exit' above means this echo never runs; it is kept
# as a correctly terminated string so that 'bash -n' and ShellCheck accept the
# file.
echo '
In HTML5, instead of
<script>
<![CDATA[
 if (x < y) { ... }
]]>
</script>

You can write

<script>
 if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
 &lt;p&gt; <!-- This will show as: <p> -->
 &amp; <!-- This will show as: & -->
</textarea>

<script>
 &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
 &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
213