OILS / data_lang / htm8-test.sh View on Github | oils.pub

228 lines, 75 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7# - Refactor Validate(): take FLAGS, return stats optionally
8# - add LEX_QUOTED_VALUES
9# - and then re-run all the tests
10# - Rename to data_lang/htm8.py
11# - it has NO_SPECIAL_TAGS mode for XML
12#
13# - Soil
14# - Validate all the HTML in the repo - well-formed check
15# - this should go in the CI
16# - Automate some more tests:
17# - site oils.pub, site oilshell.org
#     - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
19#
20# - statically type it
21# - revive pyannotate
22# - translate to C++
23# - what to do about all the regexes? Port to re2c directly?
24# - for find(), do we need a C++ primitive for it?
25# - no allocation for TagName()
26# - ASDL file for Tok.Foo?
27# - refactor TagName() API - remove it from the TagLexer?
28# - that is really the AttrLexer()
29#
30# - build a DOM with objects in YSH?
31# - rewrite ul-table in that?
32#
33# YSH API
34# - Generating HTML/HTM8 is much more common than parsing it
35# - although maybe we can do RemoveComments as a demo?
36# - that is the lowest level "sed" model
37# - For parsing, a minimum idea is:
38# - lexer-based algorithms for query by tag, class name, and id
39# - and then toTree()
40# - .tag and .attrs?
41# - .innerHTML() and .outerHTML() perhaps
42# - and maybe you can mutate it directly
43
# Absolute path of the repository root: the parent of the directory holding
# this script.  $0 is quoted so a checkout path containing spaces resolves,
# and '&&' keeps pwd from reporting the caller's directory if the cd fails.
REPO_ROOT=$(cd "$(dirname -- "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; otherwise use the in-repo stdlib/osh.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
50
51# parse with lazylex/html.py, or data_lang/htm8.py
52
# Print the .html paths reported by 'git ls-files' for the current directory,
# one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}
59
60# Issues with lazylex/html.py
61#
62# - Token ID is annoying to express in Python
63# - re.DOTALL for newlines
64# - can we change that with [.\n]*?
65# - nongreedy match for --> and ?>
66
# Run lazylex/html.py from the repo, forwarding all arguments unchanged.
# PYTHONPATH is set for that one process to the repo root plus vendor/.
# The script path is quoted so a REPO_ROOT containing spaces still works.
htm8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
71
# Write a deliberately malformed document (unterminated comment, bare '&&')
# to _tmp/bad.html and pass its path on stdin to 'htm8-tool well-formed'.
test-well-formed() {
  # Don't rely on an earlier task having created _tmp/.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | htm8-tool well-formed
}
79
80# site errors
81#
82# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
83# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
84# 5833374 tokens in 4710 files
85#
86# The second is the "Woboq" browser, which has CDATA
87# Ah I wonder if we need that.
88
# Takes ~13 seconds
#
# Validate the .html files listed by site-files inside a deployed-site
# checkout located relative to the current directory.
#   $1 - optional; any non-empty value selects ../oils.pub__deploy,
#        otherwise ../../oilshell/oilshell.org__deploy is used.
test-site() {
  local new_site=${1:-}
  local dir  # local, so the chosen path does not leak into the caller

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
  else
    dir='../../oilshell/oilshell.org__deploy'
  fi

  pushd "$dir" || return

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The script is re-invoked through $REPO_ROOT because the working directory
  # has changed; this assumes $0 is a path relative to the repo root, as in
  # the Usage line.
  time site-files | "$REPO_ROOT/$0" htm8-tool validate

  popd
}
115
# Job ID under github-jobs/ on op.oilshell.org whose files sync-wwz mirrors,
# and the local directory (relative to the current directory) they land in.
readonly SOIL_ID=8917
readonly WWZ_DIR="_tmp/$SOIL_ID"
118
# Mirror the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync,
# creating $WWZ_DIR first if needed.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
124
# Unpack every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (foo.wwz -> foo/), printing each name before unzipping it.
extract-wwz() {
  pushd "$WWZ_DIR" || return

  local z name
  for z in *.wwz; do
    # With no archives the glob stays literal; skip it instead of creating a
    # directory from the pattern itself.
    [[ -e "$z" ]] || continue

    # Strip the extension in-shell; z never contains a slash here.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
140
# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}
144
# Validate every *.html file found under $WWZ_DIR.  The file list is piped to
# this script re-invoked as 'htm8-tool validate'; $REPO_ROOT/$0 is used
# because the working directory changes (assumes $0 is relative to the repo
# root, as in the Usage line).
test-wwz() {
  pushd "$WWZ_DIR" || return

  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool validate

  popd
}
152
# List every *.xml file (case-insensitive) under the home directory, echoing
# the paths and saving them to _tmp/xml-files.txt.
find-xml() {
  # tee cannot create the parent directory itself.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
156
# Validate the paths listed in _tmp/xml-files.txt (the file find-xml writes),
# feeding the list on stdin to this script re-invoked as 'htm8-tool validate'.
test-other-xml() {
  # problem with &ent1;
  # CDATA support!  haha OK
  time "$REPO_ROOT/$0" htm8-tool validate <_tmp/xml-files.txt
}
162
# Validate every *.xml file under the current directory, pruning any
# directory named _chroot from the search.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool validate
}
168
# Validate every *.html file found under the current directory.
test-repo-html() {
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool validate
}
172
# Validate every *.html file under _release/VERSION (relative to the current
# directory).
test-docs() {
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool validate
}
176
# Entry point named for Soil (see the TODO list in the file header);
# currently it only runs test-docs.
soil-run() {
  test-docs
}
180
181# OK we have to skip the <script> tag! And <style>
182#
183# document.location = '#' + params.join('&');
184# gUrlHash = new UrlHash(location.hash);
185#
186# I think textarea we don't though?
187
188
# Hand the command line to task-five, which comes from the task-five.sh
# sourced near the top of this file (presumably it runs the function named by
# the first argument - confirm in that file).  The bare exit then ends the
# script with task-five's status, so nothing after it is executed.
task-five "$@"
exit
191
192
# Design notes kept as a string.  The script exits before reaching this
# point, so the echo never runs; the quoting is kept balanced so the whole
# file still passes a syntax check (bash -n).
echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt; <!-- This will show as: <p> -->
  &amp; <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
  &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
228