#!/usr/bin/env bash
#
# Usage:
#   data_lang/htm8-test.sh <function name>
#
# TODO:
# - Rename to DML8?  Because it can handle XML
# - CDATA in XML, which is not a script
#
# Operations / Levels:
#
# - Lexing
#   - lex-tags
#   - lex-attrs - validate all Start tags, all StartEnd tags
#   - lex-quoted-values - unescaping, etc.
#     - are there invalid entities?
# - Parsing
#   - well-formed / tag balance check
# - Schema
#   - not sure if we check the HTML schema or not - it might be too restrictive

# Absolute path of the directory one level above this script's directory.
# '&&' (not ';') so a failed cd yields an empty value instead of silently
# reporting the caller's working directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; it is only assigned when unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"

# parse with lazylex/html.py, or data_lang/htm8.py

# Print the git-tracked files whose names end in .html, one path per line.
# Operates on the repository containing the current directory.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files | grep '\.html$'
}

# Issues with lazylex/html.py
#
# - Token ID is annoying to express in Python
# - re.DOTALL for newlines
#   - can we change that with [.\n]*?
# - nongreedy match for --> and ?>

# Run lazylex/html.py with the repo root and its vendor/ directory on
# PYTHONPATH.
#
# Arguments: forwarded unchanged to html.py (e.g. 'well-formed').
ht8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}

# Write a small malformed document (an unterminated comment followed by a bare
# '&&') to _tmp/bad.html and pass its path on stdin to 'ht8-tool well-formed'.
test-well-formed() {
  mkdir -p _tmp  # don't assume the scratch directory already exists

  # Quoted delimiter: the fixture is literal text and must never be expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}

# site errors
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second is the "Woboq" browser, which has CDATA
# Ah I wonder if we need that.

# Run 'ht8-tool well-formed' over every tracked .html file of a deployed site
# checkout that lives next to this repository.
#
# Arguments:
#   $1 - optional; when non-empty, check ../oils.pub__deploy instead of
#        ../../oilshell/oilshell.org__deploy
#
# Takes ~13 seconds
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir  # kept local so it does not leak into the caller's scope
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
  else
    dir='../../oilshell/oilshell.org__deploy'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # NOTE: "$REPO_ROOT/$0" only resolves when this script was started with a
  # path relative to the repo root, e.g. data_lang/htm8-test.sh
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}

# Name of the directory under github-jobs/ on op.oilshell.org that sync-wwz
# downloads.
readonly SOIL_ID=8915
# Local directory that receives those files.
readonly WWZ_DIR=_tmp/$SOIL_ID

# Copy the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync,
# creating $WWZ_DIR first if needed.
sync-wwz() {
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}

# Unzip every *.wwz archive in $WWZ_DIR into a directory named after the
# archive (foo.wwz -> foo/), echoing each name as it is processed.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # An unmatched glob stays literal; skip it rather than create a '*' dir.
    [[ -e "$z" ]] || continue

    # $z comes from a glob in the current directory, so it has no directory
    # part; stripping the suffix is all that 'basename $z .wwz' did.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}

# Show the contents of $WWZ_DIR with tree(1).
tree-wwz() {
  tree "$WWZ_DIR"
}

# Run 'ht8-tool well-formed' over every *.html file found under $WWZ_DIR.
test-wwz() {
  pushd "$WWZ_DIR"

  # NOTE: "$REPO_ROOT/$0" only resolves when this script was started with a
  # path relative to the repo root, e.g. data_lang/htm8-test.sh
  time find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}

# List every file under the home directory whose name matches *.xml
# (case-insensitive), printing the paths and saving them to
# _tmp/xml-files.txt, which test-other-xml reads.
find-xml() {
  mkdir -p _tmp  # tee fails if the output directory is missing
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}

# Run 'ht8-tool well-formed' over the paths listed in _tmp/xml-files.txt
# (the file written by find-xml).
test-other-xml() {
  # problem with &ent1;
  # CDATA support!  haha OK
  time "$REPO_ROOT/$0" ht8-tool well-formed <_tmp/xml-files.txt
}

# Run 'ht8-tool well-formed' over every *.xml file under the current
# directory, without descending into anything named _chroot.
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" ht8-tool well-formed
}

# OK we have to skip the <script> tag!  And <style>
#
# document.location = '#' + params.join('&');
# gUrlHash = new UrlHash(location.hash);
#
# I think textarea we don't though?


# Hand the command line to task-five (sourced from task-five.sh above;
# presumably it runs the function named by $1 -- confirm there), then exit
# with its status so the scratch notes below are never executed.
task-five "$@"
exit


# Scratch notes on CDATA vs. RCDATA.  Unreachable: the script exits above.
# Kept as one single-quoted string so the HTML needs no escaping.
echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt; <!-- This will show as: <p> -->
  &amp; <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
  &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '