OILS / data_lang / htm8-test.sh View on Github | oils.pub

185 lines, 59 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5#
6# TODO:
7# - Rename to DML8? Because it can handle XML
8# - CDATA in XML, which is not a script
9#
10# Operations / Levels:
11#
12# - Lexing
13# - lex-tags
14# - lex-attrs - validate all Start tags, all StartEnd tags
15# - lex-quoted-values - unescaping, etc.
16# - are there invalid entities?
17# - Parsing
18# - well-formed / tag balance check
19# - Schema
20# - not sure if we check the HTML schema or not - it might be too restrictive
21
# Absolute path of the repository root, i.e. the parent of the directory that
# contains this script.  "$0" is quoted so a checkout path with spaces still
# resolves, and '&&' prevents pwd from reporting an unrelated directory when
# the cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
23
# Special case: we need $REPO_ROOT
#
# LIB_OSH is only assigned when it is unset, so a value exported by the caller
# wins over the in-repo default.  All expansions are quoted so that paths with
# whitespace or glob characters are passed through intact.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
28
29# parse with lazylex/html.py, or data_lang/htm8.py
30
# Print every git-tracked path in the current directory that ends in ".html",
# one per line.
site-files() {
  # Alternative that walks the deploy tree directly:
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Going through git omits all the _ files
  git ls-files \
    | grep -e '[.]html$'
}
37
38# Issues with lazylex/html.py
39#
40# - Token ID is annoying to express in Python
41# - re.DOTALL for newlines
42# - can we change that with [.\n]*?
43# - nongreedy match for --> and ?>
44
# Run lazylex/html.py from the repo with PYTHONPATH pointing at the repo root
# and its vendor/ directory.
#
# Arguments: forwarded unchanged to html.py.
# The command path is quoted so a REPO_ROOT containing spaces still works.
ht8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
49
# Write a small HTML file containing an unterminated comment and a bare '&&',
# then feed its path on stdin to 'ht8-tool well-formed'.
test-well-formed() {
  # The scratch directory may not exist yet in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded by the shell.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}
57
58# site errors
59#
60# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
61# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
62# 5833374 tokens in 4710 files
63#
64# The second is the "Woboq" browser, which has CDATA
65# Ah I wonder if we need that.
66
# Check every tracked .html file of the deployed site checkout.
#
# Expects ../../oilshell/oilshell.org__deploy to exist relative to the current
# directory.  The script is re-invoked as "$REPO_ROOT/$0", which assumes $0 is
# a path relative to the repo root (as in the Usage line at the top).
#
# Takes ~13 seconds
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  pushd ../../oilshell/oilshell.org__deploy

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
85
# Scratch directory (relative to the current directory) used by the *-wwz
# tasks; marked read-only so no task can reassign it.
WWZ_DIR='_tmp/8899'
readonly WWZ_DIR
87
# Mirror the github-jobs/8899 directory from op.oilshell.org into $WWZ_DIR
# using rsync, creating $WWZ_DIR first if needed.
sync-wwz() {
  # Quoted so a WWZ_DIR with spaces stays a single argument.
  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose \
    op.oilshell.org:op.oilshell.org/uuu/github-jobs/8899/ "$WWZ_DIR/"
}
93
# Unzip every *.wwz archive found directly in $WWZ_DIR into a sibling
# directory named after the archive (without the .wwz suffix).
#
# Outputs: the name of each archive as it is processed, plus whatever pushd,
#          popd and unzip print.
extract-wwz() {
  # Keep the loop variables out of the global scope.
  local z name

  pushd "$WWZ_DIR"
  for z in *.wwz; do
    # With no matches the glob stays as the literal '*.wwz'; skip it instead
    # of creating a directory called '*' and running unzip on a missing file.
    [[ -e $z ]] || continue

    # $z has no directory part (the glob is relative to the current
    # directory), so stripping the suffix is all basename would do.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
109
# Show the contents of $WWZ_DIR with the external 'tree' command.
tree-wwz() {
  # Quoted so the directory is always passed as exactly one argument.
  tree "$WWZ_DIR"
}
113
# Check every *.html file below $WWZ_DIR: the paths found by 'find' are piped
# on stdin to this script's 'ht8-tool well-formed' task.
#
# The script is re-invoked as "$REPO_ROOT/$0", which assumes $0 is a path
# relative to the repo root (as in the Usage line at the top).
test-wwz() {
  pushd "$WWZ_DIR"

  time find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
121
# List every file under the home directory whose name ends in .xml
# (case-insensitive).  The list goes to stdout and is also saved in
# _tmp/xml-files.txt.
find-xml() {
  # tee cannot create its output file when the scratch directory is missing.
  mkdir -p _tmp
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}
125
# Check the XML files listed in _tmp/xml-files.txt (one path per line) by
# feeding that list on stdin to this script's 'ht8-tool well-formed' task.
#
# The script is re-invoked as "$REPO_ROOT/$0", which assumes $0 is a path
# relative to the repo root (as in the Usage line at the top).
test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK

  # Redirect instead of 'cat |': a missing list now fails before the tool runs.
  time "$REPO_ROOT/$0" ht8-tool well-formed <_tmp/xml-files.txt
}
131
# Check every *.xml file below the current directory, without descending into
# anything named _chroot.  The paths are piped on stdin to this script's
# 'ht8-tool well-formed' task.
#
# The script is re-invoked as "$REPO_ROOT/$0", which assumes $0 is a path
# relative to the repo root (as in the Usage line at the top).
test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" ht8-tool well-formed
}
137
138# OK we have to skip the <script> tag! And <style>
139#
140# document.location = '#' + params.join('&');
141# gUrlHash = new UrlHash(location.hash);
142#
# I don't think we have to skip <textarea>, though?
144
145
# Hand the command line to task-five (not defined in this file; presumably it
# comes from the task-five.sh sourced above -- confirm there), then stop with
# its exit status.  Everything after the exit is never executed; it only
# holds notes.
task-five "$@"
exit "$?"
148
149
# Notes only: this echo sits after the unconditional exit and never runs.
# The string is closed exactly once so the whole file still parses
# (e.g. under 'bash -n').
echo '
In HTML5, instead of
<script>
<![CDATA[
 if (x < y) { ... }
]]>
</script>

You can write

<script>
 if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
 &lt;p&gt; <!-- This will show as: <p> -->
 &amp; <!-- This will show as: & -->
</textarea>

<script>
 &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
 &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
185