1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Usage:
|
4 | # data_lang/htm8-test.sh
|
5 | #
|
6 | # TODO:
|
7 | # - Rename to DML8? Because it can handle XML
|
8 | # - CDATA in XML, which is not a script
|
9 | #
|
10 | # Operations / Levels:
|
11 | #
|
12 | # - Lexing
|
13 | # - lex-tags
|
14 | # - lex-attrs - validate all Start tags, all StartEnd tags
|
15 | # - lex-quoted-values - unescaping, etc.
|
16 | # - are there invalid entities?
|
17 | # - Parsing
|
18 | # - well-formed / tag balance check
|
19 | # - Schema
|
20 | # - not sure if we check the HTML schema or not - it might be too restrictive
|
21 |
|
# Absolute path of the repository root, i.e. the parent of the directory that
# holds this script.  "$0" is quoted so a checkout path containing whitespace
# still resolves, and `&&` keeps `pwd` from silently reporting the caller's
# directory when the `cd` fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
|
23 |
|
# Special case: the default library location is derived from $REPO_ROOT, so it
# has to be resolved here.  A LIB_OSH that is already set (even to the empty
# string) is left untouched.  Everything is quoted so that a path containing
# whitespace is not word-split.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
|
28 |
|
29 | # parse with lazylex/html.py, or data_lang/htm8.py
|
30 |
|
# Print the path of every git-tracked file whose name ends in .html, one per
# line, relative to the current directory's repository.
site-files() {
  # Alternative that walks the deploy tree directly:
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Going through git omits all the _ files
  git ls-files \
    | grep '\.html$'
}
|
37 |
|
38 | # Issues with lazylex/html.py
|
39 | #
|
40 | # - Token ID is annoying to express in Python
|
41 | # - re.DOTALL for newlines
|
42 | # - can we change that with [.\n]*?
|
43 | # - nongreedy match for --> and ?>
|
44 |
|
# Run lazylex/html.py from the repo with PYTHONPATH set to the repo root plus
# its vendor/ directory.
#
# Arguments: "$@" is forwarded unchanged (e.g. `well-formed`).
# Stdin, stdout and the exit status are those of html.py.
ht8-tool() {
  # The script path is quoted so a checkout path containing whitespace is not
  # word-split into several words.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
|
49 |
|
# Write a small malformed document (an unterminated comment followed by a bare
# '&&') to _tmp/bad.html, then pass that path on stdin to
# `ht8-tool well-formed`.
test-well-formed() {
  # The scratch directory may not exist yet on a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the document is literal text, nothing should be expanded.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}
|
57 |
|
58 | # site errors
|
59 | #
|
60 | # Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
|
61 | # Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
|
62 | # 5833374 tokens in 4710 files
|
63 | #
|
64 | # The second is the "Woboq" browser, which has CDATA
|
65 | # Ah I wonder if we need that.
|
66 |
|
# Check every git-tracked *.html file of the deployed site with
# `ht8-tool well-formed`.  The site is expected in a sibling checkout at
# ../../oilshell/oilshell.org__deploy, relative to the current directory.
#
# Takes ~13 seconds
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Stop here if the checkout is missing, rather than scanning the wrong tree.
  pushd ../../oilshell/oilshell.org__deploy || return

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs: the file list is streamed on stdin.  ht8-tool is called
  # directly instead of re-running this script as $REPO_ROOT/$0, which only
  # resolved when $0 was a path relative to the repo root without whitespace.
  time site-files | ht8-tool well-formed

  popd
}
|
85 |
|
# Scratch directory (relative to the current directory) used by the *-wwz
# tasks in this file.
WWZ_DIR=_tmp/8899
readonly WWZ_DIR
|
87 |
|
# Mirror the remote github-jobs/8899/ directory on op.oilshell.org into
# $WWZ_DIR, creating the local directory first.
sync-wwz() {
  local -a rsync_args=(
    --archive
    --verbose
    op.oilshell.org:op.oilshell.org/uuu/github-jobs/8899/
    "$WWZ_DIR/"
  )

  mkdir -p "$WWZ_DIR"
  rsync "${rsync_args[@]}"
}
|
93 |
|
# Unpack every *.wwz archive found directly in $WWZ_DIR into a directory named
# after the archive (foo.wwz -> foo/), printing each name before unzipping.
# Does nothing when there are no archives.
extract-wwz() {
  # Keep the loop variables out of the global namespace.
  local z name

  pushd "$WWZ_DIR" || return
  for z in *.wwz; do
    # With no matches the pattern stays literal; skip it instead of creating
    # a directory called '*'.
    [[ -e "$z" ]] || continue

    # $z has no directory part, so stripping the suffix is all that is needed.
    name=${z%.wwz}

    mkdir -p -- "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
|
109 |
|
# Show the directory layout under $WWZ_DIR using tree(1).
tree-wwz() {
  local root=$WWZ_DIR
  tree "$root"
}
|
113 |
|
# Check every *.html file under $WWZ_DIR with `ht8-tool well-formed`.
# Paths are passed on stdin, relative to $WWZ_DIR.
test-wwz() {
  # Stop here if the directory is missing, rather than scanning the wrong tree.
  pushd "$WWZ_DIR" || return

  # ht8-tool is called directly instead of re-running this script as
  # $REPO_ROOT/$0, which only resolved when $0 was a path relative to the repo
  # root without whitespace.
  time find . -name '*.html' | ht8-tool well-formed

  popd
}
|
121 |
|
# List every path under the home directory whose name ends in .xml
# (case-insensitively), printing the list and also saving it to
# _tmp/xml-files.txt.
find-xml() {
  local list=_tmp/xml-files.txt

  time find ~ -iname '*.xml' \
    | tee "$list"
}
|
125 |
|
# Check the files listed in _tmp/xml-files.txt (one path per line) with
# `ht8-tool well-formed`.
test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK

  # The list is redirected straight into ht8-tool.  Calling the function
  # directly replaces re-running this script as $REPO_ROOT/$0, which only
  # resolved when $0 was a path relative to the repo root without whitespace.
  time ht8-tool well-formed <_tmp/xml-files.txt
}
|
131 |
|
# Check every *.xml file under the current directory with
# `ht8-tool well-formed`, without descending into anything named _chroot.
test-repo-xml() {
  # OK these parse

  # ht8-tool is called directly instead of re-running this script as
  # $REPO_ROOT/$0, which only resolved when $0 was a path relative to the repo
  # root without whitespace.
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | ht8-tool well-formed
}
|
137 |
|
138 | # OK we have to skip the <script> tag! And <style>
|
139 | #
|
140 | # document.location = '#' + params.join('&');
|
141 | # gUrlHash = new UrlHash(location.hash);
|
142 | #
|
# I don't think we need to skip <textarea>, though?
|
144 |
|
145 |
|
# Entry point: hand the command line to task-five (presumably defined by the
# sourced $LIB_OSH/task-five.sh) and stop with its status.  Nothing after
# this point in the file is executed.
task-five "$@"
exit "$?"
|
148 |
|
149 |
|
# Design notes on CDATA vs. RCDATA.  They sit after the `exit` above, so they
# are never executed.  The string is closed exactly once, so the file still
# parses (e.g. under `bash -n`).
#
# NOTE(review): the <textarea>/<script> examples presumably meant
# entity-escaped input (e.g. &lt;p&gt; and &amp;) - confirm against the
# author's intent before relying on them.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
<p> <!-- This will show as: <p> -->
& <!-- This will show as: & -->
</textarea>

<script>
<p> <!-- This will show literally as: <p> -->
& <!-- This will show literally as: & -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
|
185 |
|