1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Usage:
|
4 | # data_lang/htm8-test.sh
|
5 | #
|
6 | # TODO:
|
7 | # - Rename to data_lang/htm8.py
|
8 | # - it has NO_SPECIAL_TAGS mode for XML
|
9 | # - put iterators at a higher level in doctools/ ?
|
10 | #
|
11 | # - statically type it
|
12 | # - revive pyannotate
|
13 | # - translate to C++
|
14 | # - how to handle the regexes in the lexer? Port to re2c directly?
|
15 | # - for find(), do we need a C++ primitive for it?
|
16 | # - no allocation for TagName()
|
17 | # - ASDL file for Tok.Foo?
|
18 | # - refactor TagName() API - remove it from the TagLexer?
|
19 | # - that is really the AttrLexer()
|
20 | #
|
21 | # Not working yet:
|
22 | # - understanding all entities &zz;
|
23 | # - there are over 2000 of them, not sure I want to build them all into the Oils binaries
|
24 | # - capital letters <TR/> - I guess we can normalize the case
|
25 | #
|
26 | # Leniency:
|
27 | # - foo=1&bar=2 is extremely common
|
#   - well then does that mean you allow <p>a & b</p> too?
|
29 | # - and then it's not far from that to <p id="value >"> - the quotes help
|
30 | # - I guess you can have a rule for unescaped &, just like unescaped backslash
|
31 | # - you can warn about it, but it doesn't cause much problem?
|
32 | # We are already firmly in HTML territory, not in XML ...
|
33 | #
|
34 | # Features:
|
35 | # - Are there special rules for <svg> and <math>?
|
36 | # - Do we need to know about <textarea> <pre>? Those don't have the same
|
37 | # whitespace rules
|
38 | #
|
39 | # YSH API
|
40 | # - Generating HTML/HTM8 is much more common than parsing it
|
41 | # - although maybe we can do RemoveComments as a demo?
|
42 | # - that is the lowest level "sed" model
|
43 | # - For parsing, a minimum idea is:
|
44 | # - lexer-based algorithms for query by tag, class name, and id
|
45 | # - and then toTree() - this is a DOM
|
46 | # - .tag and .attrs?
|
47 | # - .innerHTML() and .outerHTML() perhaps
|
48 | # - rewrite ul-table in that?
|
49 | # - does that mean you mutate it, or construct text?
|
50 | # - I think you can set the innerHTML probably
|
51 | #
|
52 | # - Testing of html.ysh aka htm8.ysh in the stdlib
|
53 | #
|
54 | # Cases:
|
55 | # html 'hello <b>world</b>'
|
56 | # html "hello <b>$name</b>"html
|
57 | # html ["hello <b>$name</b>"] # hm this isn't bad, it's an unevaluated expression?
|
58 | # commonmark 'hello **world**'
|
59 | # md 'hello **world**'
|
60 | # md ['hello **$escape**'] ? We don't have a good escaping algorithm
|
61 |
|
62 |
|
# Absolute path of the parent of the directory containing this script.
# Quoted so that a checkout path containing spaces still works; && so that a
# failed cd does not silently yield the current directory instead.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be preset by the caller; otherwise it defaults to stdlib/osh
# under the repo root.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
# NOTE(review): task-five is called at the bottom of this file but not defined
# here; presumably it comes from task-five.sh - confirm.
source "$LIB_OSH/task-five.sh"
|
69 |
|
70 | # parse with lazylex/html.py, or data_lang/htm8.py
|
71 |
|
site-files() {
  # Print the paths reported by 'git ls-files' (run in the current directory)
  # that end in .html, one per line.

  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Going through git rather than find omits all the _ files
  git ls-files \
    | grep '\.html$'
}
|
78 |
|
79 | # Issues with lazylex/html.py
|
80 | #
|
81 | # - Token ID is annoying to express in Python
|
82 | # - re.DOTALL for newlines
|
83 | # - can we change that with [.\n]*?
|
84 | # - nongreedy match for --> and ?>
|
85 |
|
htm8-tool() {
  # Run lazylex/html.py from the repo, passing all arguments through.
  #
  # PYTHONPATH is set for this one command only: the repo root plus vendor/.
  # The program path is quoted so a repo path containing spaces is not split.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
|
90 |
|
test-well-formed() {
  # Write a small fixture document and run 'htm8-tool well-formed' on it.
  local bad_html='_tmp/bad.html'

  # Fixture: a comment opener that is never closed, then bare ampersands
  cat >"$bad_html" <<'EOF'
unfinished <!--
hi && bye
EOF

  # The file's path (not its contents) is what goes to the tool's stdin
  printf '%s\n' "$bad_html" | htm8-tool well-formed
}
|
98 |
|
99 | # site errors
|
100 | #
|
101 | # Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
|
102 | # Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
|
103 | # 5833374 tokens in 4710 files
|
104 | #
|
105 | # The second is the "Woboq" browser, which has CDATA
|
106 | # Ah I wonder if we need that.
|
107 |
|
108 | # Takes ~13 seconds
|
test-site() {
  # Run htm8-tool over the .html files listed by site-files in a site checkout.
  #
  # Args:
  #   $1: optional.  When non-empty, use ../oils.pub__deploy with the
  #       parse-htm8 action; otherwise use
  #       ../../oilshell/oilshell.org__deploy with the lex-htm8 action.
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if test -n "$new_site"; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  # site-files lists paths relative to the current directory, so run from the
  # site checkout
  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs: the file list goes to the tool on stdin.
  # The working directory has changed, so this script is re-invoked through
  # $REPO_ROOT.  NOTE(review): that only resolves when $0 is a path relative to
  # the repo root, as in the Usage line - confirm for other invocations.
  time site-files | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
|
138 |
|
# Job number used in the remote github-jobs/ path by sync-wwz
SOIL_ID=8924
# Local directory that the *-wwz functions work in
WWZ_DIR=_tmp/$SOIL_ID
readonly SOIL_ID WWZ_DIR
|
141 |
|
sync-wwz() {
  # Copy the job directory for $SOIL_ID from op.oilshell.org into $WWZ_DIR,
  # creating $WWZ_DIR first if needed.
  mkdir -p "$WWZ_DIR"

  # Trailing slashes on both sides: sync the contents of the remote directory
  # into $WWZ_DIR rather than creating a nested directory.
  rsync --archive --verbose \
    "op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/" "$WWZ_DIR/"
}
|
147 |
|
extract-wwz() {
  # Unzip every *.wwz archive in $WWZ_DIR into a directory next to it, named
  # after the archive without its .wwz suffix.  Prints each name as it goes.
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # When nothing matches, the glob is left as the literal string '*.wwz';
    # skip it instead of creating a directory named '*'.
    [[ -e "$z" ]] || continue

    # z comes from a glob in the current directory, so it has no directory
    # part; stripping the suffix is all that 'basename $z .wwz' did.
    name=${z%.wwz}

    mkdir -p "$name"
    pushd "$name" >/dev/null

    echo "$name"
    unzip "../$z"

    popd >/dev/null
  done
  popd
}
|
163 |
|
tree-wwz() {
  # Show $WWZ_DIR with the 'tree' command.
  # Quoted so the directory is always passed as a single argument.
  tree "$WWZ_DIR"
}
|
167 |
|
test-wwz() {
  # Run 'htm8-tool parse-htm8' on every *.html file found under $WWZ_DIR.
  pushd "$WWZ_DIR"

  # The list of paths goes to the tool on stdin.  The working directory has
  # changed, so this script is re-invoked through $REPO_ROOT (quoted so a repo
  # path containing spaces is not split).
  time find . -name '*.html' | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
|
175 |
|
find-xml() {
  # List files under the home directory whose names match *.xml, ignoring
  # case.  The list is printed and also saved to _tmp/xml-files.txt.
  time find ~ -iname '*.xml' \
    | tee _tmp/xml-files.txt
}
|
179 |
|
test-other-xml() {
  # Run 'htm8-tool parse-xml' on the files listed in _tmp/xml-files.txt (the
  # file that find-xml writes).

  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid

  # The list is redirected straight into the tool's stdin; no 'cat' needed.
  time "$REPO_ROOT/$0" htm8-tool parse-xml < _tmp/xml-files.txt
}
|
184 |
|
test-repo-xml() {
  # Run 'htm8-tool parse-xml' on every *.xml file under the current directory,
  # without descending into anything named _chroot.

  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
|
190 |
|
test-repo-html() {
  # Run 'htm8-tool parse-htm8' on every *.html file under the current
  # directory.  The paths go to the tool on stdin.
  time find . -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
|
194 |
|
test-docs() {
  # Run 'htm8-tool parse-htm8' on every *.html file under _release/VERSION.
  # The paths go to the tool on stdin.
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
|
198 |
|
# Runs test-docs and nothing else.
# NOTE(review): presumably the entry point invoked by the Soil CI - confirm.
soil-run() { test-docs; }
|
202 |
|
203 | # OK we have to skip the <script> tag! And <style>
|
204 | #
|
205 | # document.location = '#' + params.join('&');
|
206 | # gUrlHash = new UrlHash(location.hash);
|
207 | #
|
# I think we don't have to skip <textarea>, though?
|
209 |
|
210 |
|
# Hand the command line to task-five.
# NOTE(review): task-five is not defined in this file; presumably it comes from
# the sourced task-five.sh and dispatches to the function named by $1 - confirm.
task-five "$@"

# Stop with task-five's status.  Everything below is design notes kept as an
# unreachable echo; it is never executed.
exit


echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt;  <!-- This will show as: <p> -->
  &amp;      <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt;  <!-- This will show literally as: &lt;p&gt; -->
  &amp;      <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag.
'
|
250 |
|