1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Usage:
|
4 | # data_lang/htm8-test.sh
|
5 |
|
# Absolute path of the repository root: the parent of the directory that
# contains this script.  Quoted so a checkout path with spaces still works;
# '&&' keeps pwd from reporting the wrong directory if the cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be supplied by the environment; the in-tree copy is used only
# when the variable is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
|
12 |
|
13 | # parse with lazylex/html.py, or data_lang/htm8.py
|
14 |
|
# Print the path of every .html file known to git in the current directory,
# one per line.
site-files() {
  # Alternative that walks the file system, kept for reference:
  #   find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Going through git omits all the _ files
  git ls-files \
    | grep '\.html$'
}
|
21 |
|
22 | # Issues with lazylex/html.py
|
23 | #
|
24 | # - Token ID is annoying to express in Python
|
25 | # - re.DOTALL for newlines
|
26 | # - can we change that with [.\n]*?
|
27 | # - nongreedy match for --> and ?>
|
28 |
|
# Run lazylex/html.py with $REPO_ROOT and $REPO_ROOT/vendor on PYTHONPATH.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through unchanged to html.py
ht8-tool() {
  # The command word is quoted so a repo path containing spaces is not split
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
|
33 |
|
# Write a one-line HTML file containing a bare '&&' and pass its path to
# 'ht8-tool well-formed' on stdin.
test-well-formed() {
  # _tmp/ may not exist yet (e.g. fresh checkout); the redirect below would
  # otherwise fail before the tool is ever run
  mkdir -p _tmp

  # Quoted delimiter: the body is literal text, nothing is expanded
  cat >_tmp/bad.html <<'EOF'
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}
|
40 |
|
41 | # site errors
|
42 | #
|
43 | # Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
|
44 | # Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
|
45 | # 5833374 tokens in 4710 files
|
46 | #
|
47 | # The second is the "Woboq" browser, which has CDATA
|
48 | # Ah I wonder if we need that.
|
49 |
|
50 | # Takes ~13 seconds
|
# Feed every .html file listed by site-files, from the deployed site checkout,
# to 'ht8-tool well-formed' and time the run.
#
# Globals: REPO_ROOT (read)
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Relative to the current directory, not to $REPO_ROOT
  pushd ../../oilshell/oilshell.org__deploy

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # Re-invoke this script by path so that 'ht8-tool well-formed' reads the file
  # names on stdin.  Quoted so a repo path containing spaces is not split.
  # NOTE(review): $0 is appended to $REPO_ROOT, so this presumably assumes the
  # script was started with a path relative to the repo root -- confirm.
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
|
68 |
|
# Placeholder: the .wwz test is not written yet, so only a reminder is printed.
test-wwz() {
  printf '%s\n' 'TODO: download .wwz from CI'
}
|
72 |
|
# Hand the command line to task-five (sourced from $LIB_OSH/task-five.sh).
# NOTE(review): presumably it runs the function named by the first argument --
# confirm against task-five.sh.
task-five "$@"
# Stop here so that the notes below are never executed.
exit
|
75 |
|
76 |
|
77 | echo '
|
78 | In HTML5, instead of
|
79 | <script>
|
80 | <![CDATA[
|
81 | if (x < y) { ... }
|
82 | ]]>
|
83 | </script>
|
84 |
|
85 | You can write
|
86 |
|
87 | <script>
|
88 | if (x < y) { ... }
|
89 | </script>
|
90 |
|
91 | <script> <style> <textarea>
|
92 |
|
93 | These have special escaping rules. I guess we just do NOT lex them at all?
|
94 | We can totally SKIP them.
|
95 |
|
96 | CDATA vs. RCDATA
|
97 |
|
<textarea>
&lt;p&gt; <!-- This will show as: <p> -->
&amp; <!-- This will show as: & -->
</textarea>

<script>
&lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
&amp; <!-- This will show literally as: &amp; -->
</script>
|
107 |
|
108 | The main practical difference is that RCDATA processes HTML entities while
|
109 | CDATA treats them as literal text. Both modes ignore HTML tags (treating them
|
as plain text) except for their own closing tag.
'
|
112 |
|