OILS / data_lang / htm8-test.sh View on Github | oils.pub

153 lines, 49 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# data_lang/htm8-test.sh
5
# Absolute path of the repository root, i.e. the parent of the directory that
# contains this script.  "$0" is quoted so a checkout path containing spaces
# works, and '&&' keeps pwd from reporting the wrong directory if cd fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH is only assigned when it is unset, so callers may override it.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
12
13# parse with lazylex/html.py, or data_lang/htm8.py
14
# Print the paths of git-tracked files whose names end in .html, one per line.
# The listing is relative to the current directory's git checkout.
site-files() {
  # Alternative that walks the deploy tree directly:
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files \
    | grep '\.html$'
}
21
22# Issues with lazylex/html.py
23#
24# - Token ID is annoying to express in Python
25# - re.DOTALL for newlines
26# - can we change that with [.\n]*?
27# - nongreedy match for --> and ?>
28
# Run lazylex/html.py from the repository, forwarding all arguments unchanged.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed through to html.py
# Returns:   exit status of html.py
ht8-tool() {
  # PYTHONPATH covers the repo root and its vendor/ directory.  The command
  # path is quoted so that a REPO_ROOT containing spaces is not word-split.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
33
# Write a small HTML file containing an unterminated comment and bare '&&',
# then run the 'well-formed' subcommand of ht8-tool on it.
#
# Outputs: creates _tmp/bad.html relative to the current directory.
# Returns: exit status of ht8-tool.
test-well-formed() {
  # Create the scratch directory so this also works in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the HTML is written literally, with no shell expansion.
  cat >_tmp/bad.html <<'EOF'
unfinished <!--
hi && bye
EOF

  # The file name, not the file content, is sent on stdin.
  echo '_tmp/bad.html' | ht8-tool well-formed
}
41
42# site errors
43#
44# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
45# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
46# 5833374 tokens in 4710 files
47#
48# The second is the "Woboq" browser, which has CDATA
49# Ah I wonder if we need that.
50
51# Takes ~13 seconds
# Run the 'well-formed' check over the .html files listed by site-files in
# the deployed-site checkout at ../../oilshell/oilshell.org__deploy (relative
# to the current directory).  The pipeline is timed with 'time'.
#
# Globals: REPO_ROOT (read)
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  pushd ../../oilshell/oilshell.org__deploy

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs: the file names are piped to a re-invocation of this
  # script, which runs its ht8-tool task.  The path is quoted so a REPO_ROOT
  # with spaces is not word-split.
  # NOTE(review): this assumes $0 is a path relative to REPO_ROOT - confirm.
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
69
# Local directory used by the *-wwz tasks for the job 8899 files.
WWZ_DIR=_tmp/8899
readonly WWZ_DIR
71
# Copy the github-jobs/8899 directory from op.oilshell.org into WWZ_DIR,
# creating WWZ_DIR first if needed.
#
# Globals: WWZ_DIR (read)
sync-wwz() {
  local remote=op.oilshell.org:op.oilshell.org/uuu/github-jobs/8899/

  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose "$remote" "$WWZ_DIR/"
}
77
# Unzip every *.wwz archive in WWZ_DIR into a sibling directory named after
# the archive without its .wwz suffix, printing each name as it goes.
#
# Globals: WWZ_DIR (read)
# Outputs: directory stack lines from the outer pushd/popd, one line per
#          archive name, and whatever unzip prints.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z name
  for z in *.wwz; do
    # With no archives present the glob stays literal; skip it instead of
    # creating a directory named '*'.
    [[ -e "$z" ]] || continue

    # Strip the suffix (same result as: basename "$z" .wwz).
    name=${z%.wwz}

    # Everything is quoted so archive names with spaces work; the ./ prefix
    # keeps pushd from treating a leading '-' or '+' as an option.
    mkdir -p -- "$name"
    pushd "./$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
93
# Show the contents of WWZ_DIR using the tree command.
#
# Globals: WWZ_DIR (read)
tree-wwz() {
  tree "$WWZ_DIR"
}
97
# Run the 'well-formed' check over every *.html file found under WWZ_DIR.
#
# Globals: REPO_ROOT (read), WWZ_DIR (read)
test-wwz() {
  pushd "$WWZ_DIR"

  # File names are piped to a re-invocation of this script, which runs its
  # ht8-tool task.  The path is quoted so a REPO_ROOT with spaces is not
  # word-split.
  # NOTE(review): this assumes $0 is a path relative to REPO_ROOT - confirm.
  find . -name '*.html' | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
105
106# OK we have to skip the <script> tag! And <style>
107#
108# document.location = '#' + params.join('&');
109# gUrlHash = new UrlHash(location.hash);
110#
# I think we don't have to skip <textarea>, though?
112
113
# Hand the command-line arguments to task-five (from task-five.sh, sourced
# near the top of this file).  test-site and test-wwz depend on this when
# they re-invoke the script as: <script> ht8-tool well-formed
task-five "$@"

# Stop here with the status of task-five; the text after this line is notes
# and is not meant to run.
exit $?
116
117
# Design notes on <script>, <style> and <textarea> handling.  They are kept
# as a single-quoted string after the script has already exited, so they are
# never printed.  The string is closed exactly once so that the file as a
# whole still parses (bash -n, linters).
echo '
In HTML5, instead of
<script>
<![CDATA[
 if (x < y) { ... }
]]>
</script>

You can write

<script>
 if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
 &lt;p&gt; <!-- This will show as: <p> -->
 &amp; <!-- This will show as: & -->
</textarea>

<script>
 &lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
 &amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
153