OILS / data_lang / htm8-test.sh View on Github | oils.pub

112 lines, 26 significant
#!/usr/bin/env bash
#
# Usage:
#   data_lang/htm8-test.sh

# Absolute path of the directory one level above this script's directory.
# "$0" is quoted so a checkout path containing spaces or glob characters
# survives, and '&&' keeps a failed 'cd' from silently reporting the caller's
# working directory instead.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH is only assigned when it is unset, so the environment can override
# where the two libraries below are loaded from.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
12
13# parse with lazylex/html.py, or data_lang/htm8.py
14
site-files() {
  # Print, one per line, every path reported by 'git ls-files' that ends in
  # '.html'.  Must be run from inside the git checkout to be listed.
  #
  # Alternative that was tried:
  #   find ../../oilshell/oilshell.org__deploy -name '*.html'

  # omit all the _ files
  git ls-files \
    | grep '[.]html$'
}
21
22# Issues with lazylex/html.py
23#
24# - Token ID is annoying to express in Python
25# - re.DOTALL for newlines
26# - can we change that with [.\n]*?
27# - nongreedy match for --> and ?>
28
ht8-tool() {
  # Run $REPO_ROOT/lazylex/html.py, forwarding all arguments unchanged.
  #
  # PYTHONPATH is set for this one command only, to the repo root plus its
  # vendor/ directory.  The program path is quoted so that a REPO_ROOT
  # containing whitespace is not split into several words.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
33
test-well-formed() {
  # Write a one-line fixture containing bare '&&' to _tmp/bad.html, then feed
  # that file name on stdin to 'ht8-tool well-formed'.

  # The fixture directory is relative to the current directory; create it so
  # the redirect below does not fail in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is written literally, with no expansion.
  cat >_tmp/bad.html <<'EOF'
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}
40
41# site errors
42#
43# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
44# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
45# 5833374 tokens in 4710 files
46#
47# The second is the "Woboq" browser, which has CDATA
48# Ah I wonder if we need that.
49
50# Takes ~13 seconds
test-site() {
  # Run 'ht8-tool well-formed' over every tracked .html file of the site
  # checkout at ../../oilshell/oilshell.org__deploy (relative to the current
  # directory), and time the whole pipeline.
  #
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Path used to re-invoke this script after the directory change below.
  # A relative $0 is anchored at $REPO_ROOT (i.e. the script is assumed to
  # have been started from the repo root); an absolute $0 is used as is,
  # since prefixing it with $REPO_ROOT would name a nonexistent file.
  local this_script
  case $0 in
    /*) this_script=$0 ;;
    *)  this_script=$REPO_ROOT/$0 ;;
  esac

  pushd ../../oilshell/oilshell.org__deploy

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The file names go to the tool on stdin; the script path is quoted so a
  # checkout path with spaces is passed as a single word.
  time site-files | "$this_script" ht8-tool well-formed

  popd
}
68
test-wwz() {
  # Placeholder: only prints a reminder of the work still to be done.
  printf '%s\n' 'TODO: download .wwz from CI'
}
72
# Hand the command-line arguments to task-five (loaded from
# $LIB_OSH/task-five.sh above).  Presumably it runs the function named by the
# first argument -- confirm in task-five.sh.
task-five "$@"
# Stop here with the status of the command above; nothing below is executed.
exit


# Design notes.  These were previously held in an unreachable echo '...'
# followed by a stray unterminated quote, which made the file fail 'bash -n'.
# They are kept as comments with the text unchanged.
#
# In HTML5, instead of
# <script>
# <![CDATA[
#   if (x < y) { ... }
# ]]>
# </script>
#
# You can write
#
# <script>
#   if (x < y) { ... }
# </script>
#
# <script> <style> <textarea>
#
# These have special escaping rules.  I guess we just do NOT lex them at all?
# We can totally SKIP them.
#
# CDATA vs. RCDATA
#
# <textarea>
#   &lt;p&gt;  <!-- This will show as: <p> -->
#   &amp;      <!-- This will show as: & -->
# </textarea>
#
# <script>
#   &lt;p&gt;  <!-- This will show literally as: &lt;p&gt; -->
#   &amp;      <!-- This will show literally as: &amp; -->
# </script>
#
# The main practical difference is that RCDATA processes HTML entities while
# CDATA treats them as literal text.  Both modes ignore HTML tags (treating them
# as plain text) except for their own closing tag.
112