data_lang/htm8-test.sh

OILS / data_lang / htm8-test.sh View on Github | oils.pub

112 lines, 26 significant

1	#!/usr/bin/env bash
2	#
3	# Usage:
4	# data_lang/htm8-test.sh
5
# Absolute path of the repository root, i.e. the parent of the directory that
# holds this script.  "$0" is quoted so that a checkout path containing spaces
# or glob characters still resolves, and `&&` keeps `pwd` from reporting the
# caller's directory when the `cd` fails.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH may be supplied by the environment; the in-tree default is assigned
# only when it is unset.  The expansion is quoted so the no-op `:` command
# cannot word-split or glob the resulting value.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"
12
13	# parse with lazylex/html.py, or data_lang/htm8.py
14
# Print every git-tracked path ending in .html, one per line.
# Paths come from `git ls-files` run in the current directory.
site-files() {
  # Earlier approach, kept for reference:
  #   find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Listing through git omits all the _ files
  git ls-files | grep -E '[.]html$'
}
21
22	# Issues with lazylex/html.py
23	#
24	# - Token ID is annoying to express in Python
25	# - re.DOTALL for newlines
26	# - can we change that with [.\n]*?
27	# - nongreedy match for --> and ?>
28
# Run this repo's lazylex/html.py, forwarding every argument unchanged.
#
# Globals:   REPO_ROOT (read)
# Arguments: passed straight through to html.py (e.g. `well-formed`)
# Inputs:    stdin is inherited by html.py
# Returns:   the exit status of html.py
ht8-tool() {
  # PYTHONPATH is set for this single command only.  The program path is
  # quoted so that a REPO_ROOT containing spaces is not word-split.
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}
33
# Write a small fixture named _tmp/bad.html whose body contains unescaped
# `&&`, then pass its path on stdin to `ht8-tool well-formed`.
#
# Outputs: whatever `ht8-tool well-formed` prints
# Returns: the exit status of the ht8-tool pipeline
test-well-formed() {
  # _tmp/ is resolved against the current directory and may not exist yet
  # (e.g. on a fresh checkout); without it the redirect below fails and the
  # tool is pointed at a file that was never written.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is written literally, with no expansion.
  cat >_tmp/bad.html <<'EOF'
hi && bye
EOF

  # The file name, not the file contents, is what goes to the tool's stdin.
  echo '_tmp/bad.html' | ht8-tool well-formed
}
40
# Errors found when running over the whole site:
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second file comes from the "Woboq" code browser, whose pages contain CDATA.
# Open question: do we need to support CDATA?
49
# Run `ht8-tool well-formed` over every tracked .html file of the deployed
# site.
# Takes ~13 seconds
#
# Globals:  REPO_ROOT (read)
# Requires: ../../oilshell/oilshell.org__deploy must exist relative to the
#           current directory, and $0 must be this script's path relative to
#           REPO_ROOT (as in the Usage line: data_lang/htm8-test.sh).
# Returns:  non-zero if the site directory cannot be entered; otherwise the
#           status of the final popd.
test-site() {
  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  # Stop here if the site checkout is missing, rather than listing and
  # checking the files of whatever directory we happen to be in.
  pushd ../../oilshell/oilshell.org__deploy || return

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The working directory has changed, so this script is re-invoked through
  # REPO_ROOT; the path is quoted so that spaces in it do not split the
  # command word.
  time site-files | "$REPO_ROOT/$0" ht8-tool well-formed

  popd
}
68
# Placeholder: only prints a reminder that this test is still to be written.
test-wwz() {
  printf '%s\n' 'TODO: download .wwz from CI'
}
72
# Hand the command line to task-five (presumably defined by the sourced
# $LIB_OSH/task-five.sh), then leave with its status.  Nothing below this
# point in the file is ever executed.
task-five "$@"
exit "$?"
75
76
# Design notes on CDATA vs. RCDATA, kept as a string.  This is never run: the
# script has already exited before reaching it.  The text must contain no
# single quotes, and the string must be closed exactly once so the file still
# parses under `bash -n` and ShellCheck.
echo '
In HTML5, instead of
<script>
<![CDATA[
  if (x < y) { ... }
]]>
</script>

You can write

<script>
  if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
  &lt;p&gt;  <!-- This will show as: <p> -->
  &amp;      <!-- This will show as: & -->
</textarea>

<script>
  &lt;p&gt;  <!-- This will show literally as: &lt;p&gt; -->
  &amp;      <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag.
'
112