spec/ysh-regex-bytes-chars.test.sh

OILS / spec / ysh-regex-bytes-chars.test.sh View on Github | oils.pub

281 lines, 116 significant

1	## oils_failures_allowed: 1
2
3	#### Match tab character with [\t]
4	shopt -s ysh:all
5	# Show the string form of the eggex, then pass that same pattern to egrep.
6	var pat = / ('a' [\t] 'b') /
7	pp test_ (str(pat))
8	# Only 'aa\tbb' has a, TAB, b in a row; od -t c makes the matched tab visible.
9	var lines = :| b'aa\tbb' b'cc\tdd' |
10	write @lines | egrep $pat | od -A n -t c
11
12	## STDOUT:
13	(Str) "(a[\t]b)"
14	a a \t b b \n
15	## END
16
17	#### Match newline with [\n]
18	shopt -s ysh:all
19
20	var pat = / [\n] /
21
22	pp test_ (str(pat))
23
24	pp test_ ('z' ~ pat)
25
26	# A string holding a single newline does match the class
27	pp test_ (b'\n' ~ pat)
28
29	# Handing the same pattern to grep is another story: it apparently rejects it
30	# as an "invalid regular expression", so the pipeline below stays disabled.
31
32	# write 1 2 3 | egrep "$pat"
33
34	## STDOUT:
35	(Str) "[\n]"
36	(Bool) false
37	(Bool) true
38	## END
39
40	#### ERE: 'dot' matches newline
41	# Per the expected output: dot matches 'z' and a lone newline, but not the empty string.
42	= 'z' ~ /dot/
43	= b'\n' ~ /dot/
44	= '' ~ /dot/
45
46	## STDOUT:
47	(Bool) true
48	(Bool) true
49	(Bool) false
50	## END
51
52	#### ERE: 'dot' matches code point represented with multiple bytes (mu = 0xce 0xbc)
53
54	var pat = / 'a' dot 'b' /
55
56	pp test_ ('axb' ~ pat )
57	# mu (U+3BC): first spelled as its raw UTF-8 bytes, then as a code point escape
58	pp test_ (b'a\yce\ybcb' ~ pat )
59	pp test_ (b'a\u{3bc}b' ~ pat )
60	# Negative cases: nothing, or two characters, between 'a' and 'b'
61	pp test_ ('ab' ~ pat )
62	pp test_ ('aZZb' ~ pat )
63
64	## STDOUT:
65	(Bool) true
66	(Bool) true
67	(Bool) true
68	(Bool) false
69	(Bool) false
70	## END
71
72	#### $'\xff' is disallowed in Eggex, because it's disallowed in YSH
73	# The expression below is expected to be rejected: status 2 and empty stdout.
74	# NOTE: This pattern doesn't work under en_US.UTF-8.  To match a raw 0xff byte,
75	# I think the user should set LANG=C or shopt --unset libc_utf8.
76
77	shopt -s ysh:all
78	= /[ $'\xff' ]/;
79
80	## status: 2
81	## STDOUT:
82	## END
83
84	#### Match low ASCII with [\x01]
85	shopt -s ysh:all
86
87	pp test_ ('a' ~ / [a] /)
88	echo
89	# The class [\x01] should match byte 0x01 but not byte 0x02
90	pp test_ (b'\y01' ~ / [\x01] /)
91	pp test_ (b'\y02' ~ / [\x01] /)
92
93	# \y01 isn't accepted inside the class here, because we punt \x01 translation to ERE ...
94	#pp test_ (b'\y02' ~ / [\y01] /)
95	# NOTE(review): a later case in this file matches with / [\y01] /, so the remark above may be stale.
96	# When stringified, this pattern is printed with \u{1}
97	# = str( / [\x01] / )
98
99	## STDOUT:
100	(Bool) true
101
102	(Bool) true
103	(Bool) false
104	## END
105
106	#### Match low ASCII with \u{7f} - translates to valid ERE
107	shopt -s ysh:all
108	var pat = /[ \u{7f} ]/;
109	# Hex-dump the translated ERE: expect '[' (5b), DEL (7f), ']' (5d), newline (0a)
110	echo $pat | od -A n -t x1
111	if (b'\y7f' ~ pat) { echo yes } else { echo no }
112	if (b'\y7e' ~ pat) { echo yes } else { echo no }
113	# The same code point written with leading zeros must translate identically
114	var pat2 = /[ \u{7f} ]/;
115	var pat3 = /[ \u{0007f} ]/;
116	test "$pat2" = "$pat3" && echo 'equal'
117	# A range that stays within ASCII: 0x70 is inside it, 0x69 is just below it
118	var range = / [ \u{70} - \u{7f} ] /
119	if (b'\y70' ~ range) { echo yes } else { echo no }
120	if (b'\y69' ~ range) { echo yes } else { echo no }
121
122	## STDOUT:
123	5b 7f 5d 0a
124	yes
125	no
126	equal
127	yes
128	no
129	## END
130
131	#### non-ASCII bytes must be singleton terms, e.g. b'\y7f\yff' is disallowed
132	var bytes = b'\y7f\yff'
133	var pat = / [ @bytes ] /
134	echo $pat
135	## status: 1
136	## stdout-json: ""
137
138	#### Bytes are denoted \y01 in Eggex char classes (not \x01)
139
140	# That is, eggex does NOT have MODES like re.UNICODE (NOTE(review): "NOT" inferred from the lines below - confirm)
141	#
142	# We UNAMBIGUOUSLY accept
143	# - \y01 or \u{1} - these are the same (a byte below 0x80 is its own code point)
144	# - \yff or \u{ff} - these are DIFFERENT (the single byte 0xff vs. the code point U+FF)
145
146	var pat = / [\y01] /
147	pp test_ (b'\y01' ~ pat)
148	pp test_ ('a' ~ pat)
149
150	## STDOUT:
151	(Bool) true
152	(Bool) false
153	## END
154
155	#### NUL byte can be expressed in Eggex, but not in ERE
156	# Each snippet runs in a child $SH.  Per the expected output, the 0x01 match prints true and the child then exits with status 1.
157	$SH <<'EOF'
158	pp test_ (b'\y01' ~ / [\y01] /)
159	pp test_ (b'\y00' ~ / [\y00] /)
160	EOF
161	echo status=$?
162	# Same check, with the code point escapes \u{1} and \u{0}
163	$SH <<'EOF'
164	pp test_ (b'\y01' ~ / [\u{1}] /)
165	pp test_ (b'\y00' ~ / [\u{0}] /)
166	EOF
167	echo status=$?
168
169
170	# Same check, with the legacy synonym spelling \x01 and \x00
171
172	$SH <<'EOF'
173	pp test_ (b'\y01' ~ / [\x01] /)
174	pp test_ (b'\y00' ~ / [\x00] /)
175	EOF
176	echo status=$?
177
178	## STDOUT:
179	(Bool) true
180	status=1
181	(Bool) true
182	status=1
183	(Bool) true
184	status=1
185	## END
186
187	#### High bytes 0x80 0xff usually can't be matched - Eggex is UTF-8
188	shopt -s ysh:all
189
190	# ASCII works: byte 0x7f matches [\x7f], byte 0x7e does not
191	pp test_ (b'\y7f' ~ / [\x7f] /)
192	pp test_ (b'\y7e' ~ / [\x7f] /)
193	# A class holding the high byte 0x80 is expected to fail here (status 1, no further stdout)
194	= str( / [\y80]/ )
195
196	#pp test_ (b'\y80' ~ / [\y80] /)
197	#pp test_ (b'\yff' ~ / [\yff] /)
198
199	## status: 1
200	## STDOUT:
201	(Bool) true
202	(Bool) false
203	## END
204
205	#### High bytes 0x80 0xff can be matched with plain ERE and LC_ALL=C
206	# The child shell inherits the C locale; the pattern is a plain ERE string, not an eggex.
207	export LC_ALL=C
208
209	$SH <<'EOF'
210	var yes = b'foo \yff'
211	var no = b'foo'
212
213	# POSIX ERE string
214	var ere = b'[\yff]'
215
216	pp test_ (yes ~ ere)
217	pp test_ (no ~ ere)
218	EOF
219
220	## STDOUT:
221	(Bool) true
222	(Bool) false
223	## END
224
225	#### Code points like \u{3bc} can be matched
226	# First U+3BC, then the largest code point U+10FFFF; 'a' is the negative control in both groups.
227	var pat = / [\u{3bc}] /
228	pp test_ (b'a' ~ pat)
229	pp test_ (b'\u{3bc}' ~ / [\u{3bc}] /)
230	echo
231
232	var pat = / [\u{10ffff}] /
233	pp test_ (b'a' ~ pat)
234	pp test_ (b'\u{10ffff}' ~ pat)
235
236	#echo "-$pat-"
237
238	## STDOUT:
239	(Bool) false
240	(Bool) true
241
242	(Bool) false
243	(Bool) true
244	## END
245
246	#### Code point ranges work in limited cases
247	shopt -s ysh:all
248	# An all-ASCII range: 0x7f lies just past the upper bound, 0x7e is the bound itself
249	var range1 = /[ \u{1} - \u{7e} ]/;
250
251	pp test_ (u'\u{7f}' ~ range1)
252	pp test_ (u'\u{7e}' ~ range1)
253	# Stop here, so the range below (which ends beyond ASCII) never runs
254	exit
255
256	# Fails with "Invalid collation character"?  Unicode ranges don't seem to work
257	var range2 = /[ \u{1} - \u{3bc} ]/;
258
259	pp test_ (b'\y7f' ~ range2)
260	pp test_ (b'\y7e' ~ range2)
261
262	## STDOUT:
263	(Bool) false
264	(Bool) true
265	## END
266
267
268	#### Max code point is disallowed at parse time
269	# \u{10ffff} is the largest code point; \u{110000} is one past it.
270	pp test_ (/ [\u{10ffff}] /)
271	pp test_ (/ [\u{110000}] /)
272	# NOTE(review): the STDOUT below is identical to the '\u{3bc}' case's and doesn't look like two printed patterns; presumably a placeholder covered by oils_failures_allowed - confirm.
273	## STDOUT:
274	(Bool) false
275	(Bool) true
276
277	(Bool) false
278	(Bool) true
279	## END
280
281