OILS / spec / ysh-regex-bytes-chars.test.sh View on Github | oils.pub

281 lines, 116 significant
1## oils_failures_allowed: 1
2
3#### Match tab character with [\t]
# Checks two things about [\t] inside an eggex char class:
# - pp shows the string form of the pattern as "(a[\t]b)"
# - egrep, given that string, keeps only the input line that contains
#   'a', a tab, then 'b'; od -c prints the bytes of the surviving line
4shopt -s ysh:all
5
6var pat = / ('a' [\t] 'b') /
7pp test_ (str(pat))
8
9var lines = :| b'aa\tbb' b'cc\tdd' |
10write @lines | egrep $pat | od -A n -t c
11
12## STDOUT:
13(Str) "(a[\t]b)"
14 a a \t b b \n
15## END
16
17#### Match newline with [\n]
# Checks that / [\n] / is shown by pp as "[\n]", does not match 'z', and
# matches a string consisting of a single newline.  The egrep pipeline at
# the bottom is deliberately left commented out.
18shopt -s ysh:all
19
20var pat = / [\n] /
21
22pp test_ (str(pat))
23
24pp test_ ('z' ~ pat)
25
26# this matches
27pp test_ (b'\n' ~ pat)
28
29# but then what happens with grep?
30# invalid regular expression
31
32# write 1 2 3 | egrep "$pat"
33
34## STDOUT:
35(Str) "[\n]"
36(Bool) false
37(Bool) true
38## END
39
40#### ERE: 'dot' matches newline
# Expected results: 'dot' matches the one-character string 'z' and a lone
# newline, but not the empty string.
41
42= 'z' ~ /dot/
43= b'\n' ~ /dot/
44= '' ~ /dot/
45
46## STDOUT:
47(Bool) true
48(Bool) true
49(Bool) false
50## END
51
52#### ERE: 'dot' matches code point represented with multiple bytes (mu = 0xce 0xbc)
53
# The pattern is 'a', one 'dot', 'b'.  Expected results, in order:
# - true:  one ASCII character between a and b
# - true:  mu written as the two bytes \yce \ybc
# - true:  mu written as the code point \u{3bc}
# - false: nothing between a and b
# - false: two characters between a and b
54var pat = / 'a' dot 'b' /
55
56pp test_ ('axb' ~ pat )
57# mu character
58pp test_ (b'a\yce\ybcb' ~ pat )
59pp test_ (b'a\u{3bc}b' ~ pat )
60
61pp test_ ('ab' ~ pat )
62pp test_ ('aZZb' ~ pat )
63
64## STDOUT:
65(Bool) true
66(Bool) true
67(Bool) true
68(Bool) false
69(Bool) false
70## END
71
72#### $'\xff' is disallowed in Eggex, because it's disallowed in YSH
# Expects the eggex literal containing $'\xff' to be rejected: exit status 2
# and nothing on stdout.
73
74# NOTE: This pattern doesn't work with en_US.UTF-8. I think the user should
75# set LANG=C or shopt --unset libc_utf8.
76
77shopt -s ysh:all
78= /[ $'\xff' ]/;
79
80## status: 2
81## STDOUT:
82## END
83
84#### Match low ASCII with [\x01]
# The first check is a baseline with an ordinary ASCII class.  After the blank
# line from 'echo', [\x01] is expected to match the byte \y01 and to reject
# the byte \y02.  The remaining statements are commented out.
85shopt -s ysh:all
86
87pp test_ ('a' ~ / [a] /)
88echo
89
90pp test_ (b'\y01' ~ / [\x01] /)
91pp test_ (b'\y02' ~ / [\x01] /)
92
93# \y01 isn't accepted, because we punt \x01 translation to ERE ...
94#pp test_ (b'\y02' ~ / [\y01] /)
95
96# we print this as \u{1}
97# = str( / [\x01] / )
98
99## STDOUT:
100(Bool) true
101
102(Bool) true
103(Bool) false
104## END
105
106#### Match low ASCII with \u{7f} - translates to valid ERE
# Three checks:
# - od -t x1 shows the pattern string as '[' (5b), the byte 7f, ']' (5d),
#   followed by the newline (0a) that echo appends; the pattern matches
#   \y7f and not \y7e
# - \u{7f} and \u{0007f} (leading zeros) give the same pattern string
# - the range \u{70} - \u{7f} matches \y70 but not \y69
107shopt -s ysh:all
108var pat = /[ \u{7f} ]/;
109
110echo $pat | od -A n -t x1
111if (b'\y7f' ~ pat) { echo yes } else { echo no }
112if (b'\y7e' ~ pat) { echo yes } else { echo no }
113
114var pat2 = /[ \u{7f} ]/;
115var pat3 = /[ \u{0007f} ]/;
116test "$pat2" = "$pat3" && echo 'equal'
117
118var range = / [ \u{70} - \u{7f} ] /
119if (b'\y70' ~ range) { echo yes } else { echo no }
120if (b'\y69' ~ range) { echo yes } else { echo no }
121
122## STDOUT:
123 5b 7f 5d 0a
124yes
125no
126equal
127yes
128no
129## END
130
131#### non-ASCII bytes must be singleton terms, e.g. b'\y7f\yff' is disallowed
# Splices a two-byte string containing the non-ASCII byte \yff into a char
# class.  Expected outcome: exit status 1 and empty stdout, i.e. the 'echo'
# never prints a pattern.
132var bytes = b'\y7f\yff'
133var pat = / [ @bytes ] /
134echo $pat
135## status: 1
136## stdout-json: ""
137
138#### Bytes are denoted \y01 in Eggex char classes (not \x01)
139
140# That is, eggex does NOT have MODES like re.UNICODE
141#
142# We UNAMBIGUOUSLY accept
143# - \y01 or \u{1} - these are the same
144# - \yff or \u{ff} - these are DIFFERENT (the byte 0xff vs. the code point U+FF)
145
# Expected results: / [\y01] / matches the byte \y01 and does not match 'a'.
146var pat = / [\y01] /
147pp test_ (b'\y01' ~ pat)
148pp test_ ('a' ~ pat)
149
150## STDOUT:
151(Bool) true
152(Bool) false
153## END
154
155#### NUL byte can be expressed in Eggex, but not in ERE
# Runs three child $SH scripts, one per spelling: \yNN, \u{N} and \xNN.
# Each script first matches byte 0x01 (prints true) and then tries the same
# with the NUL byte.  The expected output has a single (Bool) line followed by
# status=1 for every script, i.e. the NUL line is expected to fail each time.
156
157$SH <<'EOF'
158pp test_ (b'\y01' ~ / [\y01] /)
159pp test_ (b'\y00' ~ / [\y00] /)
160EOF
161echo status=$?
162
163$SH <<'EOF'
164pp test_ (b'\y01' ~ / [\u{1}] /)
165pp test_ (b'\y00' ~ / [\u{0}] /)
166EOF
167echo status=$?
168
169
170# legacy synonym
171
172$SH <<'EOF'
173pp test_ (b'\y01' ~ / [\x01] /)
174pp test_ (b'\y00' ~ / [\x00] /)
175EOF
176echo status=$?
177
178## STDOUT:
179(Bool) true
180status=1
181(Bool) true
182status=1
183(Bool) true
184status=1
185## END
186
187#### High bytes 0x80 0xff usually can't be matched - Eggex is UTF-8
# The two [\x7f] checks print true and false.  Stringifying / [\y80] / is
# then expected to fail: the case ends with status 1 and no further stdout.
# The direct match checks for \y80 and \yff are left commented out.
188shopt -s ysh:all
189
190# ascii works
191pp test_ (b'\y7f' ~ / [\x7f] /)
192pp test_ (b'\y7e' ~ / [\x7f] /)
193
194= str( / [\y80]/ )
195
196#pp test_ (b'\y80' ~ / [\y80] /)
197#pp test_ (b'\yff' ~ / [\yff] /)
198
199## status: 1
200## STDOUT:
201(Bool) true
202(Bool) false
203## END
204
205#### High bytes 0x80 0xff can be matched with plain ERE and LC_ALL=C
# LC_ALL=C is exported before the child $SH starts.  The child matches against
# a plain ERE string (not an eggex) holding the byte 0xff in a bracket
# expression: true for the subject that contains \yff, false for the other.
206
207export LC_ALL=C
208
209$SH <<'EOF'
210var yes = b'foo \yff'
211var no = b'foo'
212
213# POSIX ERE string
214var ere = b'[\yff]'
215
216pp test_ (yes ~ ere)
217pp test_ (no ~ ere)
218EOF
219
220## STDOUT:
221(Bool) true
222(Bool) false
223## END
224
225#### Code points like \u{3bc} can be matched
# For \u{3bc} and then for \u{10ffff}: a char class holding the code point
# does not match 'a' and does match a string holding that same code point.
# 'echo' separates the two groups with a blank line.
226
227var pat = / [\u{3bc}] /
228pp test_ (b'a' ~ pat)
229pp test_ (b'\u{3bc}' ~ / [\u{3bc}] /)
230echo
231
232var pat = / [\u{10ffff}] /
233pp test_ (b'a' ~ pat)
234pp test_ (b'\u{10ffff}' ~ pat)
235
236#echo "-$pat-"
237
238## STDOUT:
239(Bool) false
240(Bool) true
241
242(Bool) false
243(Bool) true
244## END
245
246#### Code point ranges work in limited cases
# Only range1 (\u{1} - \u{7e}) is exercised: \u{7f} is outside the range and
# \u{7e} is inside it.  The bare 'exit' below ends the script, so the range2
# statements after it never run and contribute nothing to the expected output.
247shopt -s ysh:all
248
249var range1 = /[ \u{1} - \u{7e} ]/;
250
251pp test_ (u'\u{7f}' ~ range1)
252pp test_ (u'\u{7e}' ~ range1)
253
254exit
255
256# Invalid collation character? Unicode ranges don't work I guess
257var range2 = /[ \u{1} - \u{3bc} ]/;
258
259pp test_ (b'\y7f' ~ range2)
260pp test_ (b'\y7e' ~ range2)
261
262## STDOUT:
263(Bool) false
264(Bool) true
265## END
266
267
268#### Code points above the max \u{10ffff} are disallowed at parse time
269
# Prints an eggex using the largest code point, then one using \u{110000},
# which is one past it.
# NOTE(review): the expected STDOUT below is identical to that of the
# "Code points like \u{3bc} can be matched" case and does not correspond to
# these two statements (neither is a match expression yielding a Bool).
# Presumably this is the case counted by 'oils_failures_allowed: 1' at the top
# of the file -- confirm, and replace the expectation with the real one.
270pp test_ (/ [\u{10ffff}] /)
271pp test_ (/ [\u{110000}] /)
272
273## STDOUT:
274(Bool) false
275(Bool) true
276
277(Bool) false
278(Bool) true
279## END
280
281