1 ## oils_failures_allowed: 1
2
3 #### Match tab character with [\t]
# Builds an eggex with a [\t] char class between 'a' and 'b', prints its
# str() form, then pipes two tab-containing lines through egrep using that
# pattern. Per the expected od dump, only the a-TAB-b line should survive.
4 shopt -s ysh:all
5
6 var pat = / ('a' [\t] 'b') /
7 pp test_ (str(pat))
8
9 var lines = :| b'aa\tbb' b'cc\tdd' |
10 write @lines | egrep $pat | od -A n -t c
11
12 ## STDOUT:
13 (Str) "(a[\t]b)"
14 a a \t b b \n
15 ## END
16
17 #### Match newline with [\n]
# A [\n] char class is expected to reject 'z' and to match a string that is
# a single newline. Only in-process matching with ~ is exercised; the egrep
# pipeline at the bottom is left commented out.
18 shopt -s ysh:all
19
20 var pat = / [\n] /
21
22 pp test_ (str(pat))
23
24 pp test_ ('z' ~ pat)
25
26 # this matches
27 pp test_ (b'\n' ~ pat)
28
29 # but then what happens with grep?
30 # invalid regular expression
31
32 # write 1 2 3 | egrep "$pat"
33
34 ## STDOUT:
35 (Str) "[\n]"
36 (Bool) false
37 (Bool) true
38 ## END
39
40 #### ERE: 'dot' matches newline
# Expected results, in order: `dot` matches an ordinary character, matches a
# lone newline, and does not match the empty string.
41
42 = 'z' ~ /dot/
43 = b'\n' ~ /dot/
44 = '' ~ /dot/
45
46 ## STDOUT:
47 (Bool) true
48 (Bool) true
49 (Bool) false
50 ## END
51
52 #### ERE: 'dot' matches code point represented with multiple bytes (mu = 0xce 0xbc)
# `dot` sits between 'a' and 'b', so the pattern needs exactly one character
# there. The mu character (U+3BC) is written two ways below: as its two raw
# UTF-8 bytes (\yce \ybc) and as the code point escape \u{3bc}. Both are
# expected to count as a single character for `dot`.
53
54 var pat = / 'a' dot 'b' /
55
56 pp test_ ('axb' ~ pat )
57 # mu character
58 pp test_ (b'a\yce\ybcb' ~ pat )
59 pp test_ (b'a\u{3bc}b' ~ pat )
60
# Zero characters ('ab') or two characters ('aZZb') between a and b must not
# match.
61 pp test_ ('ab' ~ pat )
62 pp test_ ('aZZb' ~ pat )
63
64 ## STDOUT:
65 (Bool) true
66 (Bool) true
67 (Bool) true
68 (Bool) false
69 (Bool) false
70 ## END
71
72 #### $'\xff' is disallowed in Eggex, because it's disallowed in YSH
# Expectation: using a $'\xff' literal inside an eggex char class makes the
# script exit with status 2 and produce no stdout.
73
74 # NOTE: This pattern doesn't work with en_US.UTF-8. I think the user should
75 # set LANG=C or shopt --unset libc_utf8.
76
77 shopt -s ysh:all
78 = /[ $'\xff' ]/;
79
80 ## status: 2
81 ## STDOUT:
82 ## END
83
84 #### Match low ASCII with [\x01]
# Starts with a sanity check ([a] matches 'a'), then checks that a [\x01]
# char class matches the byte 0x01 and rejects the byte 0x02.
85 shopt -s ysh:all
86
87 pp test_ ('a' ~ / [a] /)
88 echo
89
90 pp test_ (b'\y01' ~ / [\x01] /)
91 pp test_ (b'\y02' ~ / [\x01] /)
92
93 # \y01 isn't accepted, because we punt \x01 translation to ERE ...
94 #pp test_ (b'\y02' ~ / [\y01] /)
95
96 # we print this as \u{1}
97 # = str( / [\x01] / )
98
99 ## STDOUT:
100 (Bool) true
101
102 (Bool) true
103 (Bool) false
104 ## END
105
106 #### Match low ASCII with \u{7f} - translates to valid ERE
# Three checks:
# 1. The stringified pattern is dumped with od; the expected bytes are
#    5b 7f 5d 0a, i.e. '[', 0x7f, ']' plus the newline added by echo.
# 2. pat2 and pat3 differ only in leading zeros inside \u{...} and are
#    expected to stringify to the same text.
# 3. The range \u{70}-\u{7f} should include byte 0x70 and exclude 0x69.
107 shopt -s ysh:all
108 var pat = /[ \u{7f} ]/;
109
110 echo $pat | od -A n -t x1
111 if (b'\y7f' ~ pat) { echo yes } else { echo no }
112 if (b'\y7e' ~ pat) { echo yes } else { echo no }
113
114 var pat2 = /[ \u{7f} ]/;
115 var pat3 = /[ \u{0007f} ]/;
116 test "$pat2" = "$pat3" && echo 'equal'
117
118 var range = / [ \u{70} - \u{7f} ] /
119 if (b'\y70' ~ range) { echo yes } else { echo no }
120 if (b'\y69' ~ range) { echo yes } else { echo no }
121
122 ## STDOUT:
123 5b 7f 5d 0a
124 yes
125 no
126 equal
127 yes
128 no
129 ## END
130
131 #### non-ASCII bytes must be singleton terms, e.g. b'\y7f\yff' is disallowed
# Splices a two-byte string (0x7f followed by 0xff) into a char class with
# @bytes. Expectation: the script fails with status 1 and the echo prints
# nothing.
132 var bytes = b'\y7f\yff'
133 var pat = / [ @bytes ] /
134 echo $pat
135 ## status: 1
136 ## stdout-json: ""
137
138 #### Bytes are denoted \y01 in Eggex char classes (not \x01)
139
140 # That is, eggex does NOT have MODES like re.UNICODE
# NOTE(review): the line above originally read "does have MODES"; the
# negation is inferred from the "UNAMBIGUOUSLY accept" wording below - confirm.
141 #
142 # We UNAMBIGUOUSLY accept
143 # - \y01 or \u{1} - these are the same
144 # - \yff or \u{ff} - these are DIFFERENT
145
# A [\y01] char class is expected to match the byte 0x01 and reject 'a'.
146 var pat = / [\y01] /
147 pp test_ (b'\y01' ~ pat)
148 pp test_ ('a' ~ pat)
149
150 ## STDOUT:
151 (Bool) true
152 (Bool) false
153 ## END
154
155 #### NUL byte can be expressed in Eggex, but not in ERE
# Each heredoc is run by a separate child $SH so its exit status can be
# echoed. In every variant the first line (byte 0x01) is expected to print
# true, while the second line (NUL in a char class) is expected to print
# nothing and leave the child with status 1. The three variants spell the
# char class member as \yNN, \u{N} and \xNN respectively.
156
157 $SH <<'EOF'
158 pp test_ (b'\y01' ~ / [\y01] /)
159 pp test_ (b'\y00' ~ / [\y00] /)
160 EOF
161 echo status=$?
162
163 $SH <<'EOF'
164 pp test_ (b'\y01' ~ / [\u{1}] /)
165 pp test_ (b'\y00' ~ / [\u{0}] /)
166 EOF
167 echo status=$?
168
169
170 # legacy synonym
171
172 $SH <<'EOF'
173 pp test_ (b'\y01' ~ / [\x01] /)
174 pp test_ (b'\y00' ~ / [\x00] /)
175 EOF
176 echo status=$?
177
178 ## STDOUT:
179 (Bool) true
180 status=1
181 (Bool) true
182 status=1
183 (Bool) true
184 status=1
185 ## END
186
187 #### High bytes 0x80 0xff usually can't be matched - Eggex is UTF-8
# The two ASCII checks with [\x7f] are expected to print true/false. After
# that, stringifying an eggex whose char class contains \y80 is expected to
# end the script with status 1 and no further output. The direct match
# attempts against \y80 / \yff are left commented out.
188 shopt -s ysh:all
189
190 # ascii works
191 pp test_ (b'\y7f' ~ / [\x7f] /)
192 pp test_ (b'\y7e' ~ / [\x7f] /)
193
194 = str( / [\y80]/ )
195
196 #pp test_ (b'\y80' ~ / [\y80] /)
197 #pp test_ (b'\yff' ~ / [\yff] /)
198
199 ## status: 1
200 ## STDOUT:
201 (Bool) true
202 (Bool) false
203 ## END
204
205 #### High bytes 0x80 0xff can be matched with plain ERE and LC_ALL=C
# LC_ALL=C is exported before the child $SH is started. Inside the child the
# pattern is a plain string (an ERE bracket expression holding the raw byte
# 0xff), not an eggex. It is expected to match the string containing 0xff
# and to reject the pure-ASCII one.
206
207 export LC_ALL=C
208
209 $SH <<'EOF'
210 var yes = b'foo \yff'
211 var no = b'foo'
212
213 # POSIX ERE string
214 var ere = b'[\yff]'
215
216 pp test_ (yes ~ ere)
217 pp test_ (no ~ ere)
218 EOF
219
220 ## STDOUT:
221 (Bool) true
222 (Bool) false
223 ## END
224
225 #### Code points like \u{3bc} can be matched
# For each of \u{3bc} and \u{10ffff}: a char class holding that single code
# point is expected to reject 'a' and to match a string consisting of that
# code point. `pat` is declared a second time for the \u{10ffff} half.
226
227 var pat = / [\u{3bc}] /
228 pp test_ (b'a' ~ pat)
229 pp test_ (b'\u{3bc}' ~ / [\u{3bc}] /)
230 echo
231
232 var pat = / [\u{10ffff}] /
233 pp test_ (b'a' ~ pat)
234 pp test_ (b'\u{10ffff}' ~ pat)
235
236 #echo "-$pat-"
237
238 ## STDOUT:
239 (Bool) false
240 (Bool) true
241
242 (Bool) false
243 (Bool) true
244 ## END
245
246 #### Code point ranges work in limited cases
# Only the ASCII range \u{1}-\u{7e} is actually exercised: it should exclude
# \u{7f} and include \u{7e}.
247 shopt -s ysh:all
248
249 var range1 = /[ \u{1} - \u{7e} ]/;
250
251 pp test_ (u'\u{7f}' ~ range1)
252 pp test_ (u'\u{7e}' ~ range1)
253
# The bare `exit` below ends the script here, so the range2 code after it
# never runs and contributes nothing to the expected output.
254 exit
255
256 # Invalid collation character? Unicode ranges don't work I guess
257 var range2 = /[ \u{1} - \u{3bc} ]/;
258
259 pp test_ (b'\y7f' ~ range2)
260 pp test_ (b'\y7e' ~ range2)
261
262 ## STDOUT:
263 (Bool) false
264 (Bool) true
265 ## END
266
267
268 #### Max code point is disallowed at parse time
# Prints an eggex for the largest code point used elsewhere in this file
# (\u{10ffff}) and then one for \u{110000}, which the title says should be
# rejected.
#
# NOTE(review): the expected STDOUT below is identical to that of the
# "Code points like \u{3bc} can be matched" case. This script only prints two
# eggex values (no ~ matches, no blank echo), so that output looks like a
# copied placeholder. Presumably this is the case covered by
# `oils_failures_allowed: 1` at the top of the file - confirm, then replace
# the block with the real expectation (likely a non-zero `status`).
269
270 pp test_ (/ [\u{10ffff}] /)
271 pp test_ (/ [\u{110000}] /)
272
273 ## STDOUT:
274 (Bool) false
275 (Bool) true
276
277 (Bool) false
278 (Bool) true
279 ## END
280
281