| 1 | ## oils_failures_allowed: 1
|
| 2 |
|
| 3 | #### Match tab character with [\t]
|
| 4 | shopt -s ysh:all
|
| 5 |
|
| 6 | var pat = / ('a' [\t] 'b') /
|
| 7 | pp test_ (str(pat))
|
| 8 |
|
| 9 | var lines = :| b'aa\tbb' b'cc\tdd' |
|
| 10 | write @lines | egrep $pat | od -A n -t c
|
| 11 |
|
| 12 | ## STDOUT:
|
| 13 | (Str) "(a[\t]b)"
|
| 14 | a a \t b b \n
|
| 15 | ## END
|
| 16 |
|
| 17 | #### Match newline with [\n]
|
| 18 | shopt -s ysh:all
|
| 19 |
|
| 20 | var pat = / [\n] /
|
| 21 |
|
| 22 | pp test_ (str(pat))
|
| 23 |
|
| 24 | pp test_ ('z' ~ pat)
|
| 25 |
|
| 26 | # this matches
|
| 27 | pp test_ (b'\n' ~ pat)
|
| 28 |
|
| 29 | # but then what happens with grep?
|
| 30 | # invalid regular expression
|
| 31 |
|
| 32 | # write 1 2 3 | egrep "$pat"
|
| 33 |
|
| 34 | ## STDOUT:
|
| 35 | (Str) "[\n]"
|
| 36 | (Bool) false
|
| 37 | (Bool) true
|
| 38 | ## END
|
| 39 |
|
| 40 | #### ERE: 'dot' matches newline
|
| 41 |
|
| 42 | = 'z' ~ /dot/
|
| 43 | = b'\n' ~ /dot/
|
| 44 | = '' ~ /dot/
|
| 45 |
|
| 46 | ## STDOUT:
|
| 47 | (Bool) true
|
| 48 | (Bool) true
|
| 49 | (Bool) false
|
| 50 | ## END
|
| 51 |
|
| 52 | #### ERE: 'dot' matches code point represented with multiple bytes (mu = 0xce 0xbc)
|
| 53 |
|
| 54 | var pat = / 'a' dot 'b' /
|
| 55 |
|
| 56 | pp test_ ('axb' ~ pat )
|
| 57 | # mu character (U+3BC): first as raw UTF-8 bytes, then as a code point escape
|
| 58 | pp test_ (b'a\yce\ybcb' ~ pat )
|
| 59 | pp test_ (b'a\u{3bc}b' ~ pat )
|
| 60 |
|
| 61 | pp test_ ('ab' ~ pat )
|
| 62 | pp test_ ('aZZb' ~ pat )
|
| 63 |
|
| 64 | ## STDOUT:
|
| 65 | (Bool) true
|
| 66 | (Bool) true
|
| 67 | (Bool) true
|
| 68 | (Bool) false
|
| 69 | (Bool) false
|
| 70 | ## END
|
| 71 |
|
| 72 | #### $'\xff' is disallowed in Eggex, because it's disallowed in YSH
|
| 73 |
|
| 74 | # NOTE: This pattern doesn't work with en_US.UTF-8. I think the user should
|
| 75 | # set LANG=C or shopt --unset libc_utf8.
|
| 76 |
|
| 77 | shopt -s ysh:all
|
| 78 | = /[ $'\xff' ]/;
|
| 79 |
|
| 80 | ## status: 2
|
| 81 | ## STDOUT:
|
| 82 | ## END
|
| 83 |
|
| 84 | #### Match low ASCII with [\x01]
|
| 85 | shopt -s ysh:all
|
| 86 |
|
| 87 | pp test_ ('a' ~ / [a] /)
|
| 88 | echo
|
| 89 |
|
| 90 | pp test_ (b'\y01' ~ / [\x01] /)
|
| 91 | pp test_ (b'\y02' ~ / [\x01] /)
|
| 92 |
|
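| 93 | # \y01 isn't accepted, because we punt \x01 translation to ERE ... NOTE(review): the later case "Bytes are denoted \y01 in Eggex char classes" expects / [\y01] / to match, so this may be stale - confirm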
|
| 94 | #pp test_ (b'\y02' ~ / [\y01] /)
|
| 95 |
|
| 96 | # we print this as \u{1}
|
| 97 | # = str( / [\x01] / )
|
| 98 |
|
| 99 | ## STDOUT:
|
| 100 | (Bool) true
|
| 101 |
|
| 102 | (Bool) true
|
| 103 | (Bool) false
|
| 104 | ## END
|
| 105 |
|
| 106 | #### Match low ASCII with \u{7f} - translates to valid ERE
|
| 107 | shopt -s ysh:all
|
| 108 | var pat = /[ \u{7f} ]/;
|
| 109 |
|
| 110 | echo $pat | od -A n -t x1
|
| 111 | if (b'\y7f' ~ pat) { echo yes } else { echo no }
|
| 112 | if (b'\y7e' ~ pat) { echo yes } else { echo no }
|
| 113 |
|
| 114 | var pat2 = /[ \u{7f} ]/;
|
| 115 | var pat3 = /[ \u{0007f} ]/;
|
| 116 | test "$pat2" = "$pat3" && echo 'equal'
|
| 117 |
|
| 118 | var range = / [ \u{70} - \u{7f} ] /
|
| 119 | if (b'\y70' ~ range) { echo yes } else { echo no }
|
| 120 | if (b'\y69' ~ range) { echo yes } else { echo no }
|
| 121 |
|
| 122 | ## STDOUT:
|
| 123 | 5b 7f 5d 0a
|
| 124 | yes
|
| 125 | no
|
| 126 | equal
|
| 127 | yes
|
| 128 | no
|
| 129 | ## END
|
| 130 |
|
| 131 | #### non-ASCII bytes must be singleton terms, e.g. b'\y7f\yff' is disallowed
|
| 132 | var bytes = b'\y7f\yff'
|
| 133 | var pat = / [ @bytes ] /
|
| 134 | echo $pat
|
| 135 | ## status: 1
|
| 136 | ## stdout-json: ""
|
| 137 |
|
| 138 | #### Bytes are denoted \y01 in Eggex char classes (not \x01)
|
| 139 |
|
| 140 | # That is, eggex does NOT have MODES like re.UNICODE
|
| 141 | #
|
| 142 | # We UNAMBIGUOUSLY accept
|
| 143 | # - \y01 or \u{1} - these are the same
|
| 144 | # - \yff or \u{ff} - these are DIFFERENT
|
| 145 |
|
| 146 | var pat = / [\y01] /
|
| 147 | pp test_ (b'\y01' ~ pat)
|
| 148 | pp test_ ('a' ~ pat)
|
| 149 |
|
| 150 | ## STDOUT:
|
| 151 | (Bool) true
|
| 152 | (Bool) false
|
| 153 | ## END
|
| 154 |
|
| 155 | #### NUL byte can be expressed in Eggex, but not in ERE
|
| 156 |
|
| 157 | $SH <<'EOF'
|
| 158 | pp test_ (b'\y01' ~ / [\y01] /)
|
| 159 | pp test_ (b'\y00' ~ / [\y00] /)
|
| 160 | EOF
|
| 161 | echo status=$?
|
| 162 |
|
| 163 | $SH <<'EOF'
|
| 164 | pp test_ (b'\y01' ~ / [\u{1}] /)
|
| 165 | pp test_ (b'\y00' ~ / [\u{0}] /)
|
| 166 | EOF
|
| 167 | echo status=$?
|
| 168 |
|
| 169 |
|
| 170 | # legacy synonym: \x01 and \x00 in place of the \y and \u{} escapes above
|
| 171 |
|
| 172 | $SH <<'EOF'
|
| 173 | pp test_ (b'\y01' ~ / [\x01] /)
|
| 174 | pp test_ (b'\y00' ~ / [\x00] /)
|
| 175 | EOF
|
| 176 | echo status=$?
|
| 177 |
|
| 178 | ## STDOUT:
|
| 179 | (Bool) true
|
| 180 | status=1
|
| 181 | (Bool) true
|
| 182 | status=1
|
| 183 | (Bool) true
|
| 184 | status=1
|
| 185 | ## END
|
| 186 |
|
| 187 | #### High bytes 0x80 0xff usually can't be matched - Eggex is UTF-8
|
| 188 | shopt -s ysh:all
|
| 189 |
|
| 190 | # ascii works
|
| 191 | pp test_ (b'\y7f' ~ / [\x7f] /)
|
| 192 | pp test_ (b'\y7e' ~ / [\x7f] /)
|
| 193 |
|
| 194 | = str( / [\y80]/ )
|
| 195 |
|
| 196 | #pp test_ (b'\y80' ~ / [\y80] /)
|
| 197 | #pp test_ (b'\yff' ~ / [\yff] /)
|
| 198 |
|
| 199 | ## status: 1
|
| 200 | ## STDOUT:
|
| 201 | (Bool) true
|
| 202 | (Bool) false
|
| 203 | ## END
|
| 204 |
|
| 205 | #### High bytes 0x80 0xff can be matched with plain ERE and LC_ALL=C
|
| 206 |
|
| 207 | export LC_ALL=C
|
| 208 |
|
| 209 | $SH <<'EOF'
|
| 210 | var yes = b'foo \yff'
|
| 211 | var no = b'foo'
|
| 212 |
|
| 213 | # POSIX ERE string
|
| 214 | var ere = b'[\yff]'
|
| 215 |
|
| 216 | pp test_ (yes ~ ere)
|
| 217 | pp test_ (no ~ ere)
|
| 218 | EOF
|
| 219 |
|
| 220 | ## STDOUT:
|
| 221 | (Bool) true
|
| 222 | (Bool) false
|
| 223 | ## END
|
| 224 |
|
| 225 | #### Code points like \u{3bc} can be matched
|
| 226 |
|
| 227 | var pat = / [\u{3bc}] /
|
| 228 | pp test_ (b'a' ~ pat)
|
| 229 | pp test_ (b'\u{3bc}' ~ / [\u{3bc}] /)
|
| 230 | echo
|
| 231 |
|
| 232 | var pat = / [\u{10ffff}] /
|
| 233 | pp test_ (b'a' ~ pat)
|
| 234 | pp test_ (b'\u{10ffff}' ~ pat)
|
| 235 |
|
| 236 | #echo "-$pat-"
|
| 237 |
|
| 238 | ## STDOUT:
|
| 239 | (Bool) false
|
| 240 | (Bool) true
|
| 241 |
|
| 242 | (Bool) false
|
| 243 | (Bool) true
|
| 244 | ## END
|
| 245 |
|
| 246 | #### Code point ranges work in limited cases
|
| 247 | shopt -s ysh:all
|
| 248 |
|
| 249 | var range1 = /[ \u{1} - \u{7e} ]/;
|
| 250 |
|
| 251 | pp test_ (u'\u{7f}' ~ range1)
|
| 252 | pp test_ (u'\u{7e}' ~ range1)
|
| 253 |
|
| 254 | exit
|
| 255 |
|
| 256 | # Not reached because of the 'exit' above. Invalid collation character? Unicode ranges don't work I guess
|
| 257 | var range2 = /[ \u{1} - \u{3bc} ]/;
|
| 258 |
|
| 259 | pp test_ (b'\y7f' ~ range2)
|
| 260 | pp test_ (b'\y7e' ~ range2)
|
| 261 |
|
| 262 | ## STDOUT:
|
| 263 | (Bool) false
|
| 264 | (Bool) true
|
| 265 | ## END
|
| 266 |
|
| 267 |
|
| 268 | #### Max code point is disallowed at parse time
|
| 269 |
|
| 270 | pp test_ (/ [\u{10ffff}] /)
|
| 271 | pp test_ (/ [\u{110000}] /)
|
| 272 |
|
| 273 | ## STDOUT:
|
| 274 | (Bool) false
|
| 275 | (Bool) true
|
| 276 |
|
| 277 | (Bool) false
|
| 278 | (Bool) true
|
| 279 | ## END
|
| 280 |
|
| 281 |
|