#!/usr/bin/env bash
#
# Usage:
#   demo/04-unicode.sh <function name>
#
# TODO: Test what happens if you read binary data into a $(command sub)
# - internal NUL
# - invalid utf-8 sequence
# (a rough sketch is in binary-command-sub below)
#
# It would be nice to move some of this into test/gold?  It depends on the
# locale.
|
set -o nounset
set -o pipefail
set -o errexit

# https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
#
# See doc/unicode.txt.

unicode-char() {
  python -c 'print u"[\u03bc]".encode("utf-8")'
}
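
# Rough sketch for the TODO at the top of this file: read bytes with an
# internal NUL and an invalid utf-8 sequence into $(command sub).  The
# function name and the expectations noted here are guesses; exact behavior
# varies by bash version and locale.
binary-command-sub() {
  # bash drops NUL bytes from command sub output (newer versions warn about
  # it), so this length should be 2 rather than 3
  local with_nul=$(printf 'a\000b')
  echo "nul: len=${#with_nul}"

  # \316 is 0xce, the first byte of μ without its continuation byte
  local bad_utf8=$(printf '[\316]')
  echo "bad utf-8: len=${#bad_utf8}"
}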
|
# http://stackoverflow.com/questions/602912/how-do-you-echo-a-4-digit-unicode-character-in-bash
echo-char() {
  #echo -e "\xE2\x98\xA0"
  echo -e "\xE2\x98\xA0"

  #echo -e "\x03\xbc"

  # Whoa bash has this!  Interesting.  Not documented in "help echo" though.
  echo -e '\u2620'

  # GNU echo does not have it.
  /bin/echo -e '\u2620'
}

raw-char() {
  # Use vim to put utf-8 in this source file:
  # 1. i to enter Insert mode
  # 2. Ctrl-V
  # 3. u
  # 4. 03bc -- 4 digits of hex
  echo [μ]
}
|
quoted-chars() {
  echo '[μ]'
  echo "[μ]"
  echo $'[\u03bc]'  # C-escaped string

  # Not implementing this
  # https://www.gnu.org/software/bash/manual/html_node/Locale-Translation.html
  echo $"hello"
}
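
# Small helper sketch: dump the bytes behind these strings, to see the utf-8
# encoding directly.
show-bytes() {
  local mu=$'[\u03bc]'
  printf '%s\n' "$mu" | od -A n -t x1     # expect 5b ce bc 5d 0a

  local skull=$'\u2620'
  printf '%s\n' "$skull" | od -A n -t x1  # expect e2 98 a0 0a
}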
|
test-unicode() {
  locale  # displays state
  echo
  echo $LANG

  unicode-char

  local u=$(unicode-char)
  echo $u

  # This changes bash behavior!

  #LANG=C
  echo ${#u}  # three chars

  # OK bash respects utf-8 when doing string slicing.  Does it have its own
  # unicode support, or does it use libc?  (ltrace-diff below suggests it
  # calls libc's mbrtowc().)
  echo ${u:0} ${u:1} ${u:2}

  local u=$(raw-char)
  echo ${u:0} ${u:1} ${u:2}
}
|
json() {
  python -c 'print "\"\u03bc\""' | python -c '
import sys, json
print json.loads(sys.stdin.read())
'

  # \u0000 code point seems to be representable
  python -c 'print "\"[\u0000]\""' | python -c '
import sys, json
print repr(json.loads(sys.stdin.read()))
'
  # Works in python3 too.
  python -c 'print "\"[\u0000]\""' | python3 -c '
import sys, json
print(repr(json.loads(sys.stdin.read())))
'
}
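
# A python3-only variant of the \u0000 check, sketched here in case plain
# `python` (Python 2) isn't available.  The function name is arbitrary.
json3() {
  # echo without -e leaves the \u0000 escape intact for json.loads to decode
  echo '"[\u0000]"' | python3 -c '
import sys, json
print(repr(json.loads(sys.stdin.read())))
'
}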
|
# Right now it's split into (Lit_Other '\xce') and (Lit_Other '\xbc').  This is
# fine for most purposes, although we could probably simplify this.
osh-literal() {
  bin/osh -n -c 'echo [μ]'
  # This works fine
  bin/osh -c 'echo [μ]'
}

# TODO
# - ltrace of bash, python, osh, ysh (rough sketch: ltrace-compare below)
# - LANG vs LC_ALL - LANG is the default; LC_ALL overrides everything
# - C or UTF-8 is accepted
|
libc-vars() {
  local sh=${1:-bin/osh}  # also _bin/cxx-asan/osh

  case $sh in
    _bin/*/osh)
      ninja $sh
      ;;
  esac

  $sh -c 'echo hi'
  echo

  LC_ALL=C $sh -c 'echo hi'
  echo

  LANG=C $sh -c 'echo hi'
  echo

  LC_COLLATE=C $sh -c 'echo hi'
  echo

  # this turns it into "C"
  LC_ALL=POSIX $sh -c 'echo hi'
  echo

  LC_ALL=zz $sh -c 'echo hi'
  echo

  # TODO: non-utf8
}
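
# Rough sketch for the ltrace TODO above: trace interpreters running the same
# snippet and count the multibyte decoding calls.  The interpreter list is
# just a starting point, not the full bash/python/osh/ysh comparison, and the
# function name is arbitrary.
ltrace-compare() {
  mkdir -p _tmp
  for prog in bash bin/osh; do
    echo "=== $prog"
    ltrace -o _tmp/ltrace.txt $prog -c 's=μ; echo ${#s}'
    grep -c mbrtowc _tmp/ltrace.txt || true  # count may be 0
    echo
  done
}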
|
# Copied into spec/unicode.test.sh; mksh behaves differently
length-op() {
  for s in $'\u03bc' $'\U00010000'; do
    LC_ALL=
    echo "len=${#s}"

    LC_ALL=C
    echo "len=${#s}"
  done
}
|
compare-shells() {
  # hm they all support unicode
  for sh in bash zsh mksh; do
    echo "=== $sh"
    $sh $0 length-op
    echo
  done
}
|
len-1() {
  s=$'\U00010000'
  echo ${#s}
}

len-2() {
  s=$'\U00010000'
  s2=$'\u03bc'  # different string, so length isn't cached

  #s3=$'\uffff'  # different string, so length isn't cached
  #s2=$'\U0001000f'  # different string, so length isn't cached

  #echo ${#s} ${#s2}
  # With two strings, the ltrace output (see ltrace-diff below) shows more of
  # these calls:
  # __ctype_get_mb_cur_max() = 6
  # mbrtowc(0, 0xHEX, 3, 0xHEX) = 2

  echo ${#s} ${#s2}
}
|
| 182 |
|
| 183 | norm-ltrace() {
|
| 184 | grep mb $1 | sed --regexp-extended 's/0x[0-9a-f]+/0xHEX/g'
|
| 185 | }
|
| 186 |
|
| 187 | ltrace-diff() {
|
| 188 | ### Shows that bash calls decoding mbrtowc() when calculating string length!
|
| 189 |
|
| 190 | ltrace bash $0 len-1 2>_tmp/1.txt
|
| 191 | ltrace bash $0 len-2 2>_tmp/2.txt
|
| 192 |
|
| 193 | wc -l _tmp/{1,2}.txt
|
| 194 |
|
| 195 | diff -u <(norm-ltrace _tmp/1.txt) <(norm-ltrace _tmp/2.txt )
|
| 196 | }
|
| 197 |
|
| 198 |
|
| 199 | "$@"
|