OILS / demo / 04-unicode.sh View on Github | oils.pub

199 lines, 94 significant
1#!/usr/bin/env bash
2#
3# Usage:
4# demo/04-unicode.sh <function name>
5#
6# TODO: Test what happens if you read binary data into a $(command sub)
7# - internal NUL
8# - invalid utf-8 sequence
9#
10# It would be nice to move some of this into test/gold? It depends on the
11# locale.
12
# Strict mode: error on unset variables, fail a pipeline if any stage fails,
# and exit on any unhandled non-zero status.
set -o nounset -o pipefail -o errexit
16
17# https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
18#
19# See doc/unicode.txt.
20
unicode-char() {
  ### Print "[μ]" (U+03BC) encoded as UTF-8, via a Python 2 one-liner.

  # Single quotes: the \u escape is left for Python to interpret, not the
  # shell.
  local py_code='print u"[\u03bc]".encode("utf-8")'
  python -c "$py_code"
}
24
25# http://stackoverflow.com/questions/602912/how-do-you-echo-a-4-digit-unicode-character-in-bash
echo-char() {
  ### Print U+2620 three ways: raw UTF-8 bytes, bash's \u escape, /bin/echo.

  # Double quotes leave \x alone, so 'echo -e' sees the backslashes and emits
  # the three bytes itself.
  local utf8_bytes="\xE2\x98\xA0"
  local code_point='\u2620'

  echo -e "$utf8_bytes"

  # Earlier experiment, kept for reference:
  #echo -e "\x03\xbc"

  # Woah bash has this! Interesting. Not documented in "help echo" though.
  echo -e "$code_point"

  # GNU echo does not have it.
  /bin/echo -e "$code_point"
}
38
raw-char() {
  ### Print "[μ]" written as a literal, unquoted UTF-8 character in this file.

  # Use vim to put utf-8 in this source file:
  # 1. i to enter Insert mode
  # 2. Ctrl-V
  # 3. u
  # 4. 03bc -- the 4 hex digits of the code point

  # NOTE(review): the word is unquoted, so [μ] is also a bracket glob pattern;
  # a file named μ in the current directory could change what is printed.
  # Compare quoted-chars, which prints the quoted forms.
  echo [μ]
}
47
quoted-chars() {
  ### Print the same character through each shell quoting form.

  echo '[μ]' # single-quoted literal
  echo "[μ]" # double-quoted literal
  echo $'[\u03bc]' # C-escaped string

  # $"..." is bash's locale-translated string form.
  # Not implementing this
  # https://www.gnu.org/software/bash/manual/html_node/Locale-Translation.html
  echo $"hello"
}
57
test-unicode() {
  ### Show how the shell measures and slices a string holding a UTF-8 char.
  #
  # Prints the locale state, then the length and the :0 / :1 / :2 slices of
  # the output of unicode-char, then the same slices for raw-char.

  locale # displays state
  echo
  # LANG may be unset; ${LANG-} keeps 'set -o nounset' from aborting here.
  echo "${LANG-}"

  unicode-char

  # Declare and assign separately so that a failing unicode-char is not masked
  # by the exit status of 'local'.
  local u
  u=$(unicode-char)
  # Quoted: the value "[μ]" is otherwise subject to pathname expansion.
  echo "$u"

  # This changes bash behavior!

  #LANG=C
  echo "${#u}" # three chars

  # OK bash respect utf-8 when doing string slicing. Does it have its own
  # unicode support, or does it use libc?
  echo "${u:0}" "${u:1}" "${u:2}"

  u=$(raw-char)
  echo "${u:0}" "${u:1}" "${u:2}"
}
80
json() {
  ### Pipe JSON string literals containing \u escapes into json.loads().

  # Producers: each prints a JSON string literal. The shell's single quotes
  # hand the backslashes to Python unchanged.
  local mu_json='print "\"\u03bc\""'
  local nul_json='print "\"[\u0000]\""'

  # Consumers: decode stdin as JSON and print the result (Python 2 syntax for
  # the first two, Python 3 syntax for the last).
  local show_py2='
import sys, json
print json.loads(sys.stdin.read())
'
  local repr_py2='
import sys, json
print repr(json.loads(sys.stdin.read()))
'
  local repr_py3='
import sys, json
print(repr(json.loads(sys.stdin.read())))
'

  python -c "$mu_json" | python -c "$show_py2"

  # The \u0000 code point seems to be representable
  python -c "$nul_json" | python -c "$repr_py2"

  # Works in python3 too.
  python -c "$nul_json" | python3 -c "$repr_py3"
}
98
99# Right now it's split into (Lit_Other '\xce') and (Lit_Other '\xbc'). This is
100# fine for most purposes, although we could probably simplify this.
osh-literal() {
  ### Feed a command containing a literal UTF-8 character to bin/osh.

  local snippet='echo [μ]'

  # First with -n (presumably parse-only, no execution -- confirm against the
  # osh docs) ...
  bin/osh -n -c "$snippet"
  # ... then actually run it. This works fine
  bin/osh -c "$snippet"
}
106
107# TODO
108# - ltrace of bash, python, osh, ysh
109# - LANG vs LC_ALL - LANG is the default
110# - C or UTF-8 is accepted
111
libc-vars() {
  ### Run 'echo hi' in a shell under several locale environment settings.
  #
  # $1: shell to run (default: bin/osh)
  #
  # $sh is expanded unquoted, so a value containing spaces is split into a
  # command plus arguments.

  local sh=${1:-bin/osh} # also _bin/cxx-asan/osh

  # A shell under _bin/ is built with ninja before it is run.
  if [[ $sh == _bin/*/osh ]]; then
    ninja $sh
  fi

  # Baseline: the caller's own locale settings.
  $sh -c 'echo hi'
  echo

  # Then one run per override, each followed by a blank separator line.
  # LC_ALL=POSIX: this turns it into "C"
  # LC_ALL=zz: presumably not a valid locale name
  local override
  for override in LC_ALL=C LANG=C LC_COLLATE=C LC_ALL=POSIX LC_ALL=zz; do
    env "$override" $sh -c 'echo hi'
    echo
  done

  # TODO: non-utf8
}
142
143# Copied into spec/unicode.test.sh; mksh behaves differently
length-op() {
  ### Print ${#s} for U+03BC and U+10000, with LC_ALL empty and then LC_ALL=C.

  local locale_value
  for s in $'\u03bc' $'\U00010000'; do
    # LC_ALL is assigned in the current shell, not passed to a child.
    for locale_value in '' C; do
      LC_ALL=$locale_value
      echo "len=${#s}"
    done
  done
}
153
compare-shells() {
  ### Run this script's length-op function under bash, zsh and mksh.
  #
  # Prints a "=== <shell>" header before each run and a blank line after it.

  local sh

  # hm they all support unicode
  for sh in bash zsh mksh; do
    echo "=== $sh"
    # "$0" is quoted so a script path containing spaces stays one argument.
    "$sh" "$0" length-op
    echo
  done
}
162
len-1() {
  ### Print the length of a string holding the single code point U+10000.

  # ltrace-diff runs this function under ltrace and compares its trace with
  # the one from len-2, so the statements are kept minimal.
  s=$'\U00010000'
  echo ${#s}
}
167
len-2() {
  ### Like len-1, but also print the length of a second string (U+03BC).

  # ltrace-diff compares the trace of this function against len-1's; the extra
  # string is what should add library calls to the trace.
  s=$'\U00010000'
  s2=$'\u03bc' # different string, so length isn't cached

  # Other second strings that were tried:
  #s3=$'\uffff' # different string, so length isn't cached
  #s2=$'\U0001000f' # different string, so length isn't cached

  #echo ${#s} ${#s2}
  # I see more of these
  # __ctype_get_mb_cur_max() = 6
  # mbrtowc(0, 0xHEX, 3, 0xHEX) = 2

  echo ${#s} ${#s2}
}
182
norm-ltrace() {
  ### Normalize an ltrace log so that two runs can be diffed.
  #
  # $1: path of the ltrace output file
  #
  # Keeps only lines containing "mb" (e.g. mbrtowc) and replaces every
  # lowercase 0x... hex value with the fixed token 0xHEX.

  # "$1" is quoted and preceded by -- so any file name is taken literally;
  # -E is the portable spelling of GNU sed's --regexp-extended.
  grep mb -- "$1" | sed -E 's/0x[0-9a-f]+/0xHEX/g'
}
186
ltrace-diff() {
  ### Shows that bash calls decoding mbrtowc() when calculating string length!
  #
  # Traces len-1 and len-2 under ltrace, writes the traces to _tmp/1.txt and
  # _tmp/2.txt, prints their line counts, then diffs the normalized traces.

  # The redirects below fail if _tmp/ does not exist yet.
  mkdir -p _tmp

  # ltrace's report is captured from stderr; "$0" is quoted so a script path
  # with spaces stays one argument.
  ltrace bash "$0" len-1 2>_tmp/1.txt
  ltrace bash "$0" len-2 2>_tmp/2.txt

  wc -l _tmp/{1,2}.txt

  # diff exits 1 when the traces differ, so this function then returns 1.
  diff -u <(norm-ltrace _tmp/1.txt) <(norm-ltrace _tmp/2.txt)
}
197
198
199"$@"