#!/usr/bin/env bash
#
# Usage:
#   demo/04-unicode.sh <function name>
#
# TODO: Test what happens if you read binary data into a $(command sub)
# - internal NUL
# - invalid utf-8 sequence
# (a rough sketch is in binary-command-sub below)
#
# It would be nice to move some of this into test/gold?  It depends on the
# locale.
|
set -o nounset
set -o pipefail
set -o errexit

# https://www.gnu.org/software/bash/manual/bash.html#Shell-Parameter-Expansion
#
# See doc/unicode.txt.

unicode-char() {
  python -c 'print u"[\u03bc]".encode("utf-8")'
}
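
# Rough sketch for the TODO at the top of this file: read bytes with an
# internal NUL and an invalid utf-8 sequence into $(command sub).  The
# function name and the expectations noted here are guesses; exact behavior
# varies by bash version and locale.
binary-command-sub() {
  # bash drops NUL bytes from command sub output (newer versions warn about
  # it), so this length should be 2 rather than 3
  local with_nul=$(printf 'a\000b')
  echo "nul: len=${#with_nul}"

  # \316 is 0xce, the first byte of μ without its continuation byte
  local bad_utf8=$(printf '[\316]')
  echo "bad utf-8: len=${#bad_utf8}"
}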
|
# http://stackoverflow.com/questions/602912/how-do-you-echo-a-4-digit-unicode-character-in-bash
echo-char() {
  #echo -e "\xE2\x98\xA0"
  echo -e "\xE2\x98\xA0"

  #echo -e "\x03\xbc"

  # Whoa bash has this!  Interesting.  Not documented in "help echo" though.
  echo -e '\u2620'

  # GNU echo does not have it.
  /bin/echo -e '\u2620'
}

raw-char() {
  # Use vim to put utf-8 in this source file:
  # 1. i to enter Insert mode
  # 2. Ctrl-V
  # 3. u
  # 4. 03bc -- 4 digits of hex
  echo [μ]
}
|
quoted-chars() {
  echo '[μ]'
  echo "[μ]"
  echo $'[\u03bc]'  # C-escaped string

  # Not implementing this
  # https://www.gnu.org/software/bash/manual/html_node/Locale-Translation.html
  echo $"hello"
}
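
# Small helper sketch: dump the bytes behind these strings, to see the utf-8
# encoding directly.
show-bytes() {
  local mu=$'[\u03bc]'
  printf '%s\n' "$mu" | od -A n -t x1     # expect 5b ce bc 5d 0a

  local skull=$'\u2620'
  printf '%s\n' "$skull" | od -A n -t x1  # expect e2 98 a0 0a
}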
|
test-unicode() {
  locale  # displays state
  echo
  echo $LANG

  unicode-char

  local u=$(unicode-char)
  echo $u

  # This changes bash behavior!

  #LANG=C
  echo ${#u}  # three chars

  # OK bash respects utf-8 when doing string slicing.  Does it have its own
  # unicode support, or does it use libc?  (ltrace-diff below suggests it
  # calls libc's mbrtowc().)
  echo ${u:0} ${u:1} ${u:2}

  local u=$(raw-char)
  echo ${u:0} ${u:1} ${u:2}
}
|
json() {
  python -c 'print "\"\u03bc\""' | python -c '
import sys, json
print json.loads(sys.stdin.read())
'

  # \u0000 code point seems to be representable
  python -c 'print "\"[\u0000]\""' | python -c '
import sys, json
print repr(json.loads(sys.stdin.read()))
'
  # Works in python3 too.
  python -c 'print "\"[\u0000]\""' | python3 -c '
import sys, json
print(repr(json.loads(sys.stdin.read())))
'
}
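
# A python3-only variant of the \u0000 check, sketched here in case plain
# `python` (Python 2) isn't available.  The function name is arbitrary.
json3() {
  # echo without -e leaves the \u0000 escape intact for json.loads to decode
  echo '"[\u0000]"' | python3 -c '
import sys, json
print(repr(json.loads(sys.stdin.read())))
'
}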
|
# Right now it's split into (Lit_Other '\xce') and (Lit_Other '\xbc').  This is
# fine for most purposes, although we could probably simplify this.
osh-literal() {
  bin/osh -n -c 'echo [μ]'
  # This works fine
  bin/osh -c 'echo [μ]'
}

# TODO
# - ltrace of bash, python, osh, ysh (rough sketch: ltrace-compare below)
# - LANG vs LC_ALL - LANG is the default; LC_ALL overrides everything
# - C or UTF-8 is accepted
|
libc-vars() {
  local sh=${1:-bin/osh}  # also _bin/cxx-asan/osh

  case $sh in
    _bin/*/osh)
      ninja $sh
      ;;
  esac

  $sh -c 'echo hi'
  echo

  LC_ALL=C $sh -c 'echo hi'
  echo

  LANG=C $sh -c 'echo hi'
  echo

  LC_COLLATE=C $sh -c 'echo hi'
  echo

  # this turns it into "C"
  LC_ALL=POSIX $sh -c 'echo hi'
  echo

  LC_ALL=zz $sh -c 'echo hi'
  echo

  # TODO: non-utf8
}
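
# Rough sketch for the ltrace TODO above: trace interpreters running the same
# snippet and count the multibyte decoding calls.  The interpreter list is
# just a starting point, not the full bash/python/osh/ysh comparison, and the
# function name is arbitrary.
ltrace-compare() {
  mkdir -p _tmp
  for prog in bash bin/osh; do
    echo "=== $prog"
    ltrace -o _tmp/ltrace.txt $prog -c 's=μ; echo ${#s}'
    grep -c mbrtowc _tmp/ltrace.txt || true  # count may be 0
    echo
  done
}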
|
# Copied into spec/unicode.test.sh; mksh behaves differently
length-op() {
  for s in $'\u03bc' $'\U00010000'; do
    LC_ALL=
    echo "len=${#s}"

    LC_ALL=C
    echo "len=${#s}"
  done
}
|
compare-shells() {
  # hm they all support unicode
  for sh in bash zsh mksh; do
    echo "=== $sh"
    $sh $0 length-op
    echo
  done
}
|
len-1() {
  s=$'\U00010000'
  echo ${#s}
}

len-2() {
  s=$'\U00010000'
  s2=$'\u03bc'  # different string, so length isn't cached

  #s3=$'\uffff'  # different string, so length isn't cached
  #s2=$'\U0001000f'  # different string, so length isn't cached

  #echo ${#s} ${#s2}
  # With two strings, the ltrace output (see ltrace-diff below) shows more of
  # these calls:
  # __ctype_get_mb_cur_max() = 6
  # mbrtowc(0, 0xHEX, 3, 0xHEX) = 2

  echo ${#s} ${#s2}
}
|
| 182 |
|
| 183 | norm-ltrace() {
|
| 184 | grep mb $1 | sed --regexp-extended 's/0x[0-9a-f]+/0xHEX/g'
|
| 185 | }
|
| 186 |
|
| 187 | ltrace-diff() {
|
| 188 | ### Shows that bash calls decoding mbrtowc() when calculating string length!
|
| 189 |
|
| 190 | ltrace bash $0 len-1 2>_tmp/1.txt
|
| 191 | ltrace bash $0 len-2 2>_tmp/2.txt
|
| 192 |
|
| 193 | wc -l _tmp/{1,2}.txt
|
| 194 |
|
| 195 | diff -u <(norm-ltrace _tmp/1.txt) <(norm-ltrace _tmp/2.txt )
|
| 196 | }
|
| 197 |
|
| 198 |
|
| 199 | "$@"
|