#!/usr/bin/env bash
#
# Run the 'perf' tool and associated reports on OSH.
#
# Usage:
#   benchmarks/perf.sh <function name>
#
# Deps:
#
#   Clone https://github.com/brendangregg/FlameGraph
#   Put it in ~/git/other/FlameGraph, or edit the paths below
#
# Examples:
#
#   $0 install-ubuntu-packages   # install perf, including matching kernel symbols
#
#   $0 profile-osh-parse         # make a flame graph
#
#   Then look at _tmp/perf/osh-parse.svg in the browser
#
#   $0 profile-osh-parse flat    # make a flat text report
#
#   perf report -i _tmp/perf/osh-parse.perf   # interactive
#
# Likewise for
#
#   $0 profile-example escape
#   => _tmp/perf/example-escape.svg
#   $0 profile-example escape flat
#   => _tmp/perf/example-escape.report.txt

set -o nounset
set -o pipefail
set -o errexit

REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)

source $REPO_ROOT/test/common.sh  # $OSH

# $REPO_ROOT needed since CPython configure changes dirs
readonly BASE_DIR=$REPO_ROOT/_tmp/perf

# TODO:
# - kernel symbols.  Is that why there are a lot of [unknown] in opt mode?
# - grep for call_function in the collapsed output.  I don't see it?
#   - it's inlined, I guess?

# Question: PyEval_EvalFrameEx doesn't appear recursive in opt mode?  At least
# according to 'perf'.  Or maybe there aren't enough samples to catch it?

# NOTES:
# - dbg vs. opt matters a lot
# - function-level performance categorization is bad for bytecode interpreters,
#   which have a single function and a big switch statement
# - a longer file like configure-coreutils hits garbage collection!  collect()
# - reference counting functions: visit_decref, visit_reachable

install-ubuntu-packages() {
  # linux-tools-generic is the kernel module
  # Apparently you need a package specific to the running kernel, not sure why.
  sudo apt-get install \
    linux-tools-common linux-tools-$(uname -r) linux-tools-generic
}

install-debian-packages() {
  sudo apt-get install linux-perf
}

soil-install() {
  sudo apt-get update  # seems to be needed

  install-ubuntu-packages
}

debug-symbols() {
  #dpkg --listfiles linux-tools-4.13.0-36-generic
  #sudo apt install python-dbg

  # I don't see symbol files here, just the interpreter?  Are they built into
  # the ELF file?
  #dpkg --listfiles python-dbg

  # has files in /usr/lib/debug
  # file /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug
  # /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, BuildID[sha1]=8d9bd4ce26e45ef16075c67d5f5eeafd8b562832, not stripped
  #
  # https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html

  # Does perf also support separate debug files?
  # How do I set the debug link in oil.ovm?  Or should I set the build ID?

  # The GNU binary utilities (Binutils) package includes the 'objcopy' utility
  # that can produce the separated executable / debugging information file
  # pairs using the following commands:
  #   objcopy --only-keep-debug foo foo.debug
  #   strip -g foo
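  #
  # The same GDB doc also describes attaching a "debug link" so tools can find
  # the .debug file afterwards (presumably this is what the oil.ovm question
  # above needs):
  #   objcopy --add-gnu-debuglink=foo.debug foo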

  sudo apt install zlib1g-dbg
  dpkg --listfiles zlib1g-dbg
  #libpython2.7-dbg
}

# TODO: Link these two tools in ../oil_DEPS/bin or something
# Make them work on CI

# NOTE: I used this before with python-flamegraph too.
flamegraph() {
  ~/git/other/FlameGraph/flamegraph.pl "$@"
}

stackcollapse-perf() {
  ~/git/other/FlameGraph/stackcollapse-perf.pl "$@"
}

# http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html
make-graph() {
  local name=${1:-osh-parse}

  local dir=$BASE_DIR
  perf script -i $dir/$name.perf | stackcollapse-perf > $dir/$name.perf-folded

  flamegraph $dir/$name.perf-folded > $dir/$name.svg

  echo "Wrote $dir/$name.svg"
}
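
# Example: if _tmp/perf/osh-parse.perf already exists (from a previous profile
# run), the SVG alone can be regenerated with something like:
#
#   benchmarks/perf.sh make-graph osh-parse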

_make-readable() {
  local perf_raw=$1

  chmod 644 $perf_raw

  # original user
  chown $USER $perf_raw
}

make-readable() {
  # This gets run as root
  local name=$1

  local perf_raw=$BASE_DIR/$name.perf

  sudo $0 _make-readable $perf_raw

  file $perf_raw
  ls -l $perf_raw
}

_record-cpp() {
  local name=$1  # e.g. oils-for-unix, escape
  local mode=$2
  shift 2

  # Can repeat 13 times without blowing heap
  #export REPEAT=13

  local freq=10000

  local extra_flags=''
  case $mode in
    (graph) extra_flags='-g' ;;  # needed to make flame graph
    (flat)  extra_flags='' ;;
    (*)     die "Mode should be graph or flat, got $mode" ;;
  esac

  time perf record $extra_flags -F $freq -o $BASE_DIR/$name.perf -- "$@"

  make-readable $name
}

profile-cpp() {
  local name=$1
  local mode=$2
  shift  # NOTE: $mode stays in "$@"; _record-cpp takes it as its second arg

  mkdir -p $BASE_DIR

  # -E to preserve environment variables like BENCHMARK=1
  sudo -E $REPO_ROOT/$0 _record-cpp $name "$@"

  case $mode in
    (graph)
      make-graph $name
      ;;
    (flat)
      local out=$BASE_DIR/$name.report.txt
      text-report $name | tee $out
      echo "Wrote $out"
      ;;
    (*)
      die "Mode should be graph or flat, got $mode"
      ;;
  esac
}

profile-osh-parse() {
  # Profile parsing a big file.  More than half the time is in malloc
  # (_int_malloc in glibc), which is not surprising!

  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  #local file=benchmarks/testdata/configure
  local file=benchmarks/testdata/configure-coreutils

  local -a cmd=( $osh --ast-format none -n $file )
  profile-cpp 'osh-parse' $mode "${cmd[@]}"

  # 'perf list' shows the events
  #OILS_GC_STATS=1 sudo perf stat -e cache-misses -e cache-references "${cmd[@]}"
  OILS_GC_STATS=1 sudo perf stat "${cmd[@]}"

  # Run again with GC stats
  time OILS_GC_STATS=1 "${cmd[@]}"
}

profile-fib() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  # Same iterations as benchmarks/gc
  local -a cmd=( $osh benchmarks/compute/fib.sh 500 44 )

  profile-cpp 'fib' $mode "${cmd[@]}"
}

# Hm this is dominated by GC, not regex?
profile-parse-help() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local -a cmd=( $osh benchmarks/parse-help/pure-excerpt.sh parse_help_file benchmarks/parse-help/clang.txt )

  profile-cpp 'parse-help' $mode "${cmd[@]}"
}

profile-example() {
  local example=${1:-escape}
  local mode=${2:-graph}

  local bin="_bin/cxx-opt/mycpp/examples/$example.mycpp"

  ninja $bin
  echo

  BENCHMARK=1 profile-cpp "example-$example" $mode $bin
}

profile-hash-table() {
  local mode=${1:-graph}

  local bin='_bin/cxx-opt/mycpp/hash_table'
  ninja $bin
  profile-cpp 'hash_table' $mode $bin -t hash_speed_test
}

# Perf note: Without -o, for some reason osh output is shown on the console.
# It doesn't go to wc?
#perf record -o perf.data -- _bin/osh -n benchmarks/testdata/abuild | wc -l

text-report() {
  ### Show a batch report; 'perf report' is interactive

  local name=$1
  shift

  local perf_raw=$BASE_DIR/$name.perf

  # Flat report
  perf report -i $perf_raw -n --stdio "$@"
}
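
# Any extra args after the name are passed through to 'perf report', e.g.
#   benchmarks/perf.sh text-report osh-parse --sort=dso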

# Shows instruction counts, branch misses, and so forth
#
# Wow 11 billion instructions!  9 billion cycles.  2.3 billion branches.  Crazy.
# Only 21M branch misses, or 0.9%.  Interesting.
_stat() {
  perf stat -- "$@" | wc -l
  # -e cache-misses only shows that stat
}
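
# Counting only specific events would look something like this (a sketch;
# event names vary by CPU and kernel, check 'perf list'):
#
#   perf stat -e instructions,cycles,branches,branch-misses -- "$@"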
stat() { sudo $0 _stat "$@"; }

stat-osh-parse() {
  stat _bin/cxx-opt/oils-for-unix --ast-format none -n benchmarks/testdata/configure
}


#
# OLD OVM stuff
#

# Parsing abuild in Debug mode:
# 14% PyEval_EvalFrameEx -- hm.  Interpreter dispatch is a lot?  More than I
#     thought.  Maybe I need my own loop.
#  8% type_call -- hm, introspection?
#  7% PyObject_GetAttr.  My intuition.  Should be done at compile time!
#  6% do_richcmp -- hm, interesting
#  5% PyObject_Malloc.

# More benchmarks:
# OPy running under OVM, compiling itself, compiling Oil, compiling OPy ports,
# etc.

# Parsing abuild, the optimized version.
#
# 80% PyEval_EvalFrameEx.  Whoa, everything is inlined?
# 12.5% PyObject_GenericGetAttr.  PyObject_GetAttr is much lower down.
# Some kernel.
# 0.76% lookdict_string is not a bottleneck.  Hm.
#
# Wow.
# Maybe I need counters in optimized mode?
# Yeah, what I really want is a per-opcode total!

_record() {

  # TODO: The optimized build should have symbols!  Don't build with -s.  And
  # then put symbols next to the binary somehow?  How do the symbols packages
  # work?
  #perf record -o perf.data -- _bin/oil.ovm-dbg osh -n benchmarks/testdata/abuild | wc -l

  # Call graph recording.  This helps it be less "flat" in opt mode.  Otherwise
  # everything is PyEval_EvalFrameEx.
  local flag='-g'
  local bin=_bin/oil.ovm-opt
  #local bin=_bin/oil.ovm-dbg  # This shows more details

  local freq=1000  # 1000 Hz

  #local file=benchmarks/testdata/abuild  # small file

  local file=benchmarks/testdata/configure-coreutils  # big file

  time perf record $flag -F $freq -o perf.data -- $bin osh --ast-format none -n $file
  #perf record -o perf.data -- _bin/osh --ast-format none -n benchmarks/testdata/abuild
}
record() { sudo $0 _record; }

#
# Soil CI
#

build-stress-test() {

  # Special _OIL_DEV for -D GC_TIMING
  _OIL_DEV=1 ./configure --without-readline

  mkdir -p _tmp
  c++ -D MARK_SWEEP -I . \
    -O2 -g \
    -o _tmp/gc_stress_test \
    mycpp/gc_stress_test.cc \
    mycpp/mark_sweep_heap.cc \
    mycpp/gc_builtins.cc \
    mycpp/gc_iolib.cc \
    mycpp/gc_mylib.cc \
    mycpp/gc_str.cc \
    mycpp/hash.cc \
    -lstdc++
}

profile-stress-test() {
  profile-cpp 'gc_stress_test' flat \
    _tmp/gc_stress_test
}

print-index() {
  echo '<body style="margin: 0 auto; width: 40em; font-size: large">'
  echo '<h1>Perf Profiles</h1>'

  for path in $BASE_DIR/*.txt; do
    local filename=$(basename $path)
    echo "<a href=\"$filename\">$filename</a> <br/>"
  done

  echo '</body>'
}

# TODO: fetch the tarball from the cpp-small CI task

build-tar() {
  local tar=${1:-_release/oils-for-unix.tar}

  tar=$PWD/$tar

  local tmp=$BASE_DIR/tar
  mkdir -p $tmp

  pushd $tmp

  tar --extract < $tar
  cd oils-for-unix-*  # glob of 1

  ./configure

  # TODO: add bumproot
  for variant in opt+bumpleak opt; do
    echo

    time _build/oils.sh '' $variant
    echo

    _bin/cxx-$variant-sh/osh -c 'echo "hi from $0"'
  done

  # TODO:
  # - profile each executable
  # - add OILS_GC_THRESHOLD=$big to avoid GC

  popd
}

profile-cpython-configure() {
  ### Borrowed from benchmarks/osh-runtime.sh

  local osh=${1:-$REPO_ROOT/_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local dir=$BASE_DIR/cpython-configure

  # This can fail because the old output is owned by root (perf runs as root)
  rm -r -f -v $dir || true

  mkdir -p $dir

  local -a cmd=( $osh $REPO_ROOT/Python-2.7.13/configure )

  pushd $dir
  profile-cpp 'cpython-configure' $mode "${cmd[@]}"
  popd
}

cpython-report() {
  #perf report -i $BASE_DIR/cpython-configure.perf

  # oils-for-unix is only 4.89% of the time?  That's #5
  # 48% in the kernel
  # 23% in cc1
  #
  # That means we're still a bit slow

  # TODO: I want to see OVERALL percentages
  #
  # GC is 1.6%, and let's say rooting is 3%.  That's 300 ms out of 10s.
  # GC could account for the whole thing.
  # I wonder if we can do GC while waiting for processes?  They might be tiny
  # processes though.
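  #
  # For percentages of ALL samples rather than just the selected DSO,
  # --percentage=absolute should do it (assuming the installed perf supports
  # that option):
  #
  #   perf report -i $BASE_DIR/cpython-configure.perf -n \
  #     --dso=oils-for-unix --percentage=absolute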

  perf report -i $BASE_DIR/cpython-configure.perf \
    -n --dso=oils-for-unix --percentage=relative

  #perf report -i $BASE_DIR/cpython-configure.perf --sort=dso
}

local-test() {
  local osh=_bin/cxx-opt/osh
  ninja $osh
  profile-fib $REPO_ROOT/$osh flat
}

soil-run() {
  echo 'TODO: run benchmarks/gc tasks'
  # But we don't have Ninja
  # Fetch the tarball?

  # Can you WAIT for the tarball?
  # You can wait for the cpp-small task that builds it?  Ah, hacky hacky.

  build-stress-test

  profile-stress-test

  export-osh-cpp _tmp/native-tar-test opt
  #export-osh-cpp '' opt

  profile-fib $OSH flat
  profile-osh-parse $OSH flat
  profile-parse-help $OSH flat

  print-index > $BASE_DIR/index.html

  echo "Wrote $BASE_DIR/index.html"
}

"$@"