OILS / benchmarks / perf.sh View on Github | oils.pub

482 lines, 213 significant
1#!/usr/bin/env bash
2#
3# Run the 'perf' tool and associated reports on OSH.
4#
5# Usage:
6# benchmarks/perf.sh <function name>
7#
8# Deps:
9#
10# Clone https://github.com/brendangregg/FlameGraph
11# Put it in ~/git/other/FlameGraph, or edit the paths below
12#
13# Examples:
14#
15# $0 install # install perf, including matching kernel symbols
16#
17# $0 profile-osh-parse # make flame graph
18#
19# Then look at _tmp/perf/osh-parse.svg in the browser
20
21# $0 profile-osh-parse flat # make flat text report
22#
23# perf report -i _tmp/perf/osh-parse.perf # interactive
24#
25# Likewise for
26#
27# $0 profile-example escape
28# => _tmp/perf/example-escape.svg
29# $0 profile-example escape flat
30# => _tmp/perf/example-escape.report.txt
31
set -o nounset
set -o pipefail
set -o errexit

# Quote "$0" and the cd target so this works from a path containing spaces.
REPO_ROOT=$(cd "$(dirname "$0")/.."; pwd)

source "$REPO_ROOT/test/common.sh"  # for $OSH and die()

# $REPO_ROOT needed since CPython configure changes dirs
readonly BASE_DIR=$REPO_ROOT/_tmp/perf
42
43# TODO:
44# - kernel symbols. Is that why there are a lot of [unknown] in opt mode?
45# - grep for call_function in collapsed. I don't see it?
46# - it's inlined I guess?
47
48# Question: PyEval_EvalFrameEx doesn't appear recursive in opt mode? At least
49# according to 'perf'. Or maybe you don't have enough samples to catch it?
50
51# NOTES:
52# - dbg vs. opt matters a lot
53# - function-level performance categorization is bad for bytecode interpreters,
54# which have a single function and a big switch statement.
55# - a longer file like configure-coreutils hit garbage collection! collect()
56# - reference counting functions: visit_decref, visit_reachable
57
install-ubuntu-packages() {
  # linux-tools-generic is the kernel module.  A linux-tools package matching
  # the running kernel is apparently also required (not sure why).
  local -a pkgs=(
    linux-tools-common
    "linux-tools-$(uname -r)"
    linux-tools-generic
  )
  sudo apt-get install "${pkgs[@]}"
}
64
install-debian-packages() {
  # On Debian, perf ships in the linux-perf package.
  local pkg='linux-perf'
  sudo apt-get install "$pkg"
}
68
soil-install() {
  # An 'update' seems to be needed before installing.
  sudo apt-get update

  install-ubuntu-packages
}
74
debug-symbols() {
  ### Notes / experiments on getting debug symbols for profiling.

  #dpkg --listfiles linux-tools-4.13.0-36-generic
  #sudo apt install python-dbg
  #dpkg --listfiles python-dbg  # no separate symbol files? built into the ELF?

  # Debug packages put files under /usr/lib/debug, keyed by build ID, e.g.
  #   /usr/lib/debug/.build-id/8d/9bd4ce...debug: ELF 64-bit LSB shared
  #   object, x86-64, ... BuildID[sha1]=8d9bd4ce..., not stripped
  # https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html
  #
  # Open questions:
  # - Does perf also support separate debug files?
  # - How do we set the debug link in oil.ovm?  Or should we set a build ID?
  #
  # Binutils 'objcopy' can produce separated executable / debug-info pairs:
  #   objcopy --only-keep-debug foo foo.debug
  #   strip -g foo

  sudo apt install zlib1g-dbg
  dpkg --listfiles zlib1g-dbg
  #libpython2.7-dbg
}
101
102# TODO: Make these tools work in CI, by turning them into wedges?
103
104# NOTE: I used this before with python-flamegraph too.
# Wrapper for Brendan Gregg's flamegraph.pl.  The FlameGraph checkout
# location can be overridden with $FLAMEGRAPH_DIR; it defaults to the
# historical hard-coded path ~/git/other/FlameGraph.
flamegraph() {
  "${FLAMEGRAPH_DIR:-$HOME/git/other/FlameGraph}/flamegraph.pl" "$@"
}
108
# Wrapper for FlameGraph's stackcollapse-perf.pl filter.  Honors the same
# $FLAMEGRAPH_DIR override as flamegraph(), defaulting to the historical path.
stackcollapse-perf() {
  "${FLAMEGRAPH_DIR:-$HOME/git/other/FlameGraph}/stackcollapse-perf.pl" "$@"
}
112
# http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html
make-graph() {
  ### Turn $BASE_DIR/$name.perf into a flame graph SVG.
  local name=${1:-osh-parse}
  local prefix=$BASE_DIR/$name

  # Collapse perf samples into folded stacks, then render the SVG.
  perf script -i "$prefix.perf" | stackcollapse-perf > "$prefix.perf-folded"
  flamegraph "$prefix.perf-folded" > "$prefix.svg"

  echo "Wrote $prefix.svg"
}
124
_make-readable() {
  ### Runs as root: give the invoking user access to a perf data file.
  local perf_raw=$1

  chmod 644 "$perf_raw"

  # BUG FIX: under sudo, $USER is 'root', so the original 'chown $USER' kept
  # the file owned by root.  $SUDO_USER is the invoking user; fall back to
  # the current user when not running under sudo.
  chown "${SUDO_USER:-$(id -un)}" "$perf_raw"
}
133
make-readable() {
  ### Fix permissions on perf output, which 'perf record' wrote as root.
  local name=$1
  local perf_raw=$BASE_DIR/$name.perf

  # Re-invoke this script as root to do the chmod/chown.
  sudo "$0" _make-readable "$perf_raw"

  file "$perf_raw"
  ls -l "$perf_raw"
}
145
_record-cpp() {
  ### Run 'perf record' on a command (expected to run as root).
  local name=$1  # e.g. oils-for-unix, escape
  local mode=$2
  shift 2

  # Can repeat 13 times without blowing heap
  #export REPEAT=13

  local freq=10000

  local extra_flags
  case $mode in
    graph) extra_flags='-g' ;;  # record call graphs, needed for flame graphs
    flat)  extra_flags='' ;;
    *)     die "Mode should be graph or flat, got $mode" ;;
  esac

  # $extra_flags is intentionally unquoted so an empty value disappears.
  time perf record $extra_flags -F "$freq" -o "$BASE_DIR/$name.perf" -- "$@"

  make-readable "$name"
}
167
profile-cpp() {
  ### Record a profile of a command, then report it per $mode.
  local name=$1
  local mode=$2
  shift  # NOTE: shift by 1 only -- $mode stays in "$@" for _record-cpp

  mkdir -p "$BASE_DIR"

  # sudo -E preserves environment vars like BENCHMARK=1
  sudo -E "$REPO_ROOT/$0" _record-cpp "$name" "$@"

  case $mode in
    graph)
      make-graph "$name"
      ;;
    flat)
      local out=$BASE_DIR/$name.report.txt
      text-report "$name" | tee "$out"
      echo "Wrote $out"
      ;;
    *)
      die "Mode should be graph or flat, got $mode"
      ;;
  esac
}
192
profile-osh-parse() {
  ### Profile parsing a big file.  More than half the time is in malloc
  ### (_int_malloc in GCC), which is not surprising!
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  #local file=benchmarks/testdata/configure
  local file=benchmarks/testdata/configure-coreutils

  local -a cmd=( "$osh" --ast-format none -n "$file" )

  profile-cpp 'osh-parse' "$mode" "${cmd[@]}"

  # 'perf list' shows the events
  #OILS_GC_STATS=1 sudo perf stat -e cache-misses -e cache-references "${cmd[@]}"
  OILS_GC_STATS=1 sudo perf stat "${cmd[@]}"

  # One more run, printing GC stats
  time OILS_GC_STATS=1 "${cmd[@]}"
}
213
profile-fib() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  # Same iteration counts as benchmarks/gc
  local -a cmd=( "$osh" benchmarks/compute/fib.sh 500 44 )
  profile-cpp 'fib' "$mode" "${cmd[@]}"
}
223
# Hm this is dominated by GC, not regex?
profile-parse-help() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local -a cmd=(
    "$osh" benchmarks/parse-help/pure-excerpt.sh
    parse_help_file benchmarks/parse-help/clang.txt
  )
  profile-cpp 'parse-help' "$mode" "${cmd[@]}"
}
233
profile-example() {
  ### Build and profile a mycpp translated example.
  local example=${1:-escape}
  local mode=${2:-graph}

  local bin=_bin/cxx-opt/mycpp/examples/$example.mycpp
  ninja "$bin"
  echo

  # BENCHMARK=1 is forwarded through sudo -E by profile-cpp.
  BENCHMARK=1 profile-cpp "example-$example" "$mode" "$bin"
}
245
profile-hash-table() {
  local mode=${1:-graph}
  local bin=_bin/cxx-opt/mycpp/hash_table

  ninja "$bin"
  profile-cpp 'hash_table' "$mode" "$bin" -t hash_speed_test
}
253
254# Perf note: Without -o, for some reason osh output is shown on the console.
255# It doesn't go to wc?
256#perf record -o perf.data -- _bin/osh -n benchmarks/testdata/abuild | wc -l
257
text-report() {
  ### Show a batch report; plain 'perf report' is interactive
  local name=$1
  shift

  # -n adds sample counts; remaining args pass through to perf report.
  perf report -i "$BASE_DIR/$name.perf" -n --stdio "$@"
}
269
270# Shows instruction counts, branch misses, and so forth
271#
272# Wow 11 billion instructions! 9 billion cycles. 2.3 billion branches. Crazy.
273# Only 21M branch misses, or 0.9%. Interesting.
_stat() {
  # -e cache-misses would show only that stat
  perf stat -- "$@" | wc -l
}

stat() {
  sudo "$0" _stat "$@"
}
279
stat-osh-parse() {
  local bin=_bin/cxx-opt/oils-for-unix
  stat "$bin" --ast-format none -n benchmarks/testdata/configure
}
283
284
285#
286# OLD OVM stuff
287#
288
289# Parsing abuild in Debug mode:
290# 14% PyEval_EvalFrameEx -- hm. Interpreter dispatch is a lot? More than I
291# thought. Maybe need my own loop.
292# 8% type_call -- hm introspection?
293# 7% PyObject_GetAttr. My intitution. Should be done at compile time!
294# 6% do_richcmp -- hm interesting
295# 5% PyObject_Malloc.
296
297# More benchmarks:
298# OPy running under OVM, compiling itself, compiling Oils, compiling OPy ports,
299# etc.
300
301# Parsing abuild, the optimized version.
302#
303# 80% PyEval_EvalFramEx. Woah everything is inlined?
304# 12.5% PyObject_GenericGetAtr. PyObject_GetAttr is much lower down.
305# Some kernel.
306# 0.76% lookdict_string is not a bottleneck. Hm.
307#
308# Wow.
309# Maybe I need counters in optimized mode?
310# Yeah what I really want is per opcode total!
311
_record() {
  ### Old OVM profiling entry point (run as root via record()).

  # TODO: The optimized build should have symbols!  Don't build with -s.  And
  # then put symbols next to the binary somehow?  How do the symbols packages
  # work?
  #perf record -o perf.data -- _bin/oil.ovm-dbg osh -n benchmarks/testdata/abuild | wc -l

  # -g records call graphs, which makes opt-mode output less "flat" --
  # otherwise everything shows up as PyEval_EvalFrameEx.
  local flag='-g'
  local bin=_bin/oil.ovm-opt
  #local bin=_bin/oil.ovm-dbg  # This shows more details

  local freq=1000  # Hz

  #local file=benchmarks/testdata/abuild  # small file
  local file=benchmarks/testdata/configure-coreutils  # big file

  time perf record $flag -F "$freq" -o perf.data -- "$bin" osh --ast-format none -n "$file"
  #perf record -o perf.data -- _bin/osh --ast-format none -n benchmarks/testdata/abuild
}

record() {
  sudo "$0" _record
}
335
336#
337# Soil CI
338#
339
build-stress-test() {
  ### Compile the GC stress test directly with c++ (no Ninja needed).

  # Special _OIL_DEV for -D GC_TIMING
  _OIL_DEV=1 ./configure --without-readline

  mkdir -p _tmp

  local -a srcs=(
    mycpp/gc_stress_test.cc
    mycpp/mark_sweep_heap.cc
    mycpp/gc_builtins.cc
    mycpp/gc_iolib.cc
    mycpp/gc_mylib.cc
    mycpp/gc_str.cc
    mycpp/hash.cc
  )
  c++ -D MARK_SWEEP -I . \
    -O2 -g \
    -o _tmp/gc_stress_test \
    "${srcs[@]}" \
    -lstdc++
}
358
profile-stress-test() {
  # A flat report is enough here.
  profile-cpp 'gc_stress_test' flat _tmp/gc_stress_test
}
363
print-index() {
  ### Emit an HTML index linking each .txt report in $BASE_DIR.
  echo '<body style="margin: 0 auto; width: 40em; font-size: large">'
  echo '<h1>Perf Profiles</h1>'

  local path filename
  for path in "$BASE_DIR"/*.txt; do
    [[ -e "$path" ]] || continue  # glob matched nothing; skip literal pattern
    filename=$(basename "$path")
    # BUG FIX: the original's inner double quotes ended the outer string, so
    # the emitted attribute was unquoted (<a href=NAME>).  printf emits a
    # properly quoted href.
    printf '<a href="%s">%s</a> <br/>\n' "$filename" "$filename"
  done

  echo '</body>'
}
375
376# TODO: fetch the tarball from the cpp-small CI task
377
build-tar() {
  ### Extract and build the release tarball in two variants.
  local tar=${1:-_release/oils-for-unix.tar}

  # Absolute path, since we cd below.
  tar=$PWD/$tar

  local tmp=$BASE_DIR/tar
  mkdir -p "$tmp"

  pushd "$tmp"

  tar --extract < "$tar"
  cd oils-for-unix-*  # glob of 1

  ./configure

  # TODO: add bumproot
  local variant
  for variant in opt+bumpleak opt; do
    echo
    time _build/oils.sh --variant "$variant"
    echo
    "_bin/cxx-$variant-sh/osh" -c 'echo "hi from $0"'
  done

  # TODO:
  # - profile each executable
  # - add OILS_GC_THRESHOLD=$big to avoid GC

  popd
}
409
profile-cpython-configure() {
  ### borrowed from benchmarks/osh-runtime.sh
  local osh=${1:-$REPO_ROOT/_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local dir=$BASE_DIR/cpython-configure

  # May fail because perf wrote the old output as root.
  rm -r -f -v "$dir" || true
  mkdir -p "$dir"

  local -a cmd=( "$osh" "$REPO_ROOT/Python-2.7.13/configure" )

  pushd "$dir"
  profile-cpp 'cpython-configure' "$mode" "${cmd[@]}"
  popd
}
429
cpython-report() {
  ### Report on the cpython-configure profile, relative to oils-for-unix.

  #perf report -i $BASE_DIR/cpython-configure.perf
  #perf report -i $BASE_DIR/cpython-configure.perf --sort=dso

  # Observations:
  # - oils-for-unix is only 4.89% of time?  That's #5: 48% in kernel, 23% in
  #   cc1.  So we're still a bit slow.
  # - GC is 1.6%; say rooting is 3%.  That's 300 ms out of 10s, so GC can
  #   account for the whole thing.  Could we GC while waiting for processes?
  #   They might be tiny processes though.
  #
  # TODO: I want to change OVERALL percentages.

  perf report -i "$BASE_DIR/cpython-configure.perf" \
    -n --dso=oils-for-unix --percentage=relative
}
451
local-test() {
  # Quick local smoke test: build osh and profile fib in flat mode.
  local osh=_bin/cxx-opt/osh
  ninja "$osh"
  profile-fib "$REPO_ROOT/$osh" flat
}
457
soil-run() {
  ### CI entry point.
  echo 'TODO run benchmarks/gc tasks'
  # But we don't have Ninja -- fetch the tarball?
  # Can you WAIT for the tarball?  You can wait for the cpp-small task that
  # builds it?  Ah hacky hacky.

  build-stress-test
  profile-stress-test

  export-osh-cpp _tmp/native-tar-test opt
  #export-osh-cpp '' opt

  profile-fib "$OSH" flat
  profile-osh-parse "$OSH" flat
  profile-parse-help "$OSH" flat

  print-index > "$BASE_DIR/index.html"
  echo "Wrote $BASE_DIR/index.html"
}
481
482"$@"