OILS / benchmarks / perf.sh View on Github | oils.pub

483 lines, 213 significant
1#!/usr/bin/env bash
2#
3# Run the 'perf' tool and associated reports on OSH.
4#
5# Usage:
6# benchmarks/perf.sh <function name>
7#
8# Deps:
9#
10# Clone https://github.com/brendangregg/FlameGraph
11# Put it in ~/git/other/FlameGraph, or edit the paths below
12#
13# Examples:
14#
15# $0 install # install perf, including matching kernel symbols
16#
17# $0 profile-osh-parse # make flame graph
18#
19# Then look at _tmp/perf/osh-parse.svg in the browser
20
21# $0 profile-osh-parse flat # make flat text report
22#
23# perf report -i _tmp/perf/osh-parse.perf # interactive
24#
25# Likewise for
26#
27# $0 profile-example escape
28# => _tmp/perf/example-escape.svg
29# $0 profile-example escape flat
30# => _tmp/perf/example-escape.report.txt
31
32set -o nounset
33set -o pipefail
34set -o errexit
35
36REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
37
38source $REPO_ROOT/test/common.sh # $OSH
39
40# $REPO_ROOT needed since CPython configure changes dirs
41readonly BASE_DIR=$REPO_ROOT/_tmp/perf
42
43# TODO:
44# - kernel symbols. Is that why there are a lot of [unknown] in opt mode?
45# - grep for call_function in collapsed. I don't see it?
46# - it's inlined I guess?
47
48# Question: PyEval_EvalFrameEx doesn't appear recursive in opt mode? At least
49# according to 'perf'. Or maybe you don't have enough samples to catch it?
50
51# NOTES:
52# - dbg vs. opt matters a lot
53# - function-level performance categorization is bad for bytecode interpreters,
54# which have a single function and a big switch statement.
55# - a longer file like configure-coreutils hit garbage collection! collect()
56# - reference counting functions: visit_decref, visit_reachable
57
58install-ubuntu-packages() {
59 # linux-tools-generic is the kernel module
60 # Apparently you need a package specific to the kernel, not sure why.
61 sudo apt-get install \
62 linux-tools-common linux-tools-$(uname -r) linux-tools-generic
63}
64
65install-debian-packages() {
66 sudo apt-get install linux-perf
67}
68
69soil-install() {
70 sudo apt-get update # seem to need this
71
72 install-ubuntu-packages
73}
74
75debug-symbols() {
76 #dpkg --listfiles linux-tools-4.13.0-36-generic
77 #sudo apt install python-dbg
78
79 # I don't see symbols files here? Just the interpreter? They're built into the ELF file?
80 #dpkg --listfiles python-dbg
81
82 # has files in /usr/lib/debug
83 # file /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug
84 # /usr/lib/debug/.build-id/8d/9bd4ce26e45ef16075c67d5f5eeafd8b562832.debug: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, BuildID[sha1]=8d9bd4ce26e45ef16075c67d5f5eeafd8b562832, not stripped
85 #
86 # https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html
87
88 # Does perf also support separate debug files?
89 # How do I set the debug link in oil.ovm? Or should I set build ID?
90
91 # The GNU binary utilities (Binutils) package includes the ‘objcopy’ utility
92 # that can produce the separated executable / debugging information file
93 # pairs using the following commands:
94 # objcopy --only-keep-debug foo foo.debug
95 # strip -g foo
96
97 sudo apt install zlib1g-dbg
98 dpkg --listfiles zlib1g-dbg
99 #libpython2.7-dbg
100}
101
102# TODO: Link these two tools in ../oil_DEPS/bin or something
103# Make them work on CI
104
105# NOTE: I used this before with python-flamegraph too.
106flamegraph() {
107 ~/git/other/FlameGraph/flamegraph.pl "$@"
108}
109
110stackcollapse-perf() {
111 ~/git/other/FlameGraph/stackcollapse-perf.pl "$@"
112}
113
114# http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html
115make-graph() {
116 local name=${1:-osh-parse}
117
118 local dir=$BASE_DIR
119 perf script -i $dir/$name.perf | stackcollapse-perf > $dir/$name.perf-folded
120
121 flamegraph $dir/$name.perf-folded > $dir/$name.svg
122
123 echo "Wrote $dir/$name.svg"
124}
125
126_make-readable() {
127 local perf_raw=$1
128
129 chmod 644 $perf_raw
130
131 # original user
132 chown $USER $perf_raw
133}
134
135make-readable() {
136 # This gets run as root
137 local name=$1
138
139 local perf_raw=$BASE_DIR/$name.perf
140
141 sudo $0 _make-readable $perf_raw
142
143 file $perf_raw
144 ls -l $perf_raw
145}
146
147_record-cpp() {
148 local name=$1 # e.g. oils-for-unix, escape
149 local mode=$2
150 shift 2
151
152 # Can repeat 13 times without blowing heap
153 #export REPEAT=13
154
155 local freq=10000
156
157 local extra_flags=''
158 case $mode in
159 (graph) extra_flags='-g' ;; # needed to make flame graph
160 (flat) extra_flags='' ;;
161 (*) die "Mode should be graph or flat, got $mode" ;;
162 esac
163
164 time perf record $extra_flags -F $freq -o $BASE_DIR/$name.perf -- "$@"
165
166 make-readable $name
167}
168
169profile-cpp() {
170 local name=$1
171 local mode=$2
172 shift
173
174 mkdir -p $BASE_DIR
175
176 # -E preserve environment like BENCHMARK=1
177 sudo -E $REPO_ROOT/$0 _record-cpp $name "$@";
178
179 case $mode in
180 (graph)
181 make-graph $name
182 ;;
183 (flat)
184 local out=$BASE_DIR/$name.report.txt
185 text-report $name | tee $out
186 echo "Wrote $out"
187 ;;
188 (*)
189 die "Mode should be graph or flat, got $mode"
190 ;;
191 esac
192}
193
194profile-osh-parse() {
195 # Profile parsing a big file. More than half the time is in malloc
196 # (_int_malloc in GCC), which is not surprising!
197
198 local osh=${1:-_bin/cxx-opt/osh}
199 local mode=${2:-graph}
200
201 #local file=benchmarks/testdata/configure
202 local file=benchmarks/testdata/configure-coreutils
203
204 local -a cmd=( $osh --ast-format none -n $file )
205 profile-cpp 'osh-parse' $mode "${cmd[@]}"
206
207 # 'perf list' shows the events
208 #OILS_GC_STATS=1 sudo perf stat -e cache-misses -e cache-references "${cmd[@]}"
209 OILS_GC_STATS=1 sudo perf stat "${cmd[@]}"
210
211 # Run again with GC stats
212 time OILS_GC_STATS=1 "${cmd[@]}"
213}
214
215profile-fib() {
216 local osh=${1:-_bin/cxx-opt/osh}
217 local mode=${2:-graph}
218
219 # Same iterations as benchmarks/gc
220 local -a cmd=( $osh benchmarks/compute/fib.sh 500 44 )
221
222 profile-cpp 'fib' $mode "${cmd[@]}"
223}
224
225# Hm this is dominated by GC, not regex?
226profile-parse-help() {
227 local osh=${1:-_bin/cxx-opt/osh}
228 local mode=${2:-flat}
229
230 local -a cmd=( $osh benchmarks/parse-help/pure-excerpt.sh parse_help_file benchmarks/parse-help/clang.txt )
231
232 profile-cpp 'parse-help' $mode "${cmd[@]}"
233}
234
235profile-example() {
236 local example=${1:-escape}
237 local mode=${2:-graph}
238
239 local bin="_bin/cxx-opt/mycpp/examples/$example.mycpp"
240
241 ninja $bin
242 echo
243
244 BENCHMARK=1 profile-cpp "example-$example" $mode $bin
245}
246
247profile-hash-table() {
248 local mode=${1:-graph}
249
250 local bin='_bin/cxx-opt/mycpp/hash_table'
251 ninja $bin
252 profile-cpp 'hash_table' $mode $bin -t hash_speed_test
253}
254
255# Perf note: Without -o, for some reason osh output is shown on the console.
256# It doesn't go to wc?
257#perf record -o perf.data -- _bin/osh -n benchmarks/testdata/abuild | wc -l
258
259text-report() {
260 ### Show a batch report; 'perf report' is interactive
261
262 local name=$1
263 shift
264
265 local perf_raw=$BASE_DIR/$name.perf
266
267 # Flat report
268 perf report -i $perf_raw -n --stdio "$@"
269}
270
271# Shows instruction counts, branch misses, and so forth
272#
273# Wow 11 billion instructions! 9 billion cycles. 2.3 billion branches. Crazy.
274# Only 21M branch misses, or 0.9%. Interesting.
275_stat() {
276 perf stat -- "$@" | wc -l
277 # -e cache-misses only shows that stat
278}
279stat() { sudo $0 _stat "$@"; }
280
281stat-osh-parse() {
282 stat _bin/cxx-opt/oils-for-unix --ast-format none -n benchmarks/testdata/configure
283}
284
285
286#
287# OLD OVM stuff
288#
289
290# Parsing abuild in Debug mode:
291# 14% PyEval_EvalFrameEx -- hm. Interpreter dispatch is a lot? More than I
292# thought. Maybe need my own loop.
293# 8% type_call -- hm introspection?
294# 7% PyObject_GetAttr. My intitution. Should be done at compile time!
295# 6% do_richcmp -- hm interesting
296# 5% PyObject_Malloc.
297
298# More benchmarks:
299# OPy running under OVM, compiling itself, compiling Oil, compiling OPy ports,
300# etc.
301
302# Parsing abuild, the optimized version.
303#
304# 80% PyEval_EvalFramEx. Woah everything is inlined?
305# 12.5% PyObject_GenericGetAtr. PyObject_GetAttr is much lower down.
306# Some kernel.
307# 0.76% lookdict_string is not a bottleneck. Hm.
308#
309# Wow.
310# Maybe I need counters in optimized mode?
311# Yeah what I really want is per opcode total!
312
313_record() {
314
315 # TODO: The optimized build should have symbols! Don't build with -s. And
316 # then put symbols next to the binary somehow? How do the symbols packages
317 # work?
318 #perf record -o perf.data -- _bin/oil.ovm-dbg osh -n benchmarks/testdata/abuild | wc -l
319
320 # call graph recording. This helps it be less "flat" in opt mode. Otherwise
321 # everything is PyEval_EvalFrameEx.
322 local flag='-g'
323 local bin=_bin/oil.ovm-opt
324 #local bin=_bin/oil.ovm-dbg # This shows more details
325
326 local freq=1000 # 1000 Hz
327
328 #local file=benchmarks/testdata/abuild # small file
329
330 local file=benchmarks/testdata/configure-coreutils # big file
331
332 time perf record $flag -F $freq -o perf.data -- $bin osh --ast-format none -n $file
333 #perf record -o perf.data -- _bin/osh --ast-format none -n benchmarks/testdata/abuild
334}
335record() { sudo $0 _record; }
336
337#
338# Soil CI
339#
340
341build-stress-test() {
342
343 # Special _OIL_DEV for -D GC_TIMING
344 _OIL_DEV=1 ./configure --without-readline
345
346 mkdir -p _tmp
347 c++ -D MARK_SWEEP -I . \
348 -O2 -g \
349 -o _tmp/gc_stress_test \
350 mycpp/gc_stress_test.cc \
351 mycpp/mark_sweep_heap.cc \
352 mycpp/gc_builtins.cc \
353 mycpp/gc_iolib.cc \
354 mycpp/gc_mylib.cc \
355 mycpp/gc_str.cc \
356 mycpp/hash.cc \
357 -lstdc++
358}
359
360profile-stress-test() {
361 profile-cpp 'gc_stress_test' flat \
362 _tmp/gc_stress_test
363}
364
365print-index() {
366 echo '<body style="margin: 0 auto; width: 40em; font-size: large">'
367 echo '<h1>Perf Profiles</h1>'
368
369 for path in $BASE_DIR/*.txt; do
370 local filename=$(basename $path)
371 echo "<a href="$filename">$filename</a> <br/>"
372 done
373
374 echo '</body>'
375}
376
377# TODO: fetch the tarball from the cpp-small CI task
378
379build-tar() {
380 local tar=${1:-_release/oils-for-unix.tar}
381
382 tar=$PWD/$tar
383
384 local tmp=$BASE_DIR/tar
385 mkdir -p $tmp
386
387 pushd $tmp
388
389 tar --extract < $tar
390 cd oils-for-unix-* # glob of 1
391
392 ./configure
393
394 # TODO: add bumproot
395 for variant in opt+bumpleak opt; do
396 echo
397
398 time _build/oils.sh --variant $variant
399 echo
400
401 _bin/cxx-$variant-sh/osh -c 'echo "hi from $0"'
402 done
403
404 # TODO:
405 # - profile each executable
406 # - add OILS_GC_THRESHOLD=$big to avoid GC
407
408 popd
409}
410
411profile-cpython-configure() {
412 ### borrowed from benchmarks/osh-runtime.sh
413
414 local osh=${1:-$REPO_ROOT/_bin/cxx-opt/osh}
415 local mode=${2:-flat}
416
417 local dir=$BASE_DIR/cpython-configure
418
419 # Fails because perf has to run as root
420 rm -r -f -v $dir || true
421
422 mkdir -p $dir
423
424 local -a cmd=( $osh $REPO_ROOT/Python-2.7.13/configure )
425
426 pushd $dir
427 profile-cpp 'cpython-configure' $mode "${cmd[@]}"
428 popd
429}
430
431cpython-report() {
432 #perf report -i $BASE_DIR/cpython-configure.perf
433
434 # oils-for-unix is only 4.89% of time? That's #5
435 # 48% in kernel
436 # 23% in cc1
437 #
438 # That means we're still a bit slow
439
440 # TODO: I want to change OVERALL percentages
441 #
442 # GC is 1.6% and let's say rooting is 3%. That's 300 ms out of 10s
443 # GC can account for the whole thing
444 # I wonder if we can do GC while waiting for processes? They might be tiny
445 # processes though
446
447 perf report -i $BASE_DIR/cpython-configure.perf \
448 -n --dso=oils-for-unix --percentage=relative
449
450 #perf report -i $BASE_DIR/cpython-configure.perf --sort=dso
451}
452
453local-test() {
454 local osh=_bin/cxx-opt/osh
455 ninja $osh
456 profile-fib $REPO_ROOT/$osh flat
457}
458
459soil-run() {
460 echo 'TODO run benchmarks/gc tasks'
461 # But we don't have Ninja
462 # Fetch the tarball?
463
464 # Can you WAIT for the tarball?
465 # You can wait for the cpp-small task that builds it? Ah hacky hacky
466
467 build-stress-test
468
469 profile-stress-test
470
471 export-osh-cpp _tmp/native-tar-test opt
472 #export-osh-cpp '' opt
473
474 profile-fib $OSH flat
475 profile-osh-parse $OSH flat
476 profile-parse-help $OSH flat
477
478 print-index > $BASE_DIR/index.html
479
480 echo "Wrote $BASE_DIR/index.html"
481}
482
483"$@"