OILS / benchmarks / perf.sh View on Github | oils.pub

482 lines, 213 significant
1#!/usr/bin/env bash
2#
3# Run the 'perf' tool and associated reports on OSH.
4#
5# Usage:
6# benchmarks/perf.sh <function name>
7#
8# Deps:
9#
10# Clone https://github.com/brendangregg/FlameGraph
11# Put it in ~/git/other/FlameGraph, or edit the paths below
12#
13# Examples:
14#
15# $0 install # install perf, including matching kernel symbols
16#
17# $0 profile-osh-parse # make flame graph
18#
19# Then look at _tmp/perf/osh-parse.svg in the browser
20
21# $0 profile-osh-parse flat # make flat text report
22#
23# perf report -i _tmp/perf/osh-parse.perf # interactive
24#
25# Likewise for
26#
27# $0 profile-example escape
28# => _tmp/perf/example-escape.svg
29# $0 profile-example escape flat
30# => _tmp/perf/example-escape.report.txt
31
set -o nounset
set -o pipefail
set -o errexit

# Quote "$0" and the cd target so this works from a path containing spaces.
REPO_ROOT=$(cd "$(dirname "$0")/.."; pwd)

source "$REPO_ROOT/test/common.sh"  # for $OSH and die()

# $REPO_ROOT needed since CPython configure changes dirs
readonly BASE_DIR=$REPO_ROOT/_tmp/perf
42
43# TODO:
44# - kernel symbols. Is that why there are a lot of [unknown] in opt mode?
45# - grep for call_function in collapsed. I don't see it?
46# - it's inlined I guess?
47
48# Question: PyEval_EvalFrameEx doesn't appear recursive in opt mode? At least
49# according to 'perf'. Or maybe you don't have enough samples to catch it?
50
51# NOTES:
52# - dbg vs. opt matters a lot
53# - function-level performance categorization is bad for bytecode interpreters,
54# which have a single function and a big switch statement.
55# - a longer file like configure-coreutils hit garbage collection! collect()
56# - reference counting functions: visit_decref, visit_reachable
57
install-ubuntu-packages() {
  # linux-tools-generic is the kernel module.  A linux-tools package matching
  # the running kernel is apparently also required (not sure why).
  local -a pkgs=(
    linux-tools-common
    "linux-tools-$(uname -r)"
    linux-tools-generic
  )
  sudo apt-get install "${pkgs[@]}"
}
64
install-debian-packages() {
  # On Debian, perf ships in the linux-perf package.
  local pkg='linux-perf'
  sudo apt-get install "$pkg"
}
68
soil-install() {
  # An 'update' seems to be needed before installing.
  sudo apt-get update

  install-ubuntu-packages
}
74
debug-symbols() {
  ### Notes / experiments on getting debug symbols for profiling.

  #dpkg --listfiles linux-tools-4.13.0-36-generic
  #sudo apt install python-dbg
  #dpkg --listfiles python-dbg  # no separate symbol files? built into the ELF?

  # Debug packages put files under /usr/lib/debug, keyed by build ID, e.g.
  #   /usr/lib/debug/.build-id/8d/9bd4ce...debug: ELF 64-bit LSB shared
  #   object, x86-64, ... BuildID[sha1]=8d9bd4ce..., not stripped
  # https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html
  #
  # Open questions:
  # - Does perf also support separate debug files?
  # - How do we set the debug link in oil.ovm?  Or should we set a build ID?
  #
  # Binutils 'objcopy' can produce separated executable / debug-info pairs:
  #   objcopy --only-keep-debug foo foo.debug
  #   strip -g foo

  sudo apt install zlib1g-dbg
  dpkg --listfiles zlib1g-dbg
  #libpython2.7-dbg
}
101
102# TODO: Make these tools work in CI, by turning them into wedges?
103
104# NOTE: I used this before with python-flamegraph too.
# Wrapper for Brendan Gregg's flamegraph.pl.  The FlameGraph checkout
# location can be overridden with $FLAMEGRAPH_DIR; it defaults to the
# historical hard-coded path ~/git/other/FlameGraph.
flamegraph() {
  "${FLAMEGRAPH_DIR:-$HOME/git/other/FlameGraph}/flamegraph.pl" "$@"
}
108
# Wrapper for FlameGraph's stackcollapse-perf.pl filter.  Honors the same
# $FLAMEGRAPH_DIR override as flamegraph(), defaulting to the historical path.
stackcollapse-perf() {
  "${FLAMEGRAPH_DIR:-$HOME/git/other/FlameGraph}/stackcollapse-perf.pl" "$@"
}
112
# http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html
make-graph() {
  ### Turn $BASE_DIR/$name.perf into a flame graph SVG.
  local name=${1:-osh-parse}
  local prefix=$BASE_DIR/$name

  # Collapse perf samples into folded stacks, then render the SVG.
  perf script -i "$prefix.perf" | stackcollapse-perf > "$prefix.perf-folded"
  flamegraph "$prefix.perf-folded" > "$prefix.svg"

  echo "Wrote $prefix.svg"
}
124
_make-readable() {
  ### Runs as root: give the invoking user access to a perf data file.
  local perf_raw=$1

  chmod 644 "$perf_raw"

  # BUG FIX: under sudo, $USER is 'root', so the original 'chown $USER' kept
  # the file owned by root.  $SUDO_USER is the invoking user; fall back to
  # the current user when not running under sudo.
  chown "${SUDO_USER:-$(id -un)}" "$perf_raw"
}
133
make-readable() {
  ### Fix permissions on perf output, which 'perf record' wrote as root.
  local name=$1
  local perf_raw=$BASE_DIR/$name.perf

  # Re-invoke this script as root to do the chmod/chown.
  sudo "$0" _make-readable "$perf_raw"

  file "$perf_raw"
  ls -l "$perf_raw"
}
145
_record-cpp() {
  ### Run 'perf record' on a command (expected to run as root).
  local name=$1  # e.g. oils-for-unix, escape
  local mode=$2
  shift 2

  # Can repeat 13 times without blowing heap
  #export REPEAT=13

  local freq=10000

  local extra_flags
  case $mode in
    graph) extra_flags='-g' ;;  # record call graphs, needed for flame graphs
    flat)  extra_flags='' ;;
    *)     die "Mode should be graph or flat, got $mode" ;;
  esac

  # $extra_flags is intentionally unquoted so an empty value disappears.
  time perf record $extra_flags -F "$freq" -o "$BASE_DIR/$name.perf" -- "$@"

  make-readable "$name"
}
167
profile-cpp() {
  ### Record a profile of a command, then report it per $mode.
  local name=$1
  local mode=$2
  shift  # NOTE: shift by 1 only -- $mode stays in "$@" for _record-cpp

  mkdir -p "$BASE_DIR"

  # sudo -E preserves environment vars like BENCHMARK=1
  sudo -E "$REPO_ROOT/$0" _record-cpp "$name" "$@"

  case $mode in
    graph)
      make-graph "$name"
      ;;
    flat)
      local out=$BASE_DIR/$name.report.txt
      text-report "$name" | tee "$out"
      echo "Wrote $out"
      ;;
    *)
      die "Mode should be graph or flat, got $mode"
      ;;
  esac
}
192
profile-osh-parse() {
  ### Profile parsing a big file.  More than half the time is in malloc
  ### (_int_malloc in GCC), which is not surprising!
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  #local file=benchmarks/testdata/configure
  local file=benchmarks/testdata/configure-coreutils

  local -a cmd=( "$osh" --ast-format none -n "$file" )

  profile-cpp 'osh-parse' "$mode" "${cmd[@]}"

  # 'perf list' shows the events
  #OILS_GC_STATS=1 sudo perf stat -e cache-misses -e cache-references "${cmd[@]}"
  OILS_GC_STATS=1 sudo perf stat "${cmd[@]}"

  # One more run, printing GC stats
  time OILS_GC_STATS=1 "${cmd[@]}"
}
213
profile-fib() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-graph}

  # Same iteration counts as benchmarks/gc
  local -a cmd=( "$osh" benchmarks/compute/fib.sh 500 44 )
  profile-cpp 'fib' "$mode" "${cmd[@]}"
}
223
# Hm this is dominated by GC, not regex?
profile-parse-help() {
  local osh=${1:-_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local -a cmd=(
    "$osh" benchmarks/parse-help/pure-excerpt.sh
    parse_help_file benchmarks/parse-help/clang.txt
  )
  profile-cpp 'parse-help' "$mode" "${cmd[@]}"
}
233
profile-example() {
  ### Build and profile a mycpp translated example.
  local example=${1:-escape}
  local mode=${2:-graph}

  local bin=_bin/cxx-opt/mycpp/examples/$example.mycpp
  ninja "$bin"
  echo

  # BENCHMARK=1 is forwarded through sudo -E by profile-cpp.
  BENCHMARK=1 profile-cpp "example-$example" "$mode" "$bin"
}
245
profile-hash-table() {
  local mode=${1:-graph}
  local bin=_bin/cxx-opt/mycpp/hash_table

  ninja "$bin"
  profile-cpp 'hash_table' "$mode" "$bin" -t hash_speed_test
}
253
254# Perf note: Without -o, for some reason osh output is shown on the console.
255# It doesn't go to wc?
256#perf record -o perf.data -- _bin/osh -n benchmarks/testdata/abuild | wc -l
257
text-report() {
  ### Show a batch report; plain 'perf report' is interactive
  local name=$1
  shift

  # -n adds sample counts; remaining args pass through to perf report.
  perf report -i "$BASE_DIR/$name.perf" -n --stdio "$@"
}
269
270# Shows instruction counts, branch misses, and so forth
271#
272# Wow 11 billion instructions! 9 billion cycles. 2.3 billion branches. Crazy.
273# Only 21M branch misses, or 0.9%. Interesting.
_stat() {
  # -e cache-misses would show only that stat
  perf stat -- "$@" | wc -l
}

stat() {
  sudo "$0" _stat "$@"
}
279
stat-osh-parse() {
  local bin=_bin/cxx-opt/oils-for-unix
  stat "$bin" --ast-format none -n benchmarks/testdata/configure
}
283
284
285#
286# OLD OVM stuff
287#
288
289# Parsing abuild in Debug mode:
290# 14% PyEval_EvalFrameEx -- hm. Interpreter dispatch is a lot? More than I
291# thought. Maybe need my own loop.
292# 8% type_call -- hm introspection?
293# 7% PyObject_GetAttr. My intitution. Should be done at compile time!
294# 6% do_richcmp -- hm interesting
295# 5% PyObject_Malloc.
296
297# More benchmarks:
298# OPy running under OVM, compiling itself, compiling Oils, compiling OPy ports,
299# etc.
300
301# Parsing abuild, the optimized version.
302#
303# 80% PyEval_EvalFramEx. Woah everything is inlined?
304# 12.5% PyObject_GenericGetAtr. PyObject_GetAttr is much lower down.
305# Some kernel.
306# 0.76% lookdict_string is not a bottleneck. Hm.
307#
308# Wow.
309# Maybe I need counters in optimized mode?
310# Yeah what I really want is per opcode total!
311
_record() {
  ### Old OVM profiling entry point (run as root via record()).

  # TODO: The optimized build should have symbols!  Don't build with -s.  And
  # then put symbols next to the binary somehow?  How do the symbols packages
  # work?
  #perf record -o perf.data -- _bin/oil.ovm-dbg osh -n benchmarks/testdata/abuild | wc -l

  # -g records call graphs, which makes opt-mode output less "flat" --
  # otherwise everything shows up as PyEval_EvalFrameEx.
  local flag='-g'
  local bin=_bin/oil.ovm-opt
  #local bin=_bin/oil.ovm-dbg  # This shows more details

  local freq=1000  # Hz

  #local file=benchmarks/testdata/abuild  # small file
  local file=benchmarks/testdata/configure-coreutils  # big file

  time perf record $flag -F "$freq" -o perf.data -- "$bin" osh --ast-format none -n "$file"
  #perf record -o perf.data -- _bin/osh --ast-format none -n benchmarks/testdata/abuild
}

record() {
  sudo "$0" _record
}
335
336#
337# Soil CI
338#
339
build-stress-test() {
  ### Compile the GC stress test directly with c++ (no Ninja needed).

  # Special _OIL_DEV for -D GC_TIMING
  _OIL_DEV=1 ./configure --without-readline

  mkdir -p _tmp

  local -a srcs=(
    mycpp/gc_stress_test.cc
    mycpp/mark_sweep_heap.cc
    mycpp/gc_builtins.cc
    mycpp/gc_iolib.cc
    mycpp/gc_mylib.cc
    mycpp/gc_str.cc
    mycpp/hash.cc
  )
  c++ -D MARK_SWEEP -I . \
    -O2 -g \
    -o _tmp/gc_stress_test \
    "${srcs[@]}" \
    -lstdc++
}
358
profile-stress-test() {
  # A flat report is enough here.
  profile-cpp 'gc_stress_test' flat _tmp/gc_stress_test
}
363
print-index() {
  ### Emit an HTML index linking each .txt report in $BASE_DIR.
  echo '<body style="margin: 0 auto; width: 40em; font-size: large">'
  echo '<h1>Perf Profiles</h1>'

  local path filename
  for path in "$BASE_DIR"/*.txt; do
    [[ -e "$path" ]] || continue  # glob matched nothing; skip literal pattern
    filename=$(basename "$path")
    # BUG FIX: the original's inner double quotes ended the outer string, so
    # the emitted attribute was unquoted (<a href=NAME>).  printf emits a
    # properly quoted href.
    printf '<a href="%s">%s</a> <br/>\n' "$filename" "$filename"
  done

  echo '</body>'
}
375
376# TODO: fetch the tarball from the cpp-small CI task
377
build-tar() {
  ### Extract and build the release tarball in two variants.
  local tar=${1:-_release/oils-for-unix.tar}

  # Absolute path, since we cd below.
  tar=$PWD/$tar

  local tmp=$BASE_DIR/tar
  mkdir -p "$tmp"

  pushd "$tmp"

  tar --extract < "$tar"
  cd oils-for-unix-*  # glob of 1

  ./configure

  # TODO: add bumproot
  local variant
  for variant in opt+bumpleak opt; do
    echo
    time _build/oils.sh --variant "$variant"
    echo
    "_bin/cxx-$variant-sh/osh" -c 'echo "hi from $0"'
  done

  # TODO:
  # - profile each executable
  # - add OILS_GC_THRESHOLD=$big to avoid GC

  popd
}
409
profile-cpython-configure() {
  ### borrowed from benchmarks/osh-runtime.sh
  local osh=${1:-$REPO_ROOT/_bin/cxx-opt/osh}
  local mode=${2:-flat}

  local dir=$BASE_DIR/cpython-configure

  # May fail because perf wrote the old output as root.
  rm -r -f -v "$dir" || true
  mkdir -p "$dir"

  local -a cmd=( "$osh" "$REPO_ROOT/Python-2.7.13/configure" )

  pushd "$dir"
  profile-cpp 'cpython-configure' "$mode" "${cmd[@]}"
  popd
}
429
cpython-report() {
  ### Report on the cpython-configure profile, relative to oils-for-unix.

  #perf report -i $BASE_DIR/cpython-configure.perf
  #perf report -i $BASE_DIR/cpython-configure.perf --sort=dso

  # Observations:
  # - oils-for-unix is only 4.89% of time?  That's #5: 48% in kernel, 23% in
  #   cc1.  So we're still a bit slow.
  # - GC is 1.6%; say rooting is 3%.  That's 300 ms out of 10s, so GC can
  #   account for the whole thing.  Could we GC while waiting for processes?
  #   They might be tiny processes though.
  #
  # TODO: I want to change OVERALL percentages.

  perf report -i "$BASE_DIR/cpython-configure.perf" \
    -n --dso=oils-for-unix --percentage=relative
}
451
local-test() {
  # Quick local smoke test: build osh and profile fib in flat mode.
  local osh=_bin/cxx-opt/osh
  ninja "$osh"
  profile-fib "$REPO_ROOT/$osh" flat
}
457
soil-run() {
  ### CI entry point.
  echo 'TODO run benchmarks/gc tasks'
  # But we don't have Ninja -- fetch the tarball?
  # Can you WAIT for the tarball?  You can wait for the cpp-small task that
  # builds it?  Ah hacky hacky.

  build-stress-test
  profile-stress-test

  export-osh-cpp _tmp/native-tar-test opt
  #export-osh-cpp '' opt

  profile-fib "$OSH" flat
  profile-osh-parse "$OSH" flat
  profile-parse-help "$OSH" flat

  print-index > "$BASE_DIR/index.html"
  echo "Wrote $BASE_DIR/index.html"
}
481
482"$@"