OILS / metrics / bytecode.R View on Github | oils.pub

388 lines, 204 significant
1#!/usr/bin/env Rscript
2#
3# bytecode.R -- Analyze output of opyc dis-tables.
4#
# Usage:
#   bytecode.R metrics IN_DIR
#   bytecode.R compare CPYTHON_DIR OPY_DIR
#   bytecode.R src-bin-ratio ALL_DEPS_PY PYC_BASE_DIR
7
8library(dplyr)
9library(tidyr) # spread()
10library(stringr)
11
12source('benchmarks/common.R')
13
# Session-wide settings for the report.
options(
  stringsAsFactors = FALSE,
  # Make the report wide.  tibble.width doesn't appear to do this, so the
  # base console width is raised instead.
  width = 200,
  tibble.print_max = Inf
)
19
# Print headline numbers: file count, total bytecode bytes, instruction
# count, and the most repeated (path, code_name) pairs.
#
# Args:
#   ctx: list of tables as built by Load(); reads ctx$frames and ctx$ops.
Basic <- function(ctx) {
  Banner('BASIC METRICS')

  frames <- ctx$frames

  # One row per distinct path, so the row count is the number of files.
  files <- count(frames, path)
  ShowValue('Number of files: %d', nrow(files))

  # Observed: about 216K.
  ShowValue('Total bytecode bytes: %d', sum(frames$bytecode_bytes))

  # Every row of the ops table is one instruction.
  ShowValue('Total instructions: %d', nrow(ctx$ops))

  # Not reliable: code_name alone isn't unique within a file, so distinct
  # frames can collapse together here.  firstlineno is probably needed too.
  pair_counts <- arrange(count(frames, path, code_name), desc(n))
  ShowFrame('Duplicate path/name', head(pair_counts))
}
39
# Report on string constants: print the 20 largest, show the total payload
# size, and plot the cumulative size with strings ordered largest first.
#
# Args:
#   consts: constants table; reads the 'type' and 'len_or_val' columns.
BigStrings <- function(consts) {
  Banner('BIG STRINGS')

  # Only string constants, biggest first.
  by_size <- arrange(filter(consts, type == 'str'), desc(len_or_val))
  print(head(by_size, 20))

  # Observed: 184 KB of strings!  That's just the payload; the header is
  # probably more.
  ShowValue('total string bytes: %d', sum(by_size$len_or_val))

  # Reading the plot (observed run):
  #
  # total bytes is 184 KB
  # - the top 10 strings sum to 20K bytes
  # - the top 100 strings sum to 30K bytes
  running_total <- cumsum(by_size$len_or_val)
  plot(running_total)

  # Alternative view that was tried: plot(ecdf(strs$len_or_val))
}
61
# Show how many constants there are of each type, most common first (at most
# 20 rows).
#
# Args:
#   consts: constants table; reads the 'type' column.
Consts <- function(consts) {
  Banner('CONSTS')

  # Observations from earlier runs:
  # - Strings dominate of course, but there are only 7 or so immutable types!
  # - only 2 float constants.
  # - get rid of the unicode constants in posixpath.
  type_counts <- arrange(count(consts, type), desc(n))
  ShowFrame('Types of constants', head(type_counts, 20))
}
74
# Show the frames that stand out: most constants, most instructions, largest
# stacksize, and most locals.
#
# Args:
#   ctx: list of tables; reads ctx$consts, ctx$ops and ctx$frames.
Frames <- function(ctx) {
  Banner('FRAMES')

  # sort = TRUE puts the largest counts first, so head() takes the top rows.
  consts_per_frame <- count(ctx$consts, path, code_name, sort = TRUE)
  ShowFrame('Frames with many consts', head(consts_per_frame, 20))

  ops_per_frame <- count(ctx$ops, path, code_name, sort = TRUE)
  ShowFrame('Frames with many ops', head(ops_per_frame, 20))

  by_stacksize <- arrange(ctx$frames, desc(stacksize))
  ShowFrame('Frames with large stacksize', head(by_stacksize, 10))

  by_nlocals <- arrange(ctx$frames, desc(nlocals))
  ShowFrame('Frames with many locals', head(by_nlocals, 10))
}
91
# Extract the "kind" of an opcode name: the last run of capital letters,
# ignoring an optional '+N' suffix.  For example, FAST for LOAD_FAST, or
# SLICE for STORE_SLICE+1.
#
# Args:
#   op_name: character vector of opcode names.
OpKind <- function(op_name) {
  # One capture group for the kind; the '+digit' suffix is matched but not
  # captured.
  pattern <- '([A-Z]+)(?:\\+[0-9])?$'

  # str_match() returns a matrix: column 1 is the whole match and column 2 is
  # the first capture group (like $0 and $1 in normal regexes).
  groups <- str_match(op_name, pattern)
  groups[, 2]
}
100
# Report on opcode usage: frequency of each opcode, opcodes that are defined
# but never used, load/store style opcodes grouped by kind, import opcodes,
# and the rows with the largest op_arg.
#
# Args:
#   ops: instruction table; reads the 'op_name' and 'op_arg' columns.
#   ops_defined: path to a headerless file listing every defined opcode name.
#     Pass '' to skip the "unused opcodes" section.
Ops <- function(ops, ops_defined = '_tmp/opcodes-defined.txt') {
  Banner('OPS')

  op_freq <- arrange(count(ops, op_name), desc(n))
  ShowFrame('Ops Used by Frequency', op_freq)

  used <- distinct(ops, op_name)
  ShowValue('Total unique opcodes: %d', nrow(used))

  if (ops_defined != '') {
    defined <- read.table(ops_defined, header = FALSE)
    # Same column name as 'used', so the two frames can be compared row-wise.
    colnames(defined) <- c('op_name')

    ShowFrame('Unused opcodes:', setdiff(defined, used))
  }

  mem_ops <- op_freq %>%
    filter(str_detect(op_name, 'LOAD|STORE|FAST')) %>%
    mutate(kind = OpKind(op_name)) %>%
    arrange(kind) %>%
    select(kind, op_name, n)
  ShowFrame('Memory Operations:', mem_ops)

  # NOTE: got rid of IMPORT_STAR!
  import_counts <- count(filter(ops, str_detect(op_name, 'IMPORT')), op_name)
  ShowFrame('Imports:', import_counts)

  # These are all the big jump targets!  Max observed is 3,852, which is a
  # lot less than 65,536.  We don't need EXTENDED_ARG!
  largest_args <- head(arrange(ops, desc(op_arg)), 10)
  ShowFrame('Large op_arg (jump targets):', largest_args)
}
135
# Show how often each code flag occurs, most common first.
#
# Args:
#   flags: flags table; reads the 'flag' column.
Flags <- function(flags) {
  Banner('FLAGS')

  flag_counts <- arrange(count(flags, flag), desc(n))
  ShowFrame('Common flags', flag_counts)
}
142
# Report on names: the most common kinds, the most common names, and the
# total character length of all names versus unique names.
#
# Args:
#   names: names table; reads the 'kind' and 'name' columns.
Names <- function(names) {
  Banner('NAMES')

  # Common types: free, cell, etc.
  kind_counts <- arrange(count(names, kind), desc(n))
  ShowFrame('Common types', head(kind_counts, 20))

  # Common names seen in earlier runs:
  # self, None, True, False, append, len
  name_counts <- arrange(count(names, name), desc(n))
  ShowFrame('Common names', head(name_counts, 20))

  # Character length of every occurrence, then of each distinct name once.
  every_name <- mutate(names, len = nchar(name))
  distinct_names <- mutate(count(names, name), len = nchar(name))

  ShowValue('Total length of all %d names: %d',
            nrow(every_name), sum(every_name$len))
  ShowValue('Total length of %d unique names: %d',
            nrow(distinct_names), sum(distinct_names$len))
}
163
164# Hm max unique ops is 58
165# _build/oil/bytecode-opy/core/cmd_exec.pyc 54
166# _build/oil/bytecode-opy/warnings.pyc 55
167# _build/oil/bytecode-opy/_abcoll.pyc 58
168#
169# But there are 119 total opcodes. A lot of the math ones are uncommon.
170
171# Written by opy/metrics.sh. Could get rid of that file.
# Count distinct opcodes per file, show the files with the fewest and the
# most, and then look only at files whose path matches reader/lex/parse.
#
# Args:
#   ops: instruction table; reads the 'path' and 'op_name' columns.
UniqueOpsByFile <- function(ops) {
  Banner('UNIQUE OPS')

  # One row for every (path, op_name) pair that occurs.
  pairs <- ops %>% group_by(path) %>% distinct(op_name)

  # Number of distinct opcodes per path, ascending.
  ops_by_file <- pairs %>% count(path) %>% arrange(n)

  ShowFrame('Files with few ops:', head(ops_by_file, 20))
  ShowFrame('Files with many ops:', tail(ops_by_file, 10))

  # Observed counts: 17, 23, 34, 34, 46
  parse_files <- filter(ops_by_file, grepl('reader|lex|parse', path))
  ShowFrame('Unique ops for files that just parse:', parse_files)

  # Distinct opcodes across all of those files taken together.
  parse_ops <- distinct(filter(ops, grepl('reader|lex|parse', path)), op_name)
  ShowValue('Unique opcodes for parsing: %d', nrow(parse_ops))
}
192
# OPy emits 88 distinct opcodes out of 119. Interesting.
# CPython emits 94 distinct opcodes.
# Apart from the four PRINT_* opcodes, STORE_MAP and SETUP_WITH are the only
# differences (see the listing below). Is this for dict literals?
196#
197#
198# setdiff(cpy$ops %>% distinct(op_name), opy$ops %>% distinct(op_name))
199# op_name
200# 1 STORE_MAP
201# 2 SETUP_WITH
202# 3 PRINT_ITEM
203# 4 PRINT_NEWLINE
204# 5 PRINT_ITEM_TO
205# 6 PRINT_NEWLINE_TO
206
207# Unused opcodes:
208# op_name
209# 1 BINARY_TRUE_DIVIDE
210# 2 BUILD_SET
211# 3 BUILD_SLICE
212# 4 CONTINUE_LOOP
213# 5 DELETE_ATTR
214# 6 DELETE_GLOBAL
215# 7 DELETE_SLICE+2
216# 8 DELETE_SLICE+3
217# 9 EXTENDED_ARG
218# 10 INPLACE_DIVIDE
219# 11 INPLACE_FLOOR_DIVIDE
220# 12 INPLACE_LSHIFT
221# 13 INPLACE_MODULO
222# 14 INPLACE_OR
223# 15 INPLACE_POWER
224# 16 INPLACE_TRUE_DIVIDE
225# 17 NOP
226# 18 PRINT_EXPR
227# 19 PRINT_ITEM
228# 20 PRINT_ITEM_TO
229# 21 PRINT_NEWLINE
230# 22 PRINT_NEWLINE_TO
231# 23 ROT_FOUR
232# 24 SETUP_WITH
233# 25 SET_ADD
234# 26 STOP_CODE
235# 27 STORE_MAP
236# 28 STORE_SLICE+2
237# 29 STORE_SLICE+3
238# 30 UNARY_CONVERT
239# 31 UNARY_POSITIVE
240
241
# Run every section of the metrics report, in a fixed order.
#
# Args:
#   ctx: list of tables as built by Load().
Report <- function(ctx) {
  consts <- ctx$consts
  ops <- ctx$ops

  Basic(ctx)
  BigStrings(consts)

  Frames(ctx)
  Names(ctx$names)
  Consts(consts)
  Flags(ctx$flags)

  Ops(ops)
  UniqueOpsByFile(ops)
}
254
# Read the five tables from a directory into a named list.
#
# Args:
#   in_dir: directory containing frames.tsv2, names.tsv2, consts.tsv2,
#     flags.tsv2 and ops.tsv2, each with a header row.
#
# Returns:
#   A list with elements frames, names, consts, flags and ops (in that order),
#   each a data frame.
Load <- function(in_dir) {
  table_names <- c('frames', 'names', 'consts', 'flags', 'ops')

  tables <- lapply(table_names, function(table_name) {
    tsv_path <- file.path(in_dir, paste0(table_name, '.tsv2'))
    read.table(tsv_path, header = TRUE)
  })

  names(tables) <- table_names
  tables
}
264
# Compare the size of each .py file with the size of its .pyc file.
#
# Reads a headerless two-column table of (py_path, pyc_path), looks up both
# sizes with file.info()$size, computes pyc_bytes / py_bytes per file, and
# prints the extremes, the overall totals and a full listing.
#
# Args:
#   all_deps_py: path to the (py_path, pyc_path) table.
#   pyc_base_dir: directory that each pyc_path is relative to.
#
# Returns:
#   The table with py_bytes, pyc_bytes and ratio columns added, sorted by
#   ascending ratio.  Rows whose py_bytes is 0 are left out.
FileSizes <- function(all_deps_py, pyc_base_dir) {
  sizes <- read.table(all_deps_py, header = FALSE)
  colnames(sizes) <- c('py_path', 'pyc_path')

  sizes$py_bytes <- file.info(sizes$py_path)$size

  # pyc paths are relative, so resolve them against the base directory first.
  pyc_full_paths <- file.path(pyc_base_dir, sizes$pyc_path)
  sizes$pyc_bytes <- file.info(pyc_full_paths)$size

  # Empty sources are excluded so the ratio never divides by zero.
  sizes <- sizes %>%
    filter(py_bytes != 0) %>%
    mutate(ratio = pyc_bytes / py_bytes) %>%
    arrange(ratio)

  Banner('RATIO')

  ShowFrame('small .pyc files:', head(sizes, 10))
  ShowFrame('big .pyc files:', tail(sizes, 10))

  # This ratio is a little misleading because it counts comments.
  py_total <- sum(sizes$py_bytes)
  pyc_total <- sum(sizes$pyc_bytes)

  ShowValue('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
  ShowValue('Ratio: %f', pyc_total / py_total)

  Banner('FULL LISTING')

  listing <- arrange(select(sizes, pyc_bytes, pyc_path), desc(pyc_bytes))
  ShowFrame('bytecode', listing)
  ShowValue('total (again): %d', pyc_total)

  sizes
}
303
304
# Row counts of the five tables, in the fixed order frames, names, consts,
# flags, ops.
#
# Args:
#   ctx: list of tables as built by Load().
#
# Returns:
#   An unnamed integer vector with one row count per table.
CompareCol <- function(ctx) {
  tables <- list(ctx$frames, ctx$names, ctx$consts, ctx$flags, ctx$ops)
  unlist(lapply(tables, nrow))
}
313
# Print a side-by-side comparison of the CPython tables and the OPy tables:
# row counts per table, cell variables, closure opcodes, and a few rare
# opcodes.
#
# Args:
#   cpython_ctx: list of tables for CPython, as built by Load().
#   opy_ctx: list of tables for OPy, as built by Load().
Compare <- function(cpython_ctx, opy_ctx) {
  Banner('CPYTHON vs. OPY')

  # Row order here must match the order CompareCol() returns counts in.
  overview <- tibble(
    table_name = c('frames', 'names', 'consts', 'flags', 'ops'),
    cpython = CompareCol(cpython_ctx),
    opy = CompareCol(opy_ctx)
  )
  ShowFrame('Overview', overview)

  Banner('Cell Variables')

  cpython_cells <- filter(cpython_ctx$names, kind == 'cell')
  opy_cells <- filter(opy_ctx$names, kind == 'cell')
  ShowFrame('CPython', cpython_cells)
  ShowFrame('OPy', opy_cells)

  Banner('CLOSURE bytecodes')

  cpython_closures <- filter(cpython_ctx$ops,
                             op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  opy_closures <- filter(opy_ctx$ops,
                         op_name %in% c('LOAD_CLOSURE', 'MAKE_CLOSURE'))
  ShowFrame('CPython', cpython_closures)
  ShowFrame('OPy', opy_closures)

  Banner('Rare bytecodes')

  ShowFrame('DELETE_FAST in CPython',
            filter(cpython_ctx$ops, op_name == 'DELETE_FAST'))
  ShowFrame('DELETE_FAST in OPy',
            filter(opy_ctx$ops, op_name == 'DELETE_FAST'))

  # These are all for the global util.GetResourceLoader().
  ShowFrame('STORE_GLOBAL in OPy',
            filter(opy_ctx$ops, op_name == 'STORE_GLOBAL'))

  # In asdl/unpickle.py.
  ShowFrame('STORE_SLICE+1 in OPy',
            filter(opy_ctx$ops, op_name == 'STORE_SLICE+1'))
}
359
# Command-line dispatcher.
#
# Args:
#   argv: character vector of command-line arguments.  argv[[1]] is the
#     action; what follows depends on the action:
#       metrics        IN_DIR
#       compare        CPYTHON_DIR OPY_DIR
#       src-bin-ratio  ALL_DEPS_PY PYC_BASE_DIR
#
# Logs a message and exits with status 1 when the action is missing or
# unknown, or when the action is given too few arguments.
main <- function(argv) {
  # Number of arguments each action needs after the action name itself.
  args_needed <- c('metrics' = 1L, 'compare' = 2L, 'src-bin-ratio' = 2L)

  # Without this check, argv[[1]] fails with "subscript out of bounds".
  if (length(argv) == 0) {
    Log('Usage: bytecode.R ACTION ARGS...')
    quit(status = 1)
  }

  action <- argv[[1]]

  # Same problem for argv[[2]] / argv[[3]]: report it instead of crashing.
  if (action %in% names(args_needed)) {
    n_given <- length(argv) - 1L
    n_wanted <- args_needed[[action]]
    if (n_given < n_wanted) {
      Log("Action '%s' needs %d argument(s), got %d", action, n_wanted, n_given)
      quit(status = 1)
    }
  }

  if (action == 'metrics') {
    in_dir <- argv[[2]]
    ctx <- Load(in_dir)
    Report(ctx)

  } else if (action == 'compare') {
    cpython_ctx <- Load(argv[[2]])
    opy_ctx <- Load(argv[[3]])
    Compare(cpython_ctx, opy_ctx)

  } else if (action == 'src-bin-ratio') {  # This takes different inputs
    all_deps_py <- argv[[2]]
    pyc_base_dir <- argv[[3]]
    # invisible() keeps the returned table from being auto-printed at top
    # level; FileSizes() has already printed its report.
    invisible(FileSizes(all_deps_py, pyc_base_dir))

  } else {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }
}
383
# Script entry point: call main() only when no function frames are active
# (presumably so that loading this file from inside another call does not
# kick off a report -- confirm against how the file is used elsewhere).
if (length(sys.frames()) == 0L) {
  # To increase the ggplot font size globally, this was once used:
  #theme_set(theme_grey(base_size = 20))
  main(commandArgs(trailingOnly = TRUE))
}