#!/usr/bin/python
# Copyright (C) 2015-2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
"""Compare two benchmark results

Given two benchmark result files and a threshold, this script compares the
benchmark results and flags differences in performance beyond a given
threshold.
"""
import sys
import os
import pylab
import import_bench as bench


def _percent_change(before, after):
    """Return the absolute change from before to after, in percent.

    A zero baseline has no meaningful relative change, so it yields 0.0
    when nothing changed and infinity otherwise.  That keeps a real change
    visible to any threshold instead of raising ZeroDivisionError.

    Args:
        before: The baseline value
        after: The value to compare against the baseline
    """
    if before == 0:
        return 0.0 if after == 0 else float('inf')
    # 100.0 forces true division even for integer inputs on Python 2.
    return abs(after - before) * 100.0 / before


def do_compare(func, var, tl1, tl2, par, threshold):
    """Compare one of the aggregate measurements

    Helper function to compare one of the aggregate measurements of a
    function variant.  A line is printed only when the difference exceeds
    the threshold; it is prefixed with '+++' when the first value is the
    larger one and with '---' otherwise.

    Args:
        func: Function name
        var: Function variant name
        tl1: The first timings list
        tl2: The second timings list
        par: The aggregate to measure
        threshold: The threshold for differences, beyond which the script
                   should print a warning.
    """
    before = tl1[par]
    after = tl2[par]
    d = _percent_change(before, after)
    if d > threshold:
        if before > after:
            ind = '+++'
        else:
            ind = '---'
        print('%s %s(%s)[%s]: (%.2lf%%) from %g to %g' %
              (ind, func, var, par, d, before, after))


def compare_runs(pts1, pts2, threshold):
    """Compare two benchmark runs

    Args:
        pts1: Timing data from first machine
        pts2: Timing data from second machine
        threshold: The percentage difference beyond which a change is
                   reported.
    """
    # XXX We assume that the two benchmarks have identical functions and
    # variants.  We cannot compare two benchmarks that may have different
    # functions or variants.  Maybe that is something for the future.
    for func in pts1['functions']:
        for var in pts1['functions'][func]:
            tl1 = pts1['functions'][func][var]
            tl2 = pts2['functions'][func][var]

            # Compare the consolidated numbers
            do_compare(func, var, tl1, tl2, 'max', threshold)
            do_compare(func, var, tl1, tl2, 'min', threshold)
            do_compare(func, var, tl1, tl2, 'mean', threshold)

            # Skip over to the next variant or function if there is no
            # detailed timing info for the function variant.  This must not
            # return, or the remaining functions would never be compared.
            if 'timings' not in tl1 or 'timings' not in tl2:
                continue

            # If two lists do not have the same length then it is likely that
            # the performance characteristics of the function have changed.
            # XXX: It is also likely that there was some measurement that
            # strayed outside the usual range.  Such outliers should not
            # happen on an idle machine with identical hardware and
            # configuration, but ideal environments are hard to come by.
            if len(tl1['timings']) != len(tl2['timings']):
                print('* %s(%s): Timing characteristics changed' %
                      (func, var))
                print('\tBefore: [%s]' %
                      ', '.join([str(x) for x in tl1['timings']]))
                print('\tAfter: [%s]' %
                      ', '.join([str(x) for x in tl2['timings']]))
                continue

            # Report every pair of detailed timings whose difference crosses
            # the threshold we have set.
            for t1, t2 in zip(tl1['timings'], tl2['timings']):
                d = _percent_change(t1, t2)
                if d > threshold:
                    if t2 > t1:
                        ind = '-'
                    else:
                        ind = '+'
                    print("%s %s(%s): (%.2lf%%) from %g to %g" %
                          (ind, func, var, d, t1, t2))


def _scatter_timings(timings, color):
    """Scatter one run's timings against their index on the current figure.

    Args:
        timings: The list of detailed timings to plot
        color: The color to apply to the plotted points
    """
    length = len(timings)
    if length == 0:
        # Nothing to draw, and the marker size below would divide by zero.
        return
    X = [float(x) for x in range(length)]
    # The third positional argument is the marker size: the fewer points
    # there are, the larger each one is drawn.
    lines = pylab.scatter(X, timings, 1.5 + 100 / length)
    pylab.setp(lines, 'color', color)


def plot_graphs(bench1, bench2):
    """Plot graphs for functions

    Make scatter plots for the functions and their variants.  The first
    run is drawn in red and the second in green, and each plot is saved as
    FUNC-VAR.png (or FUNC.png when the variant name is empty).

    Args:
        bench1: Set of points from the first machine
        bench2: Set of points from the second machine.
    """
    for func in bench1['functions']:
        for var in bench1['functions'][func]:
            tl1 = bench1['functions'][func][var]
            tl2 = bench2['functions'][func][var]

            # No point trying to print a graph if there are no detailed
            # timings in either of the runs.
            if 'timings' not in tl1 or 'timings' not in tl2:
                print('Skipping graph for %s(%s)' % (func, var))
                continue

            pylab.clf()
            pylab.ylabel('Time (cycles)')

            # First set of points
            _scatter_timings(tl1['timings'], 'r')

            # Second set of points
            _scatter_timings(tl2['timings'], 'g')

            if var:
                filename = "%s-%s.png" % (func, var)
            else:
                filename = "%s.png" % func
            print('Writing out %s' % filename)
            pylab.savefig(filename)


def main(args):
    """Program Entry Point

    Take two benchmark output files and compare their timings.

    Args:
        args: Command line arguments without the program name: args[0] is
              passed to bench.parse_bench as the second argument for both
              files (the schema), args[1] and args[2] are the two
              benchmark output files and the optional args[3] is the
              threshold percentage, which defaults to 10.0.
    """
    if len(args) > 4 or len(args) < 3:
        print('Usage: %s <schema> <file1> <file2> [threshold in %%]' %
              sys.argv[0])
        sys.exit(os.EX_USAGE)

    bench1 = bench.parse_bench(args[1], args[0])
    bench2 = bench.parse_bench(args[2], args[0])
    if len(args) == 4:
        threshold = float(args[3])
    else:
        threshold = 10.0

    if bench1['timing_type'] != bench2['timing_type']:
        print('Cannot compare benchmark outputs: timing types are different')
        return

    # Plot from the full timing data before compress_timings is applied
    # to it.
    plot_graphs(bench1, bench2)

    bench.compress_timings(bench1)
    bench.compress_timings(bench2)

    compare_runs(bench1, bench2, threshold)


if __name__ == '__main__':
    main(sys.argv[1:])