ENH: Add analysis of single results #1460

Closed · wants to merge 2 commits

296 changes: 296 additions & 0 deletions comparer_bench.py
@@ -0,0 +1,296 @@
#!/usr/bin/env python3
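"""Compare two standalone asv result files and print a GitHub-style table.

Example invocation (paths are illustrative only):

    python comparer_bench.py results/my-machine/commit1.json \
        results/my-machine/commit2.json results/benchmarks.json

``bconf`` may be any existing file whose parent directory is the asv results
directory (``benchmarks.json`` is a convenient choice), since only
``Path(bconf).parent`` is used.
"""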

from pathlib import Path
import math

import click
import tabulate
from asv_runner.console import color_print
from asv_runner.statistics import get_err

import asv
from asv import results
from asv.console import log
from asv.util import human_value
from asv.commands.compare import _is_result_better, unroll_result, _isna


@click.command()
@click.argument("b1", type=click.Path(exists=True), required=True)
@click.argument("b2", type=click.Path(exists=True), required=True)
@click.argument("bconf", type=click.Path(exists=True), required=True)
def comparer(b1, b2, bconf):
click.echo(f"Hello, comparing {b1} and {b2}!")
do_compare(b1, b2, bconf)


def result_iter(bdot):
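    """Yield one tuple per stored result key.

    Each tuple is (key, params, value, stats, samples, benchmark version,
    machine name, environment name), as read from an asv Results object.
    """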
for key in bdot.get_all_result_keys():
params = bdot.get_result_params(key)
result_value = bdot.get_result_value(key, params)
result_stats = bdot.get_result_stats(key, params)
result_samples = bdot.get_result_samples(key, params)
result_version = bdot.benchmark_version.get(key)
yield (
key,
params,
result_value,
result_stats,
result_samples,
result_version,
bdot.params["machine"],
bdot.env_name,
)


def do_compare(
b1,
b2,
bconf,
factor=1.1,
split=False,
only_changed=False,
sort="default",
machine=None,
env_spec=None,
use_stats=True,
):
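    """Compare two result files, adapted from asv's ``compare`` command.

    Unlike ``asv compare``, this operates on two standalone result JSON files
    (``b1``, ``b2``) rather than two commit hashes; ``bconf`` only supplies
    the results directory via its parent.
    """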
    # Paths were already validated by click's ``exists=True`` checks.
res_1 = results.Results.load(b1)
res_2 = results.Results.load(b2)
conf_asv = asv.config.Config()
conf_asv.results_dir = Path(bconf).parent
benchmarks = asv.benchmarks.Benchmarks.load(conf_asv)
    # The remainder is adapted from asv/commands/compare.py.

results_1 = {}
results_2 = {}
ss_1 = {}
ss_2 = {}
versions_1 = {}
versions_2 = {}
units = {}

machine_env_names = set()
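    # The two loops below unroll every (possibly parameterized) result from
    # each file into flat {unrolled name: value} dicts, plus per-name
    # (stats, samples) pairs and benchmark versions.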

for key, params, value, stats, samples, version, machine, env_name in result_iter(
res_1
):
machine_env_name = f"{machine}/{env_name}"
machine_env_names.add(machine_env_name)
for name, value, stats, samples in unroll_result(
key, params, value, stats, samples
):
units[name] = benchmarks.get(key, {}).get("unit")
results_1[name] = value
ss_1[name] = (stats, samples)
versions_1[name] = version

for key, params, value, stats, samples, version, machine, env_name in result_iter(
res_2
):
machine_env_name = f"{machine}/{env_name}"
machine_env_names.add(machine_env_name)
for name, value, stats, samples in unroll_result(
key, params, value, stats, samples
):
units[name] = benchmarks.get(key, {}).get("unit")
results_2[name] = value
ss_2[name] = (stats, samples)
versions_2[name] = version

benchmarks_1 = set(results_1.keys())
benchmarks_2 = set(results_2.keys())
joint_benchmarks = sorted(list(benchmarks_1 | benchmarks_2))
bench = {}

if split:
bench["green"] = []
bench["red"] = []
bench["lightgrey"] = []
bench["default"] = []
else:
bench["all"] = []

worsened = False
improved = False
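    # Classify each benchmark: "-" improved, "+" worsened, "!" newly failing,
    # "x" not comparable (benchmark version changed), " " otherwise
    # (including fixed failures and skipped results).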

for benchmark in joint_benchmarks:
if benchmark in results_1:
time_1 = results_1[benchmark]
else:
time_1 = math.nan

if benchmark in results_2:
time_2 = results_2[benchmark]
else:
time_2 = math.nan

if benchmark in ss_1 and ss_1[benchmark][0]:
err_1 = get_err(time_1, ss_1[benchmark][0])
else:
err_1 = None

if benchmark in ss_2 and ss_2[benchmark][0]:
err_2 = get_err(time_2, ss_2[benchmark][0])
else:
err_2 = None

version_1 = versions_1.get(benchmark)
version_2 = versions_2.get(benchmark)

if _isna(time_1) or _isna(time_2):
ratio = "n/a"
ratio_num = 1e9
else:
try:
ratio_num = time_2 / time_1
ratio = f"{ratio_num:6.2f}"
except ZeroDivisionError:
ratio_num = 1e9
ratio = "n/a"

if version_1 is not None and version_2 is not None and version_1 != version_2:
# not comparable
color = "lightgrey"
mark = "x"
elif time_1 is not None and time_2 is None:
# introduced a failure
color = "red"
mark = "!"
worsened = True
elif time_1 is None and time_2 is not None:
# fixed a failure
color = "green"
mark = " "
improved = True
elif time_1 is None and time_2 is None:
# both failed
color = "default"
mark = " "
elif _isna(time_1) or _isna(time_2):
# either one was skipped
color = "default"
mark = " "
elif _is_result_better(
time_2,
time_1,
ss_2.get(benchmark),
ss_1.get(benchmark),
factor,
use_stats=use_stats,
):
color = "green"
mark = "-"
improved = True
elif _is_result_better(
time_1,
time_2,
ss_1.get(benchmark),
ss_2.get(benchmark),
factor,
use_stats=use_stats,
):
color = "red"
mark = "+"
worsened = True
else:
color = "default"
mark = " "

        # Mark changes that exceed ``factor`` but are not statistically
        # significant; as in asv's compare command, only rows whose mark is
        # still " " are eligible for the "~" prefix.
        if mark == " " and (
            _is_result_better(time_1, time_2, None, None, factor)
            or _is_result_better(time_2, time_1, None, None, factor)
        ):
            ratio = "~" + ratio.strip()

if only_changed and mark in (" ", "x"):
continue

unit = units[benchmark]

details = "{0:1s} {1:>15s} {2:>15s} {3:>8s} ".format(
mark,
human_value(time_1, unit, err=err_1),
human_value(time_2, unit, err=err_2),
ratio,
)
split_line = details.split()
        # Results in this script are keyed by the unrolled benchmark name
        # alone (compare.py keys them by (name, machine/env) tuples), so the
        # name is used directly.
        benchmark_name = benchmark
if len(split_line) == 4:
split_line += [benchmark_name]
else:
split_line = [" "] + split_line + [benchmark_name]
if split:
bench[color].append(split_line)
else:
bench["all"].append(split_line)

if split:
keys = ["green", "default", "red", "lightgrey"]
else:
keys = ["all"]

titles = {}
titles["green"] = "Benchmarks that have improved:"
titles["default"] = "Benchmarks that have stayed the same:"
titles["red"] = "Benchmarks that have got worse:"
titles["lightgrey"] = "Benchmarks that are not comparable:"
titles["all"] = "All benchmarks:"

log.flush()

for key in keys:

if len(bench[key]) == 0:
continue

if not only_changed:
color_print("")
color_print(titles[key])
color_print("")

        # Standalone result files carry no commit-name mapping (compare.py
        # would use commit_names.get(hash_1)), so the header suffix stays empty.
        name_1 = False
if name_1:
name_1 = f"<{name_1}>"
else:
name_1 = ""

name_2 = False # commit_names.get(hash_2)
if name_2:
name_2 = f"<{name_2}>"
else:
name_2 = ""

if sort == "default":
pass
elif sort == "ratio":
bench[key].sort(key=lambda v: v[3], reverse=True)
elif sort == "name":
bench[key].sort(key=lambda v: v[2])
else:
raise ValueError("Unknown 'sort'")

print(
tabulate.tabulate(
bench[key],
headers=[
"Change",
f"Before {name_1}",
f"After {name_2}",
"Ratio",
"Benchmark (Parameter)",
],
tablefmt="github",
)
)


if __name__ == "__main__":
comparer()
27 changes: 13 additions & 14 deletions docs/source/using.rst
@@ -590,19 +590,18 @@
revisions of the project. You can do so with the ``compare`` command::
$ asv compare v0.1 v0.2
All benchmarks:

before after ratio
[3bfda9c6] [bf719488]
<v0.1> <v0.2>
40.4m 40.4m 1.00 benchmarks.MemSuite.mem_list [amulet.localdomain/virtualenv-py3.7-numpy]
failed 35.2m n/a benchmarks.MemSuite.mem_list [amulet.localdomain/virtualenv-py3.12-numpy]
11.5±0.08μs 11.0±0μs 0.96 benchmarks.TimeSuite.time_iterkeys [amulet.localdomain/virtualenv-py3.7-numpy]
failed failed n/a benchmarks.TimeSuite.time_iterkeys [amulet.localdomain/virtualenv-py3.12-numpy]
11.5±1μs 11.2±0.02μs 0.97 benchmarks.TimeSuite.time_keys [amulet.localdomain/virtualenv-py3.7-numpy]
failed 8.40±0.02μs n/a benchmarks.TimeSuite.time_keys [amulet.localdomain/virtualenv-py3.12-numpy]
34.6±0.09μs 32.9±0.01μs 0.95 benchmarks.TimeSuite.time_range [amulet.localdomain/virtualenv-py3.7-numpy]
failed 35.6±0.05μs n/a benchmarks.TimeSuite.time_range [amulet.localdomain/virtualenv-py3.12-numpy]
31.6±0.1μs 30.2±0.02μs 0.95 benchmarks.TimeSuite.time_xrange [amulet.localdomain/virtualenv-py3.7-numpy]
failed failed n/a benchmarks.TimeSuite.time_xrange [amulet.localdomain/virtualenv-py3.12-numpy]
| Change | Before [3bfda9c6] | After [bf719488] | Ratio | Benchmark (Parameter) |
|--------|-------------------|------------------|-------|-----------------------|
| | 40.4m | 40.4m | 1.00 | benchmarks.MemSuite.mem_list [amulet.localdomain/virtualenv-py3.7-numpy] |
| | failed | 35.2m | n/a | benchmarks.MemSuite.mem_list [amulet.localdomain/virtualenv-py3.12-numpy] |
| | 11.5±0.08μs | 11.0±0μs | 0.96 | benchmarks.TimeSuite.time_iterkeys [amulet.localdomain/virtualenv-py3.7-numpy] |
| | failed | failed | n/a | benchmarks.TimeSuite.time_iterkeys [amulet.localdomain/virtualenv-py3.12-numpy] |
| | 11.5±1μs | 11.2±0.02μs | ~ | benchmarks.TimeSuite.time_keys [amulet.localdomain/virtualenv-py3.7-numpy] |
| - | failed | 8.40±0.02μs | n/a | benchmarks.TimeSuite.time_keys [amulet.localdomain/virtualenv-py3.12-numpy] |
| - | 34.6±0.09μs | 32.9±0.01μs | 0.95 | benchmarks.TimeSuite.time_range [amulet.localdomain/virtualenv-py3.7-numpy] |
| - | failed | 35.6±0.05μs | n/a | benchmarks.TimeSuite.time_range [amulet.localdomain/virtualenv-py3.12-numpy] |
| - | 31.6±0.1μs | 30.2±0.02μs | 0.95 | benchmarks.TimeSuite.time_xrange [amulet.localdomain/virtualenv-py3.7-numpy] |
| | failed | failed | n/a | benchmarks.TimeSuite.time_xrange [amulet.localdomain/virtualenv-py3.12-numpy] |

This will show the times for each benchmark for the first and second
revision, and the ratio of the second to the first. In addition, the
@@ -630,7 +629,7 @@
each of its symbolic states can be understood as:
- Before
* - ``x``
- Light Gray
- Not comparable
- Not comparable since :ref:`benchmark versions <benchmark-versioning>` differ
-
-
* - ``!``
1 change: 1 addition & 0 deletions docs/source/writing_benchmarks.rst
@@ -567,6 +567,7 @@
garbage collector at a given state::

For details, see :doc:`benchmarks`.

.. _benchmark-versioning:

Benchmark versioning
--------------------
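
A minimal sketch (assuming the ``version`` attribute described in this
section) of pinning a benchmark's version explicitly, so that results
recorded under a different version are reported as not comparable (the
``x`` mark in ``asv compare`` output)::

    class TimeSuite:
        # Results recorded under a different version string are never
        # compared against these ones.
        version = "2"

        def setup(self):
            self.d = dict.fromkeys(range(500))

        def time_keys(self):
            for key in self.d.keys():
                pass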