diff --git a/tests/framework/ab_test.py b/tests/framework/ab_test.py
index 2ef3e2350a7..65773c6f693 100644
--- a/tests/framework/ab_test.py
+++ b/tests/framework/ab_test.py
@@ -22,9 +22,9 @@ does not block PRs). If not, it fails, preventing PRs from introducing new
 vulnerable dependencies.
 """
 import statistics
-from pathlib import Path
+from pathlib import Path, PosixPath
 from tempfile import TemporaryDirectory
-from typing import Callable, List, Optional, TypeVar
+from typing import Callable, Generator, List, Optional, Tuple, TypeVar
 
 import scipy
 
@@ -98,6 +98,32 @@ def git_ab_test(
     return result_a, result_b, comparison
 
 
+def git_clone_ab_dirs(
+    a_revision: str = DEFAULT_A_REVISION,
+    b_revision: Optional[str] = None,
+) -> Generator[Tuple[PosixPath, PosixPath], None, None]:
+    """
+    Prepare cloned Git repositories for running A/B tests.
+
+    :param a_revision: The revision to check out for the "A" part of the test. Defaults to the pull request target branch
+        if run in CI, and "main" otherwise.
+    :param b_revision: The git revision to check out for the "B" part of the test. Defaults to whatever is currently checked
+        out (in which case no clone is made for it).
+    """
+
+    with TemporaryDirectory() as tmp_dir:
+        dir_a = git_clone(Path(tmp_dir) / a_revision, a_revision)
+
+        if b_revision:
+            dir_b = git_clone(Path(tmp_dir) / b_revision, b_revision)
+        else:
+            # By default, pytest execution happens inside the `tests` subdirectory. Pass the repository root, as
+            # documented.
+            dir_b = Path.cwd().parent
+
+        yield (dir_a, dir_b)
+
+
 DEFAULT_A_DIRECTORY = FC_WORKSPACE_DIR / "build" / "main"
 DEFAULT_B_DIRECTORY = FC_WORKSPACE_DIR / "build" / "cargo_target" / DEFAULT_TARGET_DIR
diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 6e6541a688d..81afd37a0fb 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -2,93 +2,135 @@
 # SPDX-License-Identifier: Apache-2.0
 """Optional benchmarks-do-not-regress test"""
 import contextlib
-import json
 import logging
 import platform
+import re
 import shutil
 from pathlib import Path
+from typing import Callable, List
 
 import pytest
 
 from framework import utils
-from framework.ab_test import git_ab_test
+from framework.ab_test import binary_ab_test, git_clone_ab_dirs
 from host_tools.cargo_build import cargo
 
 LOGGER = logging.getLogger(__name__)
 
+git_clone_ab_dirs_one_time = pytest.fixture(git_clone_ab_dirs, scope="class")
+
 
-@pytest.mark.no_block_pr
-@pytest.mark.timeout(900)
-def test_no_regression_relative_to_target_branch():
+def get_benchmark_names() -> List[str]:
     """
-    Run the microbenchmarks in this repository, comparing results from pull
-    request target branch against what's achieved on HEAD
+    Get a list of benchmark test names
     """
-    git_ab_test(run_criterion, compare_results)
+    _, stdout, _ = cargo(
+        "bench",
+        f"--workspace --quiet --target {platform.machine()}-unknown-linux-musl",
+        "--list",
+    )
+
+    # Format a string like `page_fault #2: benchmark` to a string like `page_fault`.
+    benchmark_names = [
+        re.sub(r"\s#([0-9]*)", "", i.split(":")[0])
+        for i in stdout.split("\n")
+        if i.endswith(": benchmark")
+    ]
 
-def run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
+    return list(set(benchmark_names))
+
+
+class TestBenchMarks:
     """
-    Executes all benchmarks by running "cargo bench --no-run", finding the executables, and running them pinned to some CPU
+    This class is used to prevent the fixture from being executed for each
+    parameter of the parametrized test.
     """
-    baseline_name = "a_baseline" if is_a else "b_baseline"
 
-    with contextlib.chdir(firecracker_checkout):
-        # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
-        # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
-        # extract the path to the compiled benchmark binary.
-        _, stdout, _ = cargo(
-            "bench",
-            f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
+    @pytest.mark.no_block_pr
+    @pytest.mark.parametrize("benchname", get_benchmark_names())
+    def test_no_regression_relative_to_target_branch(
+        self, benchname, git_clone_ab_dirs_one_time
+    ):
+        """
+        Run the microbenchmarks in this repository, comparing results from pull
+        request target branch against what's achieved on HEAD
+        """
+
+        dir_a = git_clone_ab_dirs_one_time[0]
+        dir_b = git_clone_ab_dirs_one_time[1]
+        run_criterion = get_run_criterion(benchname)
+        compare_results = get_compare_results(benchname)
+
+        binary_ab_test(
+            test_runner=run_criterion,
+            comparator=compare_results,
+            a_directory=dir_a,
+            b_directory=dir_b,
         )
 
-        executables = []
-        for line in stdout.split("\n"):
-            if line:
-                msg = json.loads(line)
-                executable = msg.get("executable")
-                if executable:
-                    executables.append(executable)
 
-        for executable in executables:
+def get_run_criterion(benchmark_name) -> Callable[[Path, bool], Path]:
+    """
+    Get a function that executes the specified benchmark, pinned to some CPU
+    """
+
+    def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
+        baseline_name = "a_baseline" if is_a else "b_baseline"
+
+        with contextlib.chdir(firecracker_checkout):
             utils.check_output(
-                f"CARGO_TARGET_DIR=build/cargo_target taskset -c 1 {executable} --bench --save-baseline {baseline_name}"
+                f"taskset -c 1 cargo bench --workspace --quiet -- {benchmark_name} --exact --save-baseline {baseline_name}"
            )
 
-    return firecracker_checkout / "build" / "cargo_target" / "criterion"
+        return firecracker_checkout / "build" / "cargo_target" / "criterion"
 
+    return _run_criterion
 
-def compare_results(location_a_baselines: Path, location_b_baselines: Path):
-    """Compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main"""
-    for benchmark in location_b_baselines.glob("*"):
-        data = json.loads(
-            (benchmark / "b_baseline" / "estimates.json").read_text("utf-8")
-        )
 
-        average_ns = data["mean"]["point_estimate"]
+def get_compare_results(benchmark_name) -> Callable[[Path, Path], None]:
+    """
+    Get a function that compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main
+    """
+
+    def _compare_results(location_a_baselines: Path, location_b_baselines: Path):
 
-        LOGGER.info("%s mean: %iµs", benchmark.name, average_ns / 1000)
+        _, stdout, _ = cargo(
+            "bench",
+            f"--workspace --target {platform.machine()}-unknown-linux-musl --quiet",
+            f"--exact {benchmark_name} --list",
+        )
+
+        # Format a string like `page_fault #2: benchmark` to a string like `page_fault_2`,
+        # because under `cargo_target/criterion/` a directory named `page_fault_2` will be created.
+        bench_mark_targets = [
+            re.sub(r"\s#(?P<num>[0-9]*)", r"_\g<num>", i.split(":")[0])
+            for i in stdout.split("\n")
+            if i.endswith(": benchmark")
+        ]
+
+        # If a benchmark has multiple targets, the results of a single benchmark run are output to
+        # multiple directories, for example `page_fault` and `page_fault_2`.
+        # We need to copy the baseline results into each of these directories.
+        for bench_mark_target in bench_mark_targets:
+            shutil.copytree(
+                location_a_baselines / bench_mark_target / "a_baseline",
+                location_b_baselines / bench_mark_target / "a_baseline",
+            )
 
-    # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
-    # to do the comparison
-    for benchmark in location_a_baselines.glob("*"):
-        shutil.copytree(
-            benchmark / "a_baseline",
-            location_b_baselines / benchmark.name / "a_baseline",
-        )
+        _, stdout, _ = cargo(
+            "bench",
+            f"--workspace --target {platform.machine()}-unknown-linux-musl",
+            f"{benchmark_name} --baseline a_baseline --load-baseline b_baseline",
+        )
 
-    _, stdout, _ = cargo(
-        "bench",
-        f"--all --target {platform.machine()}-unknown-linux-musl",
-        "--baseline a_baseline --load-baseline b_baseline",
-    )
+        regressions_only = "\n\n".join(
+            result
+            for result in stdout.split("\n\n")
+            if "Performance has regressed." in result
+        )
 
-    regressions_only = "\n\n".join(
-        result
-        for result in stdout.split("\n\n")
-        if "Performance has regressed." in result
-    )
+        # If this string is anywhere in stdout, then at least one of our benchmarks
+        # is now performing worse with the PR changes.
+        assert not regressions_only, "\n" + regressions_only
 
-    # If this string is anywhere in stdout, then at least one of our benchmarks
-    # is now performing worse with the PR changes.
-    assert not regressions_only, "\n" + regressions_only
+    return _compare_results
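
Note on the fixture wiring (not part of the patch): the sketch below illustrates, under stated
assumptions, the pattern the diff relies on, i.e. turning a generator into a class-scoped fixture
via pytest.fixture(..., scope="class") so that all parametrized cases in one class share a single
setup/teardown. The names clone_ab_dirs_sketch, clone_ab_dirs_one_time and TestSketch are
hypothetical stand-ins; only the pytest.fixture(git_clone_ab_dirs, scope="class") pattern is taken
from the diff itself.

    # Minimal, self-contained sketch of the class-scoped fixture pattern used above.
    from pathlib import Path
    from tempfile import TemporaryDirectory
    from typing import Generator, Tuple

    import pytest


    def clone_ab_dirs_sketch() -> Generator[Tuple[Path, Path], None, None]:
        # Stand-in for git_clone_ab_dirs: "A" lives in a temporary directory,
        # "B" is the current checkout; the temp dir survives until teardown.
        with TemporaryDirectory() as tmp_dir:
            dir_a = Path(tmp_dir) / "main"  # the real code would git_clone() here
            dir_a.mkdir()
            dir_b = Path.cwd()              # the real code uses Path.cwd().parent
            yield (dir_a, dir_b)            # cleanup runs after the class finishes


    # Wrapping the generator once produces a class-scoped fixture, so every
    # parametrized test in the class below sees the same (dir_a, dir_b) pair
    # and the clone is not repeated per benchmark name.
    clone_ab_dirs_one_time = pytest.fixture(clone_ab_dirs_sketch, scope="class")


    class TestSketch:
        @pytest.mark.parametrize("case", ["x", "y"])
        def test_shares_dirs(self, case, clone_ab_dirs_one_time):
            dir_a, dir_b = clone_ab_dirs_one_time
            assert dir_a.is_dir() and dir_b.is_dir()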