
Commit 3029b28

lapp0 authored and brandonwillard committed

ASV PR bench workflow, pytest-bench -> ASV, add peakmem tests

1 parent: 95f108e

10 files changed: +191 −78 lines
+52
Benchmark PR workflow (new file, +52):

```yaml
name: Benchmark PR

on:
  pull_request:
    branches: [main]
  workflow_dispatch:
env:
  PYTHON_VERSION: "3.10"
  WORKING_DIR: ${{ github.workspace }}/benchmarks
  BENCHMARKS_OUTPUT: ${{ github.workspace }}/benchmarks_output

jobs:
  benchmark-pr:
    runs-on: ubuntu-latest
    if: contains(github.event.pull_request.labels.*.name, 'run_benchmarks') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_run'

    defaults:
      run:
        working-directory: ${{ env.WORKING_DIR }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install asv virtualenv lf-asv-formatter

      - name: Create ASV machine config file
        run: asv machine --machine gh-runner --yes

      - name: Run Benchmarks - `PR HEAD` vs `main`
        run: |
          # prepare main branch for comparison
          git remote add upstream https://github.com/${{ github.repository }}.git
          git fetch upstream main

          # Run benchmarks; allow errors, they will be caught in the next step
          asv continuous upstream/main HEAD \
            --no-stats --interleave-rounds -a repeat=3 || true

      - name: BENCHMARK RESULTS
        run: asv compare --factor=1.1 --no-stats --split upstream/main HEAD
```

.gitignore (+1)

```diff
@@ -6,3 +6,4 @@ docs/build
 .idea/
 *.gguf
 .venv
+benchmarks/results
```

benchmarks/__init__.py

Whitespace-only changes.

benchmarks/asv.conf.json (new file, +20)

```json
{
    "version": 1,
    "project": "Outlines",
    "project_url": "https://outlines-dev.github.io/outlines/",
    "repo": "..",
    "branches": [
        "HEAD"
    ],
    "build_command": [
        "python -mpip install .[test]",
        "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}",
    ],
    "environment_type": "virtualenv",
    "show_commit_url": "https://github.com/lapp0/outlines/commit/",
    "benchmark_dir": ".",
    "env_dir": "env",
    "results_dir": "results",
    "html_dir": "html",
    "build_cache_size": 8
}
```

tests/benchmark/test_benchmark_json_schema.py → benchmarks/bench_json_schema.py (+19 −24)

```diff
@@ -1,12 +1,16 @@
-import pytest
-
 import outlines

 outlines.disable_cache()

 from outlines.fsm.guide import RegexGuide  # noqa: E402
 from outlines.fsm.json_schema import build_regex_from_schema  # noqa: E402

+from .common import (  # noqa: E402
+    clear_outlines_cache,
+    ensure_numba_compiled,
+    setup_tokenizer,
+)
+
 simple_schema = """{
     "$defs": {
         "Armor": {
@@ -63,30 +67,21 @@
     "required": ["id", "work", "recording_artists"]
 }"""

-
 schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema)


-@pytest.mark.parametrize("schema_name", schemas.keys())
-def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name):
-    """Benchmark convert json schema to regex"""
-    schema = schemas[schema_name]
-    benchmark.pedantic(
-        build_regex_from_schema,
-        args=(schema,),
-        rounds=8,
-    )
+class JsonSchemaBenchmark:
+    params = schemas.keys()
+
+    def setup(self, schema_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        self.schema = schemas[schema_name]
+        ensure_numba_compiled(self.tokenizer)

+    def time_json_schema_to_regex(self, schema_name):
+        build_regex_from_schema(self.schema)

-@pytest.mark.parametrize("schema_name", schemas.keys())
-def test_benchmark_json_schema_to_fsm(
-    benchmark, tokenizer, ensure_numba_compiled, schema_name
-):
-    """Benchmark compile json schema as FSM"""
-    schema = schemas[schema_name]
-    regex = build_regex_from_schema(schema)
-    benchmark.pedantic(
-        RegexGuide,
-        args=(regex, tokenizer),
-        rounds=8,
-    )
+    def time_json_schema_to_fsm(self, schema_name):
+        regex = build_regex_from_schema(self.schema)
+        RegexGuide(regex, self.tokenizer)
```
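For context on the class-based style above: asv discovers benchmarks by naming convention, so the pytest fixtures and `@pytest.mark.parametrize` decorators are no longer needed. A minimal sketch of those conventions, per the asv documentation; the class and method names here are hypothetical, for illustration only:

```python
# Minimal sketch of asv's conventions (https://asv.readthedocs.io):
# each value in `params` is passed to `setup` and to every benchmark
# method, and methods prefixed with `time_` are timed automatically.
class ExampleSortBenchmark:  # hypothetical example, not part of this commit
    params = ["small", "large"]

    def setup(self, size):
        # Runs before each measurement and is excluded from the timing.
        self.data = list(range(100 if size == "small" else 100_000))

    def time_sorted(self, size):
        sorted(self.data)
```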

benchmarks/bench_numba_compile.py (new file, +37)

```python
import importlib

import interegular
import numba

import outlines

from .common import clear_outlines_cache, setup_tokenizer

outlines.disable_cache()


class NumbaCompileBenchmark:
    def setup(self):
        clear_outlines_cache()
        from outlines.fsm import regex

        self.tokenizer = setup_tokenizer()
        self.regex = regex
        original_njit = numba.njit

        def mock_njit(*args, **kwargs):
            kwargs["cache"] = False
            return original_njit(*args, **kwargs)

        self.original_njit = original_njit
        numba.njit = mock_njit
        importlib.reload(self.regex)
        self.regex_pattern, _ = self.regex.make_deterministic_fsm(
            interegular.parse_pattern("a").to_fsm().reduce()
        )

    def teardown(self):
        numba.njit = self.original_njit

    def time_compile_numba(self):
        self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer)
```
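The setup above monkeypatches `numba.njit` to force `cache=False`, then reloads `outlines.fsm.regex` so its decorators re-run under the patch; that way the benchmark measures actual compilation rather than a cache hit, and `teardown` restores the original function. A stripped-down sketch of that patch-and-restore pattern, using `json.dumps` as a stand-in target (hypothetical example, not part of this commit):

```python
import json


class PatchRestoreExample:  # hypothetical example, not part of this commit
    def setup(self):
        # Save the original so teardown can undo the patch.
        self._original_dumps = json.dumps
        original = json.dumps

        def patched_dumps(*args, **kwargs):
            # Force a setting, analogous to cache=False above.
            kwargs.setdefault("sort_keys", True)
            return original(*args, **kwargs)

        json.dumps = patched_dumps

    def teardown(self):
        # Restore global state so later benchmarks are unaffected.
        json.dumps = self._original_dumps

    def time_dumps(self):
        json.dumps({"b": 1, "a": 2})
```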
Regex guide benchmarks (filename not shown in this view):

```diff
@@ -1,7 +1,7 @@
-import pytest
-
 import outlines

+from .common import clear_outlines_cache, ensure_numba_compiled, setup_tokenizer
+
 outlines.disable_cache()

 from outlines.fsm.guide import RegexGuide  # noqa: E402
@@ -19,14 +19,27 @@
 }


-@pytest.mark.parametrize("regex_name", regex_samples.keys())
-def test_benchmark_regex_to_fsm(
-    benchmark, tokenizer, ensure_numba_compiled, regex_name
-):
-    """Benchmark converting regex to FSM"""
-    regex_str = regex_samples[regex_name]
-    benchmark.pedantic(
-        RegexGuide,
-        args=(regex_str, tokenizer),
-        rounds=8,
-    )
+class RegexGuideBenchmark:
+    params = regex_samples.keys()
+
+    def setup(self, pattern_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+        self.pattern = regex_samples[pattern_name]
+
+    def time_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
+
+
+class MemoryRegexGuideBenchmark:
+    params = ["simple_phone", "complex_span_constrained_relation_extraction"]
+
+    def setup(self, pattern_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+        self.pattern = regex_samples[pattern_name]
+
+    def peakmem_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
```
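`MemoryRegexGuideBenchmark` supplies the peakmem tests from the commit title: in asv, a method prefixed `peakmem_` reports the peak memory use of the process while the method body runs, rather than its wall-clock time. A minimal sketch (hypothetical example, not part of this commit):

```python
class ExamplePeakMemBenchmark:  # hypothetical example, not part of this commit
    def peakmem_build_buffers(self):
        # asv records the process's peak memory use while this runs.
        buffers = [bytearray(1024) for _ in range(10_000)]
        return buffers
```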
Shared benchmark helpers, imported above as `.common` (filename not shown in this view):

```diff
@@ -1,17 +1,19 @@
-import pytest
 from transformers import AutoTokenizer

+import outlines.caching
 from outlines.fsm.guide import RegexGuide
 from outlines.models.transformers import TransformerTokenizer


-@pytest.fixture
-def tokenizer():
+def clear_outlines_cache():
+    outlines.caching.clear_cache()
+
+
+def setup_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("gpt2")
     return TransformerTokenizer(tokenizer)


-@pytest.fixture
 def ensure_numba_compiled(tokenizer):
     RegexGuide("a", tokenizer)
     return True
```
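These helpers replace the old pytest fixtures, so asv benchmark classes call them directly. A hypothetical new module composed from them, following the same pattern as the modules above (module and class names invented for illustration):

```python
import outlines

outlines.disable_cache()

from outlines.fsm.guide import RegexGuide  # noqa: E402

from .common import (  # noqa: E402
    clear_outlines_cache,
    ensure_numba_compiled,
    setup_tokenizer,
)


class ExampleGuideBenchmark:  # hypothetical example, not part of this commit
    def setup(self):
        clear_outlines_cache()
        self.tokenizer = setup_tokenizer()
        ensure_numba_compiled(self.tokenizer)

    def time_four_digit_guide(self):
        # Build a guide for a simple four-digit pattern.
        RegexGuide("[0-9]{4}", self.tokenizer)
```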

docs/community/contribute.md (+30 −4)

````diff
@@ -57,12 +57,38 @@ And run the code style checks:
 pre-commit run --all-files
 ```

-When modifying the code related to the index compilation, we kindly ask you to
-post benchmarks before and after your changes. You can run benchmarks using:
+### Benchmarking

-```python
-pytest --benchmark-only
+Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degradation.
+
+You can run the benchmark test suite locally with the following command:
+```
+asv run --config benchmarks/asv.conf.json
+```
+
+Run a specific test:
+```
+asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Profile a specific test:
 ```
+asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Compare to `origin/main`:
+```
+git fetch origin
+asv continuous origin/main HEAD --config benchmarks/asv.conf.json
+```
+
+#### ASV PR Behavior
+
+- **View ASV Benchmark Results:** Open the workflow run and view the `BENCHMARK RESULTS` step.
+- Merging is blocked unless benchmarks are run for the latest commit.
+- Benchmarks fail if performance degrades by more than 10% for any individual benchmark.
+- The "Benchmark PR" workflow runs when it is manually dispatched, or on every commit once the `run_benchmarks` label is added to the PR.


 ### Contribute to the documentation
````
tests/benchmark/test_benchmark_numba_compile.py (−33)

This file was deleted.
