Skip to content

Commit

Permalink
Merge pull request #36 from montasaurus/adam/24/01/05/add-sentry
Browse files Browse the repository at this point in the history
add sentry to vllm container
  • Loading branch information
alexanderatallah authored Jan 5, 2024
2 parents 5d419dc + 437cff3 commit 7939545
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 22 deletions.
6 changes: 6 additions & 0 deletions modal/runner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ Before you start working with the OpenRouter Runner, it's crucial to set up your
```shell
modal secret create ext-api-key RUNNER_API_KEY=<generate a random key>
```
- **Sentry Configuration**:
Create another secret group for Sentry error tracking. Replace `<optional SENTRY_DSN>` with your DSN from sentry.io, or leave it blank to disable Sentry (e.g. `SENTRY_DSN=`). You can also set an environment name by appending `SENTRY_ENVIRONMENT=<environment name>` to the command.
```shell
modal secret create sentry SENTRY_DSN=<optional SENTRY_DSN>
```
Now, if you go to your dashboard on [Modal](https://modal.com/) and open the Secrets tab, you should see your keys deployed there.
Expand Down
56 changes: 34 additions & 22 deletions modal/runner/containers/vllm_unified.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from os import environ
from typing import Optional

import modal.gpu
from modal import Image
from modal import Image, Secret

from runner.engines.vllm import VllmEngine, VllmParams
from runner.shared.common import stub
Expand All @@ -10,9 +11,7 @@
_vllm_image = Image.from_registry(
"nvidia/cuda:12.1.0-base-ubuntu22.04",
add_python="3.10",
).pip_install(
"vllm==0.2.6",
)
).pip_install("vllm==0.2.6", "sentry-sdk==1.39.1")


def _make_container(
Expand All @@ -28,27 +27,39 @@ def __init__(
model_path: str,
max_model_len: Optional[int] = None,
):
if num_gpus > 1:
# Patch issue from https://github.com/vllm-project/vllm/issues/1116
import ray

ray.shutdown()
ray.init(num_gpus=num_gpus, ignore_reinit_error=True)

super().__init__(
VllmParams(
model=model_path,
tensor_parallel_size=num_gpus,
max_model_len=max_model_len,
)
import sentry_sdk

sentry_sdk.init(
dsn=environ.get("SENTRY_DSN"),
environment=environ.get("SENTRY_ENVIRONMENT") or "development",
)

# Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529
if num_gpus > 1:
import subprocess
try:
if num_gpus > 1:
# Patch issue from https://github.com/vllm-project/vllm/issues/1116
import ray

ray.shutdown()
ray.init(num_gpus=num_gpus, ignore_reinit_error=True)

super().__init__(
VllmParams(
model=model_path,
tensor_parallel_size=num_gpus,
max_model_len=max_model_len,
)
)

# Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529
if num_gpus > 1:
import subprocess

RAY_CORE_PIN_OVERRIDE = "cpuid=0 ; for pid in $(ps xo '%p %c' | grep ray:: | awk '{print $1;}') ; do taskset -cp $cpuid $pid ; cpuid=$(($cpuid + 1)) ; done"
subprocess.call(RAY_CORE_PIN_OVERRIDE, shell=True)
RAY_CORE_PIN_OVERRIDE = "cpuid=0 ; for pid in $(ps xo '%p %c' | grep ray:: | awk '{print $1;}') ; do taskset -cp $cpuid $pid ; cpuid=$(($cpuid + 1)) ; done"
subprocess.call(RAY_CORE_PIN_OVERRIDE, shell=True)
except Exception as e:
# We have to manually capture and re-raise because Modal catches the exception upstream
sentry_sdk.capture_exception(e)
raise e

_VllmContainer.__name__ = name

Expand All @@ -59,6 +70,7 @@ def __init__(
allow_concurrent_inputs=concurrent_inputs,
container_idle_timeout=20 * 60,
timeout=10 * 60,
secret=Secret.from_name("sentry"),
)
return wrap(_VllmContainer)

Expand Down

0 comments on commit 7939545

Please sign in to comment.