Commit 87d07b0 (parent 1bd0dfe)

feat(services): add custom callbacks service

Signed-off-by: Yanik Ammann <[email protected]>

File tree: 9 files changed, +176 -20 lines changed


custom_callbacks_example.py

+18
@@ -0,0 +1,18 @@
+from fastapi import Request, Response
+
+from vllm_router.services.callbacks_service.custom_callbacks import (
+    CustomCallbackHandler,
+)
+
+
+class MyCustomCallbackHandler(CustomCallbackHandler):
+    def pre_request(self, request: Request, request_body: bytes, request_json: any):
+        if b"coffee" in request_body:
+            return Response("I'm a teapot", 418)
+
+    def post_request(self, request: Request, response_content: bytes):
+        with open("/tmp/response.txt", "ab") as f:
+            f.write(response_content)
+
+
+my_callback_handler_instance = MyCustomCallbackHandler()
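
Assuming the router is launched from the directory containing this example file, the instance above can be wired in with ``--callbacks custom_callbacks_example.my_callback_handler_instance``; the dotted-path convention is spelled out in the cmd.rst change below.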

docs/source/user_manual/router/cmd.rst

+44
@@ -44,6 +44,7 @@ Logging Options
 +++++++++++++++

 - ``--log-stats``: Log statistics every 30 seconds.
+- ``--callbacks``: The path to the callback instance extending CustomCallbackHandler (e.g. ``my_callbacks.my_callback_handler_instance``).


 Build docker image
@@ -78,3 +79,46 @@ You can install the router using the following command:
     --engine-stats-interval 10 \
     --log-stats \
     --routing-logic roundrobin
+
+
+Hooking into custom callbacks
+-----------------------------
+
+The router can be extended with custom callbacks at various points in the request lifecycle.
+
+To do this, create a custom callback handler instance that implements at least one of the available callback methods. All available callbacks, along with detailed descriptions, are documented in the abstract `CustomCallbackHandler <https://github.com/vllm-project/production-stack/tree/main/src/vllm_router/services/callbacks_service/custom_callbacks.py>`_ class.
+
+.. code-block:: python
+
+    # my_callbacks.py
+
+    from fastapi import Request, Response
+
+    from vllm_router.services.callbacks_service.custom_callbacks import CustomCallbackHandler
+
+
+    class MyCustomCallbackHandler(CustomCallbackHandler):
+        def pre_request(self, request: Request, request_body: bytes, request_json: any) -> Response | None:
+            """
+            Receives the request object before it gets proxied.
+            """
+            if b"coffee" in request_body:
+                return Response("I'm a teapot", 418)
+
+        def post_request(self, request: Request, response_content: bytes) -> None:
+            """
+            Executed as a background task; receives the request object
+            and the complete response_content.
+            """
+            with open("/tmp/response.txt", "ab") as f:
+                f.write(response_content)
+
+
+    my_callback_handler_instance = MyCustomCallbackHandler()
+
+
+Pass the instance to the router as the module path (the file name without the ``.py`` extension), followed by the instance name, separated by a dot:
+
+.. code-block:: bash
+
+    vllm-router ... --callbacks my_callbacks.my_callback_handler_instance
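
As a quick sanity check of the example handler, here is a minimal probe script. It is a sketch, not part of this commit: it assumes the router is listening on http://localhost:8000 and was started with the ``--callbacks`` flag above, and the model name is a placeholder.

    # probe_callbacks.py (hypothetical smoke test, not part of this commit)
    import json
    import urllib.error
    import urllib.request

    req = urllib.request.Request(
        "http://localhost:8000/v1/chat/completions",
        data=json.dumps(
            {"model": "some-model", "messages": [{"role": "user", "content": "coffee"}]}
        ).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        # pre_request short-circuits the request before it reaches a backend
        print(e.code)  # expect 418 ("I'm a teapot")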

src/vllm_router/app.py

+4
@@ -11,6 +11,7 @@
     initialize_dynamic_config_watcher,
 )
 from vllm_router.experimental import get_feature_gates, initialize_feature_gates
+from vllm_router.services.callbacks_service.callbacks import initialize_custom_callbacks

 try:
     # Semantic cache integration
@@ -206,6 +207,9 @@ def initialize_all(app: FastAPI, args):
         args.dynamic_config_json, 10, init_config, app
     )

+    if args.callbacks:
+        initialize_custom_callbacks(args.callbacks, app)
+

 app = FastAPI(lifespan=lifespan)
 app.include_router(main_router)

src/vllm_router/parsers/parser.py

+6
@@ -108,6 +108,12 @@ def parse_args():
         default=None,
         help="The key (in the header) to identify a session.",
     )
+    parser.add_argument(
+        "--callbacks",
+        type=str,
+        default=None,
+        help="Path to the callback instance extending CustomCallbackHandler. Consists of <file path without .py ending>.<instance variable name>.",
+    )

     # Request rewriter arguments
     parser.add_argument(

src/vllm_router/routers/main_router.py

+17-15
@@ -1,6 +1,6 @@
 import json

-from fastapi import APIRouter, Request
+from fastapi import APIRouter, BackgroundTasks, Request
 from fastapi.responses import JSONResponse, Response

 from vllm_router.dynamic_config import get_dynamic_config_watcher
@@ -40,7 +40,7 @@


 @main_router.post("/v1/chat/completions")
-async def route_chat_completion(request: Request):
+async def route_chat_completion(request: Request, background_tasks: BackgroundTasks):
     if semantic_cache_available:
         # Check if the request can be served from the semantic cache
         logger.debug("Received chat completion request, checking semantic cache")
@@ -51,37 +51,39 @@ async def route_chat_completion(request: Request):
         return cache_response

     logger.debug("No cache hit, forwarding request to backend")
-    return await route_general_request(request, "/v1/chat/completions")
+    return await route_general_request(
+        request, "/v1/chat/completions", background_tasks
+    )


 @main_router.post("/v1/completions")
-async def route_completion(request: Request):
-    return await route_general_request(request, "/v1/completions")
+async def route_completion(request: Request, background_tasks: BackgroundTasks):
+    return await route_general_request(request, "/v1/completions", background_tasks)


 @main_router.post("/v1/embeddings")
-async def route_embeddings(request: Request):
-    return await route_general_request(request, "/v1/embeddings")
+async def route_embeddings(request: Request, background_tasks: BackgroundTasks):
+    return await route_general_request(request, "/v1/embeddings", background_tasks)


 @main_router.post("/v1/rerank")
-async def route_v1_rerank(request: Request):
-    return await route_general_request(request, "/v1/rerank")
+async def route_v1_rerank(request: Request, background_tasks: BackgroundTasks):
+    return await route_general_request(request, "/v1/rerank", background_tasks)


 @main_router.post("/rerank")
-async def route_rerank(request: Request):
-    return await route_general_request(request, "/rerank")
+async def route_rerank(request: Request, background_tasks: BackgroundTasks):
+    return await route_general_request(request, "/rerank", background_tasks)


 @main_router.post("/v1/score")
-async def route_v1_score(request: Request):
-    return await route_general_request(request, "/v1/score")
+async def route_v1_score(request: Request, background_tasks: BackgroundTasks):
+    return await route_general_request(request, "/v1/score", background_tasks)


 @main_router.post("/score")
-async def route_score(request: Request):
-    return await route_general_request(request, "/score")
+async def route_score(request: Request, background_tasks: BackgroundTasks):
+    return await route_general_request(request, "/score", background_tasks)


 @main_router.get("/version")

src/vllm_router/services/callbacks_service/__init__.py

Whitespace-only changes.
src/vllm_router/services/callbacks_service/callbacks.py

+19

@@ -0,0 +1,19 @@
+import importlib
+
+from fastapi import FastAPI
+
+from vllm_router.log import init_logger
+
+logger = init_logger(__name__)
+
+
+def initialize_custom_callbacks(callbacks_file_location: str, app: FastAPI):
+    # Split the path by dots to separate the module from the instance
+    parts = callbacks_file_location.split(".")
+
+    # The module path is everything but the last part; the instance name is the last part
+    module_name = ".".join(parts[:-1])
+    instance_name = parts[-1]
+
+    module = importlib.import_module(module_name)
+    app.state.callbacks = getattr(module, instance_name)
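
In other words, the ``--callbacks`` string is resolved as ``<module path>.<attribute>``: everything before the final dot is imported with ``importlib``, and the last segment is looked up on that module, so the module must be importable (e.g. on ``sys.path`` or in the working directory). A minimal sketch of the same resolution, with a hypothetical path:

    import importlib

    # "my_callbacks.my_callback_handler_instance" splits into module
    # "my_callbacks" and attribute "my_callback_handler_instance"
    location = "my_callbacks.my_callback_handler_instance"
    module_name, _, instance_name = location.rpartition(".")
    handler = getattr(importlib.import_module(module_name), instance_name)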
src/vllm_router/services/callbacks_service/custom_callbacks.py

+42

@@ -0,0 +1,42 @@
+from abc import abstractmethod
+
+from fastapi import Request, Response
+
+
+class CustomCallbackHandler:
+    """
+    Abstract class.
+
+    Callbacks can be injected at multiple points within the request lifecycle.
+    This can be used to validate the request or log the response.
+    """
+
+    @abstractmethod
+    def pre_request(
+        self, request: Request, request_body: bytes, request_json: any
+    ) -> Response | None:
+        """
+        Receives the request object before it gets proxied.
+        This can be used to validate the request or raise HTTP responses.
+
+        Args:
+            request: The original request.
+            request_body: The request body as a byte array.
+            request_json: The request body as a JSON object.
+
+        Returns:
+            Either None or a Response object, which will end the request.
+        """
+        return None
+
+    @abstractmethod
+    def post_request(self, request: Request, response_content: bytes) -> None:
+        """
+        Executed as a background task; receives the request object and the complete response_content.
+        This can be used to log the response or further process it.
+
+        Args:
+            request: The original request.
+            response_content: The complete response content, as a byte array, after the request has completed.
+        """
+        pass
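
Both hooks are optional, so a handler can implement only the one it needs. For instance, ``pre_request`` can serve as a lightweight request gate. The following is a hypothetical sketch (the header name and key are purely illustrative, not part of this commit):

    from fastapi import Request, Response

    from vllm_router.services.callbacks_service.custom_callbacks import CustomCallbackHandler


    class ApiKeyGate(CustomCallbackHandler):
        # hypothetical header/key values, for illustration only
        EXPECTED_KEY = "secret-key"

        def pre_request(self, request: Request, request_body: bytes, request_json: any):
            # returning a Response ends the request before it is proxied;
            # returning None lets it through unchanged
            if request.headers.get("x-api-key") != self.EXPECTED_KEY:
                return Response("Unauthorized", 401)


    api_key_gate_instance = ApiKeyGate()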

src/vllm_router/services/request_service/request.py

+26-5
@@ -4,7 +4,7 @@
 import time
 import uuid

-from fastapi import Request
+from fastapi import BackgroundTasks, Request
 from fastapi.responses import JSONResponse, StreamingResponse

 from vllm_router.log import init_logger
@@ -42,7 +42,13 @@


 async def process_request(
-    request: Request, body, backend_url, request_id, endpoint, debug_request=None
+    request: Request,
+    body,
+    backend_url,
+    request_id,
+    endpoint,
+    background_tasks: BackgroundTasks,
+    debug_request=None,
 ):
     """
     Process a request by sending it to the chosen backend.
@@ -78,7 +84,7 @@ async def process_request(
         pass

     # For non-streaming requests, collect the full response to cache it properly
-    full_response = bytearray() if not is_streaming else None
+    full_response = bytearray()

     async with request.app.state.httpx_client_wrapper().stream(
         method=request.method,
@@ -111,13 +117,19 @@ async def process_request(
     # Store in semantic cache if applicable
     # Use the full response for non-streaming requests, or the last chunk for streaming
     if request.app.state.semantic_cache_available:
-        cache_chunk = bytes(full_response) if full_response is not None else chunk
+        cache_chunk = bytes(full_response) if not is_streaming else chunk
         await store_in_semantic_cache(
             endpoint=endpoint, method=request.method, body=body, chunk=cache_chunk
         )
+    if background_tasks and hasattr(request.app.state, "callbacks"):
+        background_tasks.add_task(
+            request.app.state.callbacks.post_request, request, full_response
+        )


-async def route_general_request(request: Request, endpoint: str):
+async def route_general_request(
+    request: Request, endpoint: str, background_tasks: BackgroundTasks
+):
     """
     Route the incoming request to the backend server and stream the response back to the client.

@@ -138,6 +150,14 @@ async def route_general_request(request: Request, endpoint: str):
     request_id = str(uuid.uuid4())
     request_body = await request.body()
     request_json = await request.json()  # TODO (ApostaC): merge two awaits into one
+
+    if hasattr(request.app.state, "callbacks") and (
+        response_overwrite := request.app.state.callbacks.pre_request(
+            request, request_body, request_json
+        )
+    ):
+        return response_overwrite
+
     requested_model = request_json.get("model", None)
     if requested_model is None:
         return JSONResponse(
@@ -185,6 +205,7 @@ async def route_general_request(request: Request, endpoint: str):
         request_body,
         server_url,
         request_id,
+        background_tasks,
         endpoint=endpoint,
     )
     headers, status_code = await anext(stream_generator)
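
The ``post_request`` hook rides on FastAPI's BackgroundTasks, so it runs only after the response has been delivered to the client. A standalone sketch of that pattern, separate from the router code, with hypothetical names:

    from fastapi import BackgroundTasks, FastAPI

    app = FastAPI()


    def log_after_send(payload: bytes) -> None:
        # runs only after the HTTP response has been sent
        print(f"logged {len(payload)} bytes")


    @app.post("/echo")
    async def echo(background_tasks: BackgroundTasks):
        body = b"hello"
        # add_task schedules the callable; FastAPI invokes it post-response,
        # mirroring how process_request defers callbacks.post_request
        background_tasks.add_task(log_after_send, body)
        return {"ok": True}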
