
Commit 9b550d1

[Misc] Support docker for the latest vllm integration (LMCache#316)
* add docker-related stuff
* remove comments
* fix format
* fix bash.sh to include docker patch
1 parent 00e9da9 commit 9b550d1

10 files changed (+340 -1 lines)

docker/Dockerfile

+115
@@ -0,0 +1,115 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png

ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-cuda.txt


# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
FROM base AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads


RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

ARG LMCACHE_COMMIT_ID=1

RUN git clone https://github.com/LMCache/LMCache.git
RUN git clone https://github.com/LMCache/torchac_cuda.git


WORKDIR /workspace/LMCache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist_lmcache

WORKDIR /workspace/torchac_cuda
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=/workspace/LMCache/dist_lmcache


#################### vLLM installation IMAGE ####################
# Install the LMCache and torchac_cuda wheels into the vLLM image
FROM vllm/vllm-openai:v0.6.6.post1 AS vllm-openai
RUN --mount=type=bind,from=build,src=/workspace/LMCache/dist_lmcache,target=/vllm-workspace/dist_lmcache \
    --mount=type=cache,target=/root/.cache/pip \
    pip install dist_lmcache/*.whl --verbose

# Copy the LMCache connector files into vllm
COPY patches/factory.py \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/
COPY patches/lmcache_connector.py \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/

# Use a diff/patch when the file is too large to copy wholesale
COPY patches/parallel_state.patch \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/
COPY patches/config.patch \
    /usr/local/lib/python3.12/dist-packages/vllm/

RUN patch /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.patch
RUN patch /usr/local/lib/python3.12/dist-packages/vllm/config.py \
    /usr/local/lib/python3.12/dist-packages/vllm/config.patch


ENTRYPOINT ["vllm", "serve"]
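
A minimal build invocation for this Dockerfile might look like the sketch below. It assumes the requirements files and a patches/ directory are present in the build context, as the COPY instructions above expect, and that <IMAGE_NAME>:<TAG> is a placeholder tag; adjust the build args to the target machine.

docker build \
    --build-arg CUDA_VERSION=12.4.1 \
    --build-arg max_jobs=8 \
    --build-arg nvcc_threads=8 \
    -f Dockerfile -t <IMAGE_NAME>:<TAG> .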

docker/example_run.sh

+14
@@ -0,0 +1,14 @@
IMAGE=<IMAGE_NAME>:<TAG>
docker run --runtime nvidia --gpus all \
    --env "HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN>" \
    --env "LMCACHE_USE_EXPERIMENTAL=True" \
    --env "chunk_size=256" \
    --env "local_cpu=True" \
    --env "max_local_cpu_size=5" \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --network host \
    --entrypoint "/usr/local/bin/vllm" \
    $IMAGE \
    serve mistralai/Mistral-7B-Instruct-v0.2 --kv-transfer-config \
    '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' \
    --enable-chunked-prefill false
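
Once the container above is up, a quick smoke test against the OpenAI-compatible endpoint could look like this sketch; it assumes vLLM's default port 8000 and reuses the model name from the run script.

curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "mistralai/Mistral-7B-Instruct-v0.2",
         "prompt": "Explain KV cache offloading in one sentence.",
         "max_tokens": 64}'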

docker/patch/config.patch

+13
@@ -0,0 +1,13 @@
--- original/config.py	2025-01-19 20:05:02.376220126 -0600
+++ config.py	2025-01-19 20:01:35.864391306 -0600
@@ -2559,7 +2559,9 @@
         return KVTransferConfig.model_validate_json(cli_value)
 
     def model_post_init(self, __context: Any) -> None:
-        supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
+        supported_kv_connector = ["PyNcclConnector",
+                                  "MooncakeConnector",
+                                  "LMCacheConnector"]
         if all([
                 self.kv_connector is not None, self.kv_connector
                 not in supported_kv_connector

docker/patch/factory.py

+27
@@ -0,0 +1,27 @@
from typing import TYPE_CHECKING

from .base import KVConnectorBase

if TYPE_CHECKING:
    from vllm.config import VllmConfig


class KVConnectorFactory:

    @staticmethod
    def create_connector(rank: int, local_rank: int,
                         config: "VllmConfig") -> KVConnectorBase:
        supported_kv_connector = [
            "PyNcclConnector", "MooncakeConnector", "LMCacheConnector"
        ]
        kv_connector = config.kv_transfer_config.kv_connector
        if kv_connector in supported_kv_connector:
            if kv_connector in ["PyNcclConnector", "MooncakeConnector"]:
                from .simple_connector import SimpleConnector
                return SimpleConnector(rank, local_rank, config)
            elif kv_connector in ["LMCacheConnector"]:
                from .lmcache_connector import LMCacheConnector
                return LMCacheConnector(rank, local_rank, config)
        else:
            raise ValueError(f"Unsupported connector type: "
                             f"{kv_connector}")

docker/patch/lmcache_connector.py

+104
@@ -0,0 +1,104 @@
"""
Simple KV Cache Connector for Distributed Machine Learning Inference

The LMCacheConnector can (1) transfer KV caches between prefill vLLM worker
(KV cache producer) and decode vLLM worker (KV cache consumer) using LMCache;
(2) offload and share KV caches. Only (2) is supported for now.
"""

from typing import TYPE_CHECKING, List, Tuple, Union

import torch
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors

if TYPE_CHECKING:
    from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata

logger = init_logger(__name__)


class LMCacheConnector(KVConnectorBase):

    def __init__(
        self,
        rank: int,
        local_rank: int,
        config: VllmConfig,
    ):

        self.transfer_config = config.kv_transfer_config
        self.vllm_config = config

        from lmcache.integration.vllm.vllm_adapter import (RetrieveStatus,
                                                           StoreStatus,
                                                           init_lmcache_engine,
                                                           lmcache_retrieve_kv,
                                                           lmcache_store_kv)

        logger.info("Initializing LMCacheConfig under kv_transfer_config %s",
                    self.transfer_config)

        # TODO (Jiayi): Find model_config, parallel_config, and cache_config
        self.engine = init_lmcache_engine(config.model_config,
                                          config.parallel_config,
                                          config.cache_config)

        self.model_config = config.model_config
        self.parallel_config = config.parallel_config
        self.cache_config = config.cache_config
        self.lmcache_retrieve_kv = lmcache_retrieve_kv
        self.lmcache_store_kv = lmcache_store_kv
        self.store_status = StoreStatus
        self.retrieve_status = RetrieveStatus

    def recv_kv_caches_and_hidden_states(
        self, model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
        kv_caches: List[torch.Tensor]
    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
               "ModelInputForGPUWithSamplingMetadata"]:

        # TODO(Jiayi): This shouldn't be none for disagg prefill
        hidden_or_intermediate_states = None

        # TODO (Jiayi): Only normal prefill is supported for now
        retrieve_status = [self.retrieve_status.PREFILL]

        model_input, bypass_model_exec = self.lmcache_retrieve_kv(
            model_executable, model_input, self.cache_config, kv_caches,
            retrieve_status)

        return hidden_or_intermediate_states, bypass_model_exec, model_input

    def send_kv_caches_and_hidden_states(
        self,
        model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
        kv_caches: List[torch.Tensor],
        hidden_or_intermediate_states: Union[torch.Tensor,
                                             IntermediateTensors],
    ) -> None:
        num_reqs = 0
        seq_group_list = model_input.sampling_metadata.seq_groups
        assert seq_group_list is not None
        for seq_group in seq_group_list:
            seq_ids = seq_group.seq_ids
            for seq_id in seq_ids:
                num_reqs += 1

        # TODO (Jiayi): Only normal prefill is supported for now
        store_status = [self.store_status.PREFILL] * num_reqs
        self.lmcache_store_kv(
            self.model_config,
            self.parallel_config,
            model_executable,
            model_input,
            kv_caches,
            store_status,
        )

    def close(self):
        self.engine.close()

docker/patch/parallel_state.patch

+14
@@ -0,0 +1,14 @@
--- original/parallel_state.py	2025-01-19 20:05:02.012220433 -0600
+++ parallel_state.py	2025-01-19 20:07:24.844098884 -0600
@@ -1075,9 +1075,9 @@
 
     if vllm_config.kv_transfer_config is None:
         return
-
+
     if all([
-            vllm_config.kv_transfer_config.need_kv_parallel_group,
+            vllm_config.kv_transfer_config.is_kv_transfer_instance,
             _KV_TRANSFER is None
     ]):
         _KV_TRANSFER = kv_transfer.KVTransferAgent(

docker/requirements-build.txt

+9
@@ -0,0 +1,9 @@
# Should be mirrored in pyproject.toml
cmake>=3.26
ninja
packaging
setuptools>=61
setuptools-scm>=8
torch==2.4.0
wheel
jinja2

docker/requirements-common.txt

+33
@@ -0,0 +1,33 @@
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers >= 4.45.0 # Required for Llama 3.2.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi < 0.113.0; python_version < '3.9'
fastapi >= 0.114.1; python_version >= '3.9'
aiohttp
openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.6
outlines >= 0.0.43, < 0.1
typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
partial-json-parser # used for parsing partial JSON outputs
pyzmq
msgspec
gguf == 0.10.0
importlib_metadata
mistral_common >= 1.4.3
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.

docker/requirements-cuda.txt

+10
@@ -0,0 +1,10 @@
# Common dependencies
-r requirements-common.txt

# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
torch == 2.4.0
# These must be updated alongside torch
torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0

format.sh

+1 -1
@@ -193,7 +193,7 @@ if [[ "$1" == '--files' ]]; then
    # If `--all` is passed, then any further arguments are ignored and the
    # entire python directory is linted.
 elif [[ "$1" == '--all' ]]; then
-   lint lmcache tests
+   lint lmcache tests docker
 else
    # Format only the files that changed in last commit.
    lint_changed
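
With this change, a full lint pass now also covers the new docker/ directory, for example:

./format.sh --all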
