# The vLLM Dockerfile is used to construct a vLLM image that can be directly
# used to run the OpenAI-compatible server.

# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
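
# A minimal build sketch (the image tag is illustrative; any tag works):
#   docker build -t lmcache/vllm-openai .
# Build args declared below (CUDA_VERSION, PYTHON_VERSION, ...) can be
# overridden at build time, e.g.:
#   docker build --build-arg CUDA_VERSION=12.4.1 -t lmcache/vllm-openai .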
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-cuda.txt


# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
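# For example, to target only compute capability 8.0 and 9.0 GPUs, one could
# pass (illustrative values):
#   docker build --build-arg torch_cuda_arch_list='8.0 9.0+PTX' \
#                --build-arg vllm_fa_cmake_gpu_arches='80-real;90-real' .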
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
FROM base AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
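# On a larger build machine these can be raised, e.g. (illustrative):
#   docker build --build-arg max_jobs=8 --build-arg nvcc_threads=4 .
# Total compile parallelism is roughly MAX_JOBS * NVCC_THREADS, so keep the
# product within the machine's CPU and memory budget.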


RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

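# NOTE: LMCACHE_COMMIT_ID is not referenced below; presumably it serves as a
# cache-buster: changing its value invalidates Docker's layer cache so the
# `git clone` steps below are re-run against the latest upstream commits.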
ARG LMCACHE_COMMIT_ID=1

RUN git clone https://github.com/LMCache/LMCache.git
RUN git clone https://github.com/LMCache/torchac_cuda.git


WORKDIR /workspace/LMCache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist_lmcache

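# Build the torchac_cuda wheel into the same dist directory, so that the
# final stage can install both wheels from a single bind mount.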
WORKDIR /workspace/torchac_cuda
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=/workspace/LMCache/dist_lmcache


#################### vLLM installation IMAGE ####################
# Install the LMCache and torchac_cuda wheels into the vLLM image
FROM vllm/vllm-openai:v0.6.6.post1 AS vllm-openai
RUN --mount=type=bind,from=build,src=/workspace/LMCache/dist_lmcache,target=/vllm-workspace/dist_lmcache \
    --mount=type=cache,target=/root/.cache/pip \
    pip install dist_lmcache/*.whl --verbose
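# A quick sanity check could be added here (assumes the wheels expose the
# `lmcache` and `torchac_cuda` modules):
#   RUN python3 -c "import lmcache, torchac_cuda"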

# Copy the lmc_connector files into the installed vllm package
COPY patches/factory.py \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/
COPY patches/lmcache_connector.py \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/

# For larger files, ship a diff and apply it with `patch` instead of
# replacing the whole file
COPY patches/parallel_state.patch \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/
COPY patches/config.patch \
    /usr/local/lib/python3.12/dist-packages/vllm/

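# `patch <target> <patchfile>` applies the diff to the target file in place.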
RUN patch /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py \
    /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.patch
RUN patch /usr/local/lib/python3.12/dist-packages/vllm/config.py \
    /usr/local/lib/python3.12/dist-packages/vllm/config.patch


ENTRYPOINT ["vllm", "serve"]
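# Arguments after the image name in `docker run` are appended to the
# entrypoint, e.g. (tag and model are illustrative):
#   docker run --gpus all -p 8000:8000 lmcache/vllm-openai <model-name>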