
Commit ff47aab

bigPYJ1151 and Isotr0py authored
[CPU] Upgrade CPU backend to torch-2.6 (vllm-project#13381)
Signed-off-by: jiang1.li <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
1 parent debd6bb commit ff47aab

File tree: 9 files changed (+23, -13 lines)


.buildkite/run-cpu-test.sh (+5, -3)
@@ -19,13 +19,14 @@ remove_docker_container

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3

   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@@ -36,6 +37,7 @@ function cpu_tests() {
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -85,4 +87,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

Dockerfile.cpu (+1, -1)
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install intel_extension_for_pytorch==2.5.0
+RUN pip install intel_extension_for_pytorch==2.6.0

 WORKDIR /workspace

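Intel Extension for PyTorch releases track the matching torch minor version, so the IPEX pin moves to 2.6.0 together with the torch==2.6.0+cpu pin in requirements/cpu.txt below. As an illustration only (not part of this commit), a runtime sanity check for that pairing could look like:

    import torch
    import intel_extension_for_pytorch as ipex

    # Compare major.minor of torch and IPEX; IPEX 2.6.x is built against torch 2.6.x.
    torch_mm = tuple(int(v) for v in torch.__version__.split("+")[0].split(".")[:2])
    ipex_mm = tuple(int(v) for v in ipex.__version__.split("+")[0].split(".")[:2])
    assert torch_mm == ipex_mm, (
        f"IPEX {ipex.__version__} does not match torch {torch.__version__}")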
cmake/cpu_extension.cmake (+1, -1)
@@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
     FetchContent_Declare(
         oneDNN
         GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.6
+        GIT_TAG v3.7.1
         GIT_PROGRESS TRUE
         GIT_SHALLOW TRUE
     )

requirements/cpu.txt (+1, -1)
@@ -2,7 +2,7 @@
 -r common.txt

 # Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x"
+torch==2.6.0+cpu; platform_machine == "x86_64"
 torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
 torch==2.7.0.dev20250304; platform_machine == "s390x"

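The x86_64 marker now selects torch 2.6.0+cpu, while ppc64le, aarch64, and Darwin stay on 2.5.1 and s390x keeps a nightly build. Purely for illustration (assuming the packaging library, which pip uses for these markers), this is how such a PEP 508 environment marker is evaluated:

    from packaging.markers import Marker

    # The marker from the updated requirement line; by default it is evaluated
    # against the current interpreter's platform.
    marker = Marker('platform_machine == "x86_64"')
    print(marker.evaluate())                                  # True on an x86_64 host
    print(marker.evaluate({"platform_machine": "aarch64"}))   # False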
tests/lora/test_qwen2vl.py (+1, -1)
@@ -12,7 +12,7 @@
 from vllm.platforms import current_platform


-@pytest.fixture(autouse=True)
+@pytest.fixture(autouse=not current_platform.is_cpu())
 def v1(run_with_both_engines_lora):
     # Simple autouse wrapper to run both engines for each test
     # This can be promoted up to conftest.py to run for every

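The fixture is now autouse only when the test is not running on the CPU backend, so CPU runs are no longer forced through the run_with_both_engines_lora wrapper. A minimal sketch of the conditional-autouse pattern in isolation (the names here are illustrative, not taken from the test file):

    import pytest

    ON_CPU = False  # stand-in for current_platform.is_cpu()

    # autouse is evaluated once at collection time: when the condition holds, the
    # fixture is injected into every test; otherwise tests must request it explicitly.
    @pytest.fixture(autouse=not ON_CPU)
    def engine_mode():
        yield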
vllm/attention/ops/ipex_attn.py (+1, -1)
@@ -17,7 +17,7 @@ class _PagedAttention:

     @staticmethod
     def get_supported_head_sizes() -> List[int]:
-        return [32, 64, 80, 96, 112, 128, 256]
+        return [32, 64, 80, 96, 112, 128, 192, 256]

     @staticmethod
     def get_kv_cache_shape(

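Head size 192 is added to the list of sizes the IPEX paged-attention kernels accept. Purely as an illustration (not code from this commit), a caller-side check against that list might look like:

    SUPPORTED_HEAD_SIZES = [32, 64, 80, 96, 112, 128, 192, 256]

    def check_head_size(head_size: int) -> None:
        # Reject sizes the IPEX paged-attention kernels cannot handle.
        if head_size not in SUPPORTED_HEAD_SIZES:
            raise ValueError(
                f"head_size={head_size} is unsupported; expected one of "
                f"{SUPPORTED_HEAD_SIZES}")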
vllm/executor/multiproc_worker_utils.py (+5, -4)
@@ -254,10 +254,11 @@ def _run_worker_process(
         # online (in situ) tuning is enabled.
         # Offline tuning API (record_untuned_is_enabled()) only
         # available in PyTorch 2.6 or later.
-        import torch.cuda.tunable as tunable
-        if (tunable.is_enabled() and tunable.tuning_is_enabled()
-                and not tunable.record_untuned_is_enabled()):
-            tunable.write_file()
+        if torch.cuda.is_available():
+            import torch.cuda.tunable as tunable
+            if (tunable.is_enabled() and tunable.tuning_is_enabled()
+                    and not tunable.record_untuned_is_enabled()):
+                tunable.write_file()

     logger.info("Worker exiting")

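The tunable-op flush now runs only when a CUDA runtime is actually present, since torch.cuda.tunable calls fail on CPU-only builds such as torch 2.6.0+cpu. The same guard, extracted into a standalone sketch (an assumption about usage, not the vLLM code path itself):

    import torch

    def maybe_flush_tunable_results() -> None:
        # On CPU-only torch builds there is no CUDA runtime, so skip entirely.
        if not torch.cuda.is_available():
            return
        import torch.cuda.tunable as tunable
        if (tunable.is_enabled() and tunable.tuning_is_enabled()
                and not tunable.record_untuned_is_enabled()):
            # Persist tuned GEMM results when online (in situ) tuning is enabled.
            tunable.write_file()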
vllm/model_executor/layers/fused_moe/layer.py (+5, -1)
@@ -193,10 +193,11 @@ def forward_cpu(
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
         **kwargs,
     ):
-        assert custom_routing_function is None
         assert activation == "silu", f"{activation} is not supported."
         return layer.ipex_fusion(
             x,
@@ -206,6 +207,9 @@
             renormalize,
             topk_group,
             num_expert_group,
+            custom_routing_function,
+            scoring_func,
+            e_score_correction_bias,
         )

     def forward_tpu(

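forward_cpu now accepts scoring_func and e_score_correction_bias and forwards them, together with custom_routing_function, to layer.ipex_fusion instead of asserting that a custom routing function is unset. For intuition only, a minimal sketch of what such routing parameters usually mean for top-k expert selection (this is an assumption for illustration, not the IPEX kernel):

    from typing import Optional
    import torch

    def select_experts(router_logits: torch.Tensor,
                       top_k: int,
                       scoring_func: str = "softmax",
                       e_score_correction_bias: Optional[torch.Tensor] = None):
        if scoring_func == "softmax":
            scores = router_logits.softmax(dim=-1)
        elif scoring_func == "sigmoid":
            scores = router_logits.sigmoid()
        else:
            raise ValueError(f"unsupported scoring_func: {scoring_func}")
        # The bias shifts which experts win top-k but does not change the weights
        # applied to their outputs.
        biased = scores if e_score_correction_bias is None else scores + e_score_correction_bias
        topk_ids = biased.topk(top_k, dim=-1).indices
        topk_weights = scores.gather(-1, topk_ids)
        return topk_weights, topk_ids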
vllm/platforms/cpu.py (+3)
@@ -121,6 +121,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

+        # MLA attention is not supported
+        os.environ["VLLM_MLA_DISABLE"] = "1"
+
         # Intel OpenMP setting
         ld_prealod_str = os.getenv("LD_PRELOAD", "")
         if "libiomp5.so" in ld_prealod_str:

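Setting VLLM_MLA_DISABLE=1 during platform config turns MLA attention off, since the CPU backend does not support it. As a small illustration (an assumption about how such a flag is read, not the exact vLLM code), a boolean environment flag like this is typically consumed as:

    import os

    def mla_disabled() -> bool:
        # Treat any value other than "0"/unset as "disable MLA".
        return os.environ.get("VLLM_MLA_DISABLE", "0") != "0"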