|
@@ -34,7 +34,9 @@
 
     # disable custom dispatcher, let Dynamo takes over
     # all the control
-    llm = LLM(model="google/gemma-2b",
+    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
+              max_model_len=512,
+              max_num_seqs=64,
               enforce_eager=True,
               compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
     outputs = llm.generate(prompts, sampling_params)
@@ -44,38 +46,51 @@
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         assert generated_text.startswith(answer)
 
-compiled_code = sorted(
+compiled_codes = sorted(
     glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
 
-# we should only trigger Dynamo compilation three times:
-# one for the profiling phase without kv cache
-# one for the prefill phase with symbolic shapes
-# one for the decode phase with symbolic shapes
+for i, compiled_code in enumerate(compiled_codes):
+    print("{} file: {}".format(i + 1, compiled_code))
+
+# We should only trigger Dynamo compilation 4 times:
+# 1. forward pass (symbolic)
+# 2. compute_logits (symbolic)
+# 3. forward pass (shape 16)
+# 4. forward pass (shape 32)
 # and later calls should not trigger Dynamo compilation again.
-# NOTE: it might still trigger XLA compilation.
+# NOTE: It might still trigger XLA compilation.
+
+# Check we have 4 compiled codes
+assert len(compiled_codes) == 4
 
-# check we have three compiled code
-# this is the assumption when we use the custom dispatcher
-assert len(compiled_code) == 3
+kv_cache_prefix = "kv_cache"
+attn_prefix = "ragged_paged_attention"
 
-# check all the compilations are as expected
-compiled_fn = sorted(
+# Check all the compilations are as expected
+compiled_fns = sorted(
     glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
 
-# the first compilation is the profiling phase,
-# it should not have any kv cache
-with open(compiled_fn[0]) as f:
+for i, compiled_fn in enumerate(compiled_fns):
+    print("{} file: {}".format(i + 1, compiled_fn))
+
+# The first compilation is symbolic, so it should not have any kv_caches
+with open(compiled_fns[0]) as f:
+    content = f.read()
+    assert kv_cache_prefix not in content
+
+# The second compilation is symbolic, so it should not have any kv_caches
+with open(compiled_fns[1]) as f:
     content = f.read()
-    assert "kv_caches" not in content
+    assert kv_cache_prefix not in content
 
-# the second compilation is the prefill phase,
-# it should have kv cache and the flash_attention op
-with open(compiled_fn[1]) as f:
+# The third compilation is shape 16, so it should have kv_caches and the
+# ragged_paged_attention
+with open(compiled_fns[2]) as f:
     content = f.read()
-    assert "kv_caches" in content and "torch.ops.xla.flash_attention" in content
+    assert (kv_cache_prefix in content and attn_prefix in content)
 
-# the third compilation is the decode phase,
-# it should have kv cache and the paged_attention op
-with open(compiled_fn[2]) as f:
+# The fourth compilation is shape 32, so it should have kv_caches and the
+# ragged_paged_attention
+with open(compiled_fns[3]) as f:
     content = f.read()
-    assert "kv_caches" in content and "torch.ops.xla.paged_attention" in content
+    assert (kv_cache_prefix in content and attn_prefix in content)
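For context (not part of the diff above, just a sketch): the __transformed_code*.py and __compiled_fn*Captured*.py files that the test globs for are the source dumps that depyf writes when the run is wrapped in depyf.prepare_debug. Assuming that is the setup used here (the wrapping code sits outside this hunk), the surrounding scaffolding looks roughly like this:

# Sketch only -- assumes the generation run is wrapped in depyf.prepare_debug,
# which is what produces the __transformed_code*.py and
# __compiled_fn*Captured*.py dumps asserted on in the diff.
import glob
import os
import tempfile

import depyf

temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
    # build the LLM and run llm.generate(prompts, sampling_params)
    # exactly as shown in the first hunk above
    ...

# After leaving the context manager, the dumped sources can be inspected.
compiled_codes = sorted(
    glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))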