Commit 4759b89

sync
Signed-off-by: NickLucche <[email protected]>
1 parent 85ad5cc commit 4759b89

File tree

2 files changed: +1, -74 lines

tests/v1/tpu/test_sampler.py

-57 lines
@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import tempfile
-from time import time
 
 import pytest
 
@@ -15,61 +13,6 @@
 )
 
 
-# TODO remove this test once VLLM_XLA_CHECK_RECOMPILATION does not error out
-@pytest.mark.parametrize("model_name", ["D4nt3/Qwen2.5-two-layers"])
-@pytest.mark.skipif(not current_platform.is_tpu(),
-                    reason="This test needs a TPU")
-def test_sampler_compilation(model_name: str, monkeypatch):
-    """
-    Check that no recompilation happens despite changing sampling parameters.
-    We can't read XLA metrics from the engine process, hence we measure time.
-    """
-    with tempfile.TemporaryDirectory() as temp_dir:
-        monkeypatch.setenv("VLLM_XLA_CACHE_PATH", temp_dir)
-        # Compiling model init may still take some time, enforce_eager to skip.
-        llm = LLM(model_name,
-                  enforce_eager=True,
-                  max_num_seqs=16,
-                  max_model_len=1024,
-                  gpu_memory_utilization=0.5)
-        prompts = [
-            "A robot may not injure a human being",
-            "It is only with the heart that one can see rightly;",
-        ]
-        # First inference should be slow
-        sampling_params = SamplingParams(
-            temperature=0.7,
-            # top_p=0.6, # TODO too slow!
-            top_k=10,
-            min_p=0.2,
-            max_tokens=16)
-        s = time()
-        _ = llm.generate(prompts, sampling_params)
-        run1 = time() - s
-
-        # Second request with different params, but which we already
-        # compiled for in the previous eager iteration.
-        sampling_params = SamplingParams(temperature=0.1,
-                                         top_k=12,
-                                         min_p=0.8,
-                                         max_tokens=24)
-        s = time()
-        _ = llm.generate(prompts, sampling_params)
-        run2 = time() - s
-        # Much faster after compiling
-        assert run1 * 0.1 > run2
-        print("TIMES", run1, run2)
-
-        # Third request with min_p set to "None". It will not trigger
-        # recompilation as a default 0 value will be used.
-        sampling_params = SamplingParams(max_tokens=24, temperature=0.0)
-        s = time()
-        _ = llm.generate(prompts, sampling_params)
-        run3 = time() - s
-        assert run1 * 0.1 > run3
-        print("TIMES", run1, run3)
-
-
 @pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
 @pytest.mark.skipif(not current_platform.is_tpu(),
                     reason="This test needs a TPU")
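Note on the deleted test: it approximated "no recompilation" through wall-clock time, since XLA compile metrics cannot be read from the engine process. The snippet below is a condensed, hedged restatement of that timing heuristic; the model name is illustrative (the removed test used a tiny two-layer model), not the removed test verbatim:

# Sketch only: restates the timing heuristic of the removed test_sampler_compilation.
from time import time

from vllm import LLM, SamplingParams

llm = LLM("Qwen/Qwen2.5-1.5B-Instruct", enforce_eager=True, max_model_len=1024)
prompts = ["A robot may not injure a human being"]

start = time()
llm.generate(prompts, SamplingParams(temperature=0.7, top_k=10, max_tokens=16))
cold = time() - start  # first run pays for XLA graph compilation

start = time()
llm.generate(prompts, SamplingParams(temperature=0.1, top_k=12, max_tokens=16))
warm = time() - start  # new sampling params should reuse the compiled graph

# Same 10x margin the removed test asserted.
assert cold * 0.1 > warm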

vllm/v1/sample/ops/topk_topp_sampler.py

+1, -17 lines
@@ -118,28 +118,12 @@ def forward_tpu(
         # If only top-k is specified, use pytorch's builtin topk op. This leads
         # to significant speed up on TPU compared to using apply_top_k_top_p.
         if k is not None and p is None:
-            logits = top_k_only(logits, k)
+            logits = apply_top_k_only(logits, k)
         # TODO Add TPU optimized topp kernel and topk+topp
         probs = logits.softmax(dim=-1, dtype=torch.float32)
         return random_sample(probs, generators)
 
 
-def top_k_only(logits: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
-    # Avoid sorting vocab for top-k only case.
-    no_top_k_mask = k == logits.shape[1]
-    # Set non-top-k rows to 1 so that we can gather.
-    k = k.masked_fill(no_top_k_mask, 1)
-    max_top_k = k.max()
-    # topk.values tensor has shape [batch_size, max_top_k].
-    # Convert top k to 0-based index in range [0, max_top_k).
-    k_index = k.sub_(1).unsqueeze(1)
-    top_k_mask = logits.topk(max_top_k, dim=1).values.gather(1, k_index)
-    # Handle non-topk rows.
-    top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf"))
-    logits.masked_fill_(logits < top_k_mask, -float("inf"))
-    return logits
-
-
 def apply_top_k_top_p(
     logits: torch.Tensor,
     k: Optional[torch.Tensor],
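For reference, the masking trick from the deleted top_k_only helper (threshold each row at its own k-th largest logit rather than sorting the full vocab) can be restated as a self-contained snippet. apply_top_k_only, which the diff switches to, is assumed to follow the same idea; the function name below is illustrative, not the library implementation:

# Standalone sketch of the top-k-only masking approach from the removed helper.
import torch


def top_k_only_sketch(logits: torch.Tensor, k: torch.Tensor) -> torch.Tensor:
    # Rows whose k equals the vocab size effectively request "no top-k".
    no_top_k_mask = k == logits.shape[1]
    # Use k=1 for those rows so the gather below stays in bounds.
    k = k.masked_fill(no_top_k_mask, 1)
    max_top_k = int(k.max())
    # Take the max_top_k largest logits per row, then pick each row's own
    # k-th value (0-based index k-1) as that row's cutoff threshold.
    k_index = (k - 1).unsqueeze(1)
    thresholds = logits.topk(max_top_k, dim=1).values.gather(1, k_index)
    # Disable the cutoff for "no top-k" rows, then mask everything below it.
    thresholds = thresholds.masked_fill(no_top_k_mask.unsqueeze(1), -float("inf"))
    return logits.masked_fill(logits < thresholds, -float("inf"))


torch.manual_seed(0)
logits = torch.randn(2, 8)
k = torch.tensor([3, 8])  # row 0: keep top-3; row 1: keep all
out = top_k_only_sketch(logits, k)
print((out > -float("inf")).sum(dim=1))  # tensor([3, 8])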
