[TPU][V1][Bugfix] Fix chunked prefill with padding (vllm-project#15037)

NickLucche · web-flow · commit af35d3a3ccb5 · 2025-03-18T07:34:45.000-07:00
Signed-off-by: NickLucche <nlucches@redhat.com>
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
@@ -410,6 +410,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # Do the padding and copy the tensors to the TPU.
         padded_total_num_scheduled_tokens = _get_padded_token_len(
             total_num_scheduled_tokens)
+        # Zero out to avoid spurious values from prev iteration (last cp chunk)
+        self.input_ids_cpu[
+            total_num_scheduled_tokens:padded_total_num_scheduled_tokens] = 0
         self.input_ids = self.input_ids_cpu[:
                                             padded_total_num_scheduled_tokens].to(
                                                 self.device)