
Commit fb37bf3

revert back examples
1 parent 6b2dad8 commit fb37bf3

File tree

3 files changed: +19 -22 lines


examples/offline_inference_medusaspeculator.py (+9 -11)

@@ -36,15 +36,14 @@ def time_generation(llm: LLM, prompts: List[str],
                                      max_tokens=20)
 
     # Create an LLM without spec decoding
-    # print("==============Without speculation==================")
-    # llm = LLM(model="JackFram/llama-68m",
-    #           tensor_parallel_size=2)
+    print("==============Without speculation==================")
+    llm = LLM(model="JackFram/llama-68m")
 
-    # ret_non_spec, latency_per_token_non_spec = time_generation(
-    #     llm, prompts, sampling_params)
+    ret_non_spec, latency_per_token_non_spec = time_generation(
+        llm, prompts, sampling_params)
 
-    # del llm
-    # gc.collect()
+    del llm
+    gc.collect()
 
     # Create an LLM with spec decoding
     print("==============With speculation=====================")
@@ -53,7 +52,6 @@ def time_generation(llm: LLM, prompts: List[str],
         speculative_model="abhigoyal/vllm-medusa-llama-68m-random",
         num_speculative_tokens=5,
         use_v2_block_manager=True,
-        # tensor_parallel_size=2,
     )
 
     ret_spec, latency_per_token_spec = time_generation(llm, prompts,
@@ -63,8 +61,8 @@ def time_generation(llm: LLM, prompts: List[str],
     gc.collect()
     print("================= Summary =====================")
     print("input is ", prompts, "\n")
-    # print("Non Spec Decode - latency_per_token is ",
-    #       latency_per_token_non_spec)
-    # print("Generated Text is :", ret_non_spec, "\n")
+    print("Non Spec Decode - latency_per_token is ",
+          latency_per_token_non_spec)
+    print("Generated Text is :", ret_non_spec, "\n")
     print("Spec Decode - latency_per_token is ", latency_per_token_spec)
     print("Generated Text is :", ret_spec)

examples/offline_inference_spec_decode.py (+9 -10)

@@ -35,14 +35,14 @@ def time_generation(llm: LLM, prompts: List[str],
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
     # Create an LLM without spec decoding
-    # print("==============Without speculation==================")
-    # llm = LLM(model="facebook/opt-6.7b", tensor_parallel_size=2)
+    print("==============Without speculation==================")
+    llm = LLM(model="facebook/opt-6.7b")
 
-    # ret_non_spec, latency_per_token_non_spec = time_generation(
-    #     llm, prompts, sampling_params)
+    ret_non_spec, latency_per_token_non_spec = time_generation(
+        llm, prompts, sampling_params)
 
-    # del llm
-    # gc.collect()
+    del llm
+    gc.collect()
 
     # Create an LLM with spec decoding
     print("==============With speculation=====================")
@@ -52,7 +52,6 @@ def time_generation(llm: LLM, prompts: List[str],
         num_speculative_tokens=5,
         # These are currently required for MLPSpeculator decoding
         use_v2_block_manager=True,
-        # tensor_parallel_size=2,
     )
 
     ret_spec, latency_per_token_spec = time_generation(llm, prompts,
@@ -62,8 +61,8 @@ def time_generation(llm: LLM, prompts: List[str],
     gc.collect()
     print("================= Summary =====================")
     print("input is ", prompts, "\n")
-    # print("Non Spec Decode - latency_per_token is ",
-    #       latency_per_token_non_spec)
-    # print("Generated Text is :", ret_non_spec, "\n")
+    print("Non Spec Decode - latency_per_token is ",
+          latency_per_token_non_spec)
+    print("Generated Text is :", ret_non_spec, "\n")
    print("Spec Decode - latency_per_token is ", latency_per_token_spec)
     print("Generated Text is :", ret_spec)

vllm/spec_decode/hpu_draft_model_runner.py (+1 -1)

@@ -44,7 +44,7 @@ def __init__(self, *args, **kwargs):
         # because in spec_decode_worker determine_num_available_blocks()
         # is not called, so that warmup will fail. Simply adding this call
         # does not work since other proposers do not implement this method.
-        super().skip_warmup = True
+        self.model_runner.skip_warmup = True
 
     @torch.inference_mode()
     def execute_model(
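The one-line change moves the flag from the `super()` proxy onto the wrapped model runner. Attribute assignment through `super()` is not supported in Python, so the replaced line raises `AttributeError` at runtime. The standalone sketch below illustrates that behaviour with made-up class names; it does not reproduce the vLLM class hierarchy.

```python
class Runner:
    def __init__(self):
        self.skip_warmup = False


class DraftRunner(Runner):
    def __init__(self):
        super().__init__()
        try:
            # super() objects only support attribute lookup, not assignment.
            super().skip_warmup = True
        except AttributeError as exc:
            print("assignment via super() failed:", exc)
        # Setting the attribute on the owning object works as intended.
        self.skip_warmup = True


DraftRunner()  # prints the AttributeError message, then sets the flag
```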
