
Commit fb37bf3

revert back examples
1 parent 6b2dad8 commit fb37bf3

File tree

3 files changed: +19 -22 lines


examples/offline_inference_medusaspeculator.py (+9 -11)

@@ -36,15 +36,14 @@ def time_generation(llm: LLM, prompts: List[str],
                                      max_tokens=20)
 
     # Create an LLM without spec decoding
-    # print("==============Without speculation==================")
-    # llm = LLM(model="JackFram/llama-68m",
-    #           tensor_parallel_size=2)
+    print("==============Without speculation==================")
+    llm = LLM(model="JackFram/llama-68m")
 
-    # ret_non_spec, latency_per_token_non_spec = time_generation(
-    #     llm, prompts, sampling_params)
+    ret_non_spec, latency_per_token_non_spec = time_generation(
+        llm, prompts, sampling_params)
 
-    # del llm
-    # gc.collect()
+    del llm
+    gc.collect()
 
     # Create an LLM with spec decoding
     print("==============With speculation=====================")
@@ -53,7 +52,6 @@ def time_generation(llm: LLM, prompts: List[str],
         speculative_model="abhigoyal/vllm-medusa-llama-68m-random",
         num_speculative_tokens=5,
         use_v2_block_manager=True,
-        # tensor_parallel_size=2,
     )
 
     ret_spec, latency_per_token_spec = time_generation(llm, prompts,
@@ -63,8 +61,8 @@ def time_generation(llm: LLM, prompts: List[str],
     gc.collect()
     print("================= Summary =====================")
     print("input is ", prompts, "\n")
-    # print("Non Spec Decode - latency_per_token is ",
-    #       latency_per_token_non_spec)
-    # print("Generated Text is :", ret_non_spec, "\n")
+    print("Non Spec Decode - latency_per_token is ",
+          latency_per_token_non_spec)
+    print("Generated Text is :", ret_non_spec, "\n")
     print("Spec Decode - latency_per_token is ", latency_per_token_spec)
     print("Generated Text is :", ret_spec)

examples/offline_inference_spec_decode.py (+9 -10)

@@ -35,14 +35,14 @@ def time_generation(llm: LLM, prompts: List[str],
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
     # Create an LLM without spec decoding
-    # print("==============Without speculation==================")
-    # llm = LLM(model="facebook/opt-6.7b", tensor_parallel_size=2)
+    print("==============Without speculation==================")
+    llm = LLM(model="facebook/opt-6.7b")
 
-    # ret_non_spec, latency_per_token_non_spec = time_generation(
-    #     llm, prompts, sampling_params)
+    ret_non_spec, latency_per_token_non_spec = time_generation(
+        llm, prompts, sampling_params)
 
-    # del llm
-    # gc.collect()
+    del llm
+    gc.collect()
 
     # Create an LLM with spec decoding
     print("==============With speculation=====================")
@@ -52,7 +52,6 @@ def time_generation(llm: LLM, prompts: List[str],
         num_speculative_tokens=5,
         # These are currently required for MLPSpeculator decoding
         use_v2_block_manager=True,
-        # tensor_parallel_size=2,
     )
 
     ret_spec, latency_per_token_spec = time_generation(llm, prompts,
@@ -62,8 +61,8 @@ def time_generation(llm: LLM, prompts: List[str],
     gc.collect()
     print("================= Summary =====================")
     print("input is ", prompts, "\n")
-    # print("Non Spec Decode - latency_per_token is ",
-    #       latency_per_token_non_spec)
-    # print("Generated Text is :", ret_non_spec, "\n")
+    print("Non Spec Decode - latency_per_token is ",
+          latency_per_token_non_spec)
+    print("Generated Text is :", ret_non_spec, "\n")
    print("Spec Decode - latency_per_token is ", latency_per_token_spec)
     print("Generated Text is :", ret_spec)

vllm/spec_decode/hpu_draft_model_runner.py (+1 -1)

@@ -44,7 +44,7 @@ def __init__(self, *args, **kwargs):
         # because in spec_decode_worker determine_num_available_blocks()
         # is not called, so that warmup will fail. Simply adding this call
         # does not work since other proposers do not implement this method.
-        super().skip_warmup = True
+        self.model_runner.skip_warmup = True
 
     @torch.inference_mode()
     def execute_model(
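The one-line change moves the flag from the `super()` proxy onto the wrapped model runner. Attribute assignment through `super()` is not supported in Python, so the replaced line raises `AttributeError` at runtime. The standalone sketch below illustrates that behaviour with made-up class names; it does not reproduce the vLLM class hierarchy.

```python
class Runner:
    def __init__(self):
        self.skip_warmup = False


class DraftRunner(Runner):
    def __init__(self):
        super().__init__()
        try:
            # super() objects only support attribute lookup, not assignment.
            super().skip_warmup = True
        except AttributeError as exc:
            print("assignment via super() failed:", exc)
        # Setting the attribute on the owning object works as intended.
        self.skip_warmup = True


DraftRunner()  # prints the AttributeError message, then sets the flag
```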
