@@ -35,14 +35,14 @@ def time_generation(llm: LLM, prompts: List[str],
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

     # Create an LLM without spec decoding
-    # print("==============Without speculation==================")
-    # llm = LLM(model="facebook/opt-6.7b", tensor_parallel_size=2)
+    print("==============Without speculation==================")
+    llm = LLM(model="facebook/opt-6.7b")

-    # ret_non_spec, latency_per_token_non_spec = time_generation(
-    #     llm, prompts, sampling_params)
+    ret_non_spec, latency_per_token_non_spec = time_generation(
+        llm, prompts, sampling_params)

-    # del llm
-    # gc.collect()
+    del llm
+    gc.collect()

     # Create an LLM with spec decoding
     print("==============With speculation=====================")
@@ -52,7 +52,6 @@ def time_generation(llm: LLM, prompts: List[str],
         num_speculative_tokens=5,
         # These are currently required for MLPSpeculator decoding
         use_v2_block_manager=True,
-        # tensor_parallel_size=2,
     )

     ret_spec, latency_per_token_spec = time_generation(llm, prompts,
@@ -62,8 +61,8 @@ def time_generation(llm: LLM, prompts: List[str],
     gc.collect()
     print("================= Summary =====================")
     print("input is ", prompts, "\n")
-    # print("Non Spec Decode - latency_per_token is ",
-    #       latency_per_token_non_spec)
-    # print("Generated Text is :", ret_non_spec, "\n")
+    print("Non Spec Decode - latency_per_token is ",
+          latency_per_token_non_spec)
+    print("Generated Text is :", ret_non_spec, "\n")
     print("Spec Decode - latency_per_token is ", latency_per_token_spec)
     print("Generated Text is :", ret_spec)
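The second hunk shows only the tail of the speculative-decoding `LLM(...)` constructor; the target and draft model arguments sit above the hunk and are not visible here. Below is a hedged sketch of how the full call plausibly looks after this change, with `"facebook/opt-125m"` as a placeholder draft model that this commit does not specify. Note that with the `tensor_parallel_size=2` lines removed, the example defaults to a single GPU.

```python
from vllm import LLM

# Sketch only: "facebook/opt-125m" is a placeholder draft model; the actual
# speculative_model used by this example is not shown in the diff.
llm = LLM(
    model="facebook/opt-6.7b",
    speculative_model="facebook/opt-125m",
    num_speculative_tokens=5,
    # These are currently required for MLPSpeculator decoding
    use_v2_block_manager=True,
)
```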