
Commit e6a2959

Update the new test cases
1 parent 611f9b7 commit e6a2959

7 files changed: +169 -43 lines

bootstrapper.py (+26 -5)

@@ -83,12 +83,18 @@ class LocalLMCacheServerBootstrapper(Bootstrapper):
     """
     Bootstraps a local lmcache server
     """
-    def __init__(self, config: Bootstrapper, log_dir = "/tmp"):
+    def __init__(self, config: BootstrapConfig, log_dir = "/tmp"):
         super().__init__(config)
         server_config = self.parse_lmcache_server_config(self.config.lmcache_config.config_path)
         self.handle = None
         self.started = False
 
+        match config.lmcache_config.remote_device:
+            case None:
+                self.remote_device = "cpu"
+            case path:
+                self.remote_device = path
+
         if server_config is None:
             self.is_needed = False
         else:
@@ -126,7 +132,7 @@ def start(self):
         if not self.is_needed:
             return
 
-        cmd = f"python3 -um lmcache_server.server {self.host} {self.port}"
+        cmd = f"python3 -um lmcache_server.server {self.host} {self.port} {self.remote_device}"
         self.handle = run_command(cmd, self.stdout_log, self.stderr_log, detach=True)
         self.started = True
 
@@ -136,9 +142,10 @@ def wait_until_ready(self, timeout = 60) -> bool:
             return True
 
         if not self.is_healthy():
+            logger.error("LMCacheServer is dead!")
             return False
 
-        self._monitor_file_output([self.stdout_log, self.stderr_log], "Server started at", timeout=timeout)
+        return self._monitor_file_output([self.stdout_log, self.stderr_log], "Server started at", timeout=timeout)
 
         return True
 
@@ -185,6 +192,17 @@ def get_or_create(
 
         return cls._instances[instance_id]
 
+    @classmethod
+    def close_servers(cls):
+        """
+        Close and remove all the active lmcache servers
+        """
+        for instance_id, instance in cls._instances.items():
+            instance.close()
+        cls._instances = {}
+        cls._engine_types = {}
+
+
 
 class LocalVllmBootstrapper(Bootstrapper):
     """
@@ -200,11 +218,12 @@ def __init__(self, config: BootstrapConfig, log_dir = "/tmp"):
         self.lmcache_server_handler = LMCacheServerManager.get_or_create(config)
 
     def get_cmdline(self) -> str:
-        return f"python3 -m vllm.entrypoints.openai.api_server {self.config.vllm_config.cmdargs()} {self.config.vllm_optional_config.cmdargs()} {self.config.lmcache_config.cmdargs()}"
+        extra_args = "--trust-remote-code"
+        return f"python3 -m vllm.entrypoints.openai.api_server {self.config.vllm_config.cmdargs()} {self.config.vllm_optional_config.cmdargs()} {self.config.lmcache_config.cmdargs()} {extra_args}"
 
     def start(self):
         self.lmcache_server_handler.start()
-        self.lmcache_server_handler.wait_until_ready(timeout = 5)
+        self.lmcache_server_handler.wait_until_ready(timeout = 10)
 
         self.handle = run_command(
             self.command,
@@ -214,6 +233,7 @@ def start(self):
     def wait_until_ready(self, timeout = 60) -> bool:
         # Try reading the log file to see if the server is ready
        if not self.is_healthy():
+            logger.error(f"VLLM or lmcache server is dead!")
            return False
 
        if not os.path.exists(self.stdout_log):
@@ -224,6 +244,7 @@ def wait_until_ready(self, timeout = 60) -> bool:
 
     def is_healthy(self) -> bool:
         if not self.lmcache_server_handler.is_healthy():
+            logger.warn(f"LMCache Server is dead during vLLM's check!")
             return False
 
         if self.handle is not None:

configs.py (+2)

@@ -51,6 +51,8 @@ class LMCacheConfig(Config):
     # Path to the lmcache configuration
     config_path: str
 
+    remote_device: Optional[str] = None
+
     def cmdargs(self) -> str:
         return f"--lmcache-config-file {self.config_path}" if self.config_path is not None else ""
 
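The new remote_device field defaults to None and is deliberately left out of cmdargs(), so it only steers the locally bootstrapped lmcache server rather than the vLLM command line. A hedged sketch of how a test sets it (field name and usage come from this commit, the class body is simplified for illustration):

from dataclasses import dataclass
from typing import Optional

@dataclass
class LMCacheConfigSketch:
    # Simplified illustration of configs.LMCacheConfig with only the two fields used here
    config_path: Optional[str] = None
    remote_device: Optional[str] = None

    def cmdargs(self) -> str:
        # Unchanged behaviour: remote_device never appears in the vLLM arguments
        return f"--lmcache-config-file {self.config_path}" if self.config_path is not None else ""

cfg = LMCacheConfigSketch(config_path="configs/lmcache_remote_cachegen.yaml")
cfg.remote_device = "/local/lmcache-tests/lmcache-server"   # same pattern as test_lmcache_remote_disk in main.py
print(cfg.cmdargs())   # --lmcache-config-file configs/lmcache_remote_cachegen.yaml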

configs/lmcache_remote_cachegen_pipeline.yaml (+1 -1)

@@ -1,6 +1,6 @@
 chunk_size: 256
 local_device: null
-remote_url: "lm://localhost:65431"
+remote_url: "lm://localhost:65430"
 remote_serde: "cachegen"
 
 # Whether retrieve() is pipelined or not

configs/lmcache_remote_safetensor_pipeline.yaml (+1 -1)

@@ -1,6 +1,6 @@
 chunk_size: 256
 local_device: null
-remote_url: "lm://localhost:65431"
+remote_url: "lm://localhost:65430"
 remote_serde: "safetensor"
 
 # Whether retrieve() is pipelined or not

driver.py (+32 -24)

@@ -7,7 +7,7 @@
 
 from configs import BootstrapConfig, WorkloadConfig, Usecase
 from test_cases import TestCase
-from bootstrapper import CreateBootstrapper, Bootstrapper
+from bootstrapper import CreateBootstrapper, Bootstrapper, LMCacheServerManager
 from workload import CreateWorkloadGenerator, Request
 from utils import read_gpu_memory
 
@@ -106,27 +106,32 @@ def execute_openai_request(request: Request, model: str, client: openai.Client)
     #time.sleep(t)
     #return t, t
 
-    chat_completion = client.chat.completions.create(
-        messages = messages,
-        model = model,
-        temperature = 0,
-        stream = True,
-    )
-
-
-    start_time = time.perf_counter()
-    first_token_time = None
-    ntokens = 0
-    for chunk in chat_completion:
-        chunk_message = chunk.choices[0].delta.content
-        if chunk_message is not None:
-            if first_token_time is None:
-                first_token_time = time.perf_counter()
-            ntokens += 1
-    end_time = time.perf_counter()
-
-    ttft = first_token_time - start_time
-    throughput = ntokens / (end_time - first_token_time)
+
+    try:
+        chat_completion = client.chat.completions.create(
+            messages = messages,
+            model = model,
+            temperature = 0,
+            stream = True,
+        )
+
+        start_time = time.perf_counter()
+        first_token_time = None
+        ntokens = 0
+        for chunk in chat_completion:
+            chunk_message = chunk.choices[0].delta.content
+            if chunk_message is not None:
+                if first_token_time is None:
+                    first_token_time = time.perf_counter()
+                ntokens += 1
+        end_time = time.perf_counter()
+
+        ttft = first_token_time - start_time
+        throughput = ntokens / (end_time - first_token_time)
+    except Exception as e:
+        logger.error(f"OpenAI request failed: {e}")
+        return -1, -1
 
     return ttft, throughput
 
 
@@ -151,6 +156,9 @@ def cleanup(bootstrappers: List[Bootstrapper]):
         logger.info("Cleanning up the engine processes")
         for bootstrapper in bootstrappers:
             bootstrapper.close()
+        LMCacheServerManager.close_servers()
+
+    logger.info(f"Running experiment: {workload_config.desc()} {usecase}")
 
     # Create the workloads
     workload_generators = [CreateWorkloadGenerator(workload_config, usecase) for _ in engine_configs]
@@ -165,7 +173,7 @@ def cleanup(bootstrappers: List[Bootstrapper]):
     try:
         # Wait for the engines to be ready
         for bootstrapper in bootstrappers:
-            ready = bootstrapper.wait_until_ready(timeout = 120)
+            ready = bootstrapper.wait_until_ready(timeout = 180)
            if not ready:
                logger.error(f"Engine {bootstrapper} is not ready")
                cleanup(bootstrappers)
@@ -180,7 +188,7 @@ def cleanup(bootstrappers: List[Bootstrapper]):
        executor.schedule_requests(workloads, clients, models)
        results = executor.execute_all()
 
-        print(results)
+        #print(results)
 
        # Read GPU memory utilization
        gpu_usage = read_gpu_memory()
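The rewritten execute_openai_request wraps the streaming completion in try/except so a failed request or dead engine returns a (-1, -1) sentinel instead of aborting the whole experiment. A self-contained sketch of the same TTFT/throughput measurement pattern (client construction and message contents are placeholders, not part of this commit):

import time
import openai

def measure_stream(client: openai.Client, model: str, messages: list) -> tuple:
    # Returns (ttft, throughput) for one streamed chat completion, or (-1, -1) on failure
    try:
        stream = client.chat.completions.create(
            messages=messages, model=model, temperature=0, stream=True)
        start_time = time.perf_counter()
        first_token_time = None
        ntokens = 0
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                ntokens += 1
        end_time = time.perf_counter()
        # TTFT = time to first streamed token; throughput = tokens per second after the first token
        return first_token_time - start_time, ntokens / (end_time - first_token_time)
    except Exception as e:
        print(f"OpenAI request failed: {e}")
        return -1, -1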

log.py (+45 -4)

@@ -1,12 +1,53 @@
 import logging
 from logging import Logger
 
-logging.basicConfig(
-    format='\033[33m%(levelname)s: \033[0m%(message)s [%(asctime)s.%(msecs)03d]',
-    level=logging.INFO,
-)
+#logging.basicConfig(
+#    format='\033[33m%(levelname)s: \033[0m%(message)s [%(asctime)s.%(msecs)03d]',
+#    level=logging.INFO,
+#    )
+
+def build_format(color):
+    reset = "\x1b[0m"
+    underline = "\x1b[3m"
+    return f"{color}[%(asctime)s] %(levelname)s:{reset} %(message)s {underline}(%(filename)s:%(lineno)d:%(name)s){reset}"
+
+class CustomFormatter(logging.Formatter):
+
+    grey = "\x1b[1m"
+    green = "\x1b[32;20m"
+    yellow = "\x1b[33;20m"
+    red = "\x1b[31;20m"
+    bold_red = "\x1b[31;1m"
+    reset = "\x1b[0m"
+
+    FORMATS = {
+        logging.DEBUG: build_format(grey),
+        logging.INFO: build_format(green),
+        logging.WARNING: build_format(yellow),
+        logging.ERROR: build_format(red),
+        logging.CRITICAL: build_format(bold_red),
+    }
+
+    def format(self, record):
+        log_fmt = self.FORMATS.get(record.levelno)
+        formatter = logging.Formatter(log_fmt)
+        return formatter.format(record)
 
 def init_logger(name: str) -> Logger:
     logger = logging.getLogger(name)
+
+    ch = logging.StreamHandler()
+    ch.setLevel(logging.DEBUG)
+    ch.setFormatter(CustomFormatter())
+    logger.addHandler(ch)
     logger.setLevel(logging.DEBUG)
+
     return logger
+
+if __name__ == "__main__":
+    logger = init_logger(__name__)
+    logger.debug("Debug message")
+    logger.info("Info message")
+    logger.warning("Warning message")
+    logger.error("Error message")
+    logger.critical("Critical message")

main.py (+62 -8)

@@ -26,7 +26,7 @@ def CreateSingleLocalBootstrapConfig(
         envs = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
     )
 
-def CreateDummyExperiment(num_requests, context_length, gap_between_requests = 10):
+def CreateDummyExperiment(num_requests, context_length, gap_between_requests = 8):
     """
     Create some requests for DUMMY usecase
     The query length will be 16
@@ -86,7 +86,7 @@ def test_lmcache_local_cpu() -> pd.DataFrame:
 
     # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]
 
     test_case = TestCase(
         experiments = experiments,
@@ -103,7 +103,7 @@ def test_lmcache_local_disk() -> pd.DataFrame:
 
     # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]
 
     test_case = TestCase(
         experiments = experiments,
@@ -120,7 +120,7 @@ def test_lmcache_remote_cachegen() -> pd.DataFrame:
 
     # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]
 
     test_case = TestCase(
         experiments = experiments,
@@ -137,7 +137,26 @@ def test_lmcache_remote_safetensor() -> pd.DataFrame:
 
     # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]
+
+    test_case = TestCase(
+        experiments = experiments,
+        engines = [config1, config2])
+
+    # Run test case
+    final_result = run_test_case(test_case)
+    return final_result
+
+def test_lmcache_remote_disk() -> pd.DataFrame:
+    # Start two servers: with lmcache and without lmcache
+    config1 = CreateSingleLocalBootstrapConfig(8000, 0, "mistralai/Mistral-7B-Instruct-v0.2", "configs/lmcache_remote_cachegen.yaml")
+    config2 = CreateSingleLocalBootstrapConfig(8001, 1, "mistralai/Mistral-7B-Instruct-v0.2", None)
+
+    config1.lmcache_config.remote_device = "/local/lmcache-tests/lmcache-server"
+
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    lengths = [8192, 16384, 24576]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]
 
     test_case = TestCase(
         experiments = experiments,
@@ -147,11 +166,46 @@ def test_lmcache_remote_safetensor() -> pd.DataFrame:
     final_result = run_test_case(test_case)
     return final_result
 
+def test_lmcache_chatglm() -> pd.DataFrame:
+    # Start two servers: with lmcache and without lmcache
+    config1 = CreateSingleLocalBootstrapConfig(8000, 0, "THUDM/glm-4-9b-chat", "configs/lmcache_remote_cachegen.yaml")
+    config2 = CreateSingleLocalBootstrapConfig(8001, 1, "THUDM/glm-4-9b-chat", None)
+
+    config1.vllm_config.tensor_parallel_size = 2
+    config1.vllm_config.gpu_memory_utilization = 0.8
+    config1.envs = {}
+    config1.vllm_optional_config["trust_remote_code"] = ""
+
+    config2.vllm_config.tensor_parallel_size = 2
+    config2.vllm_config.gpu_memory_utilization = 0.8
+    config2.envs = {}
+    config2.vllm_optional_config["trust_remote_code"] = ""
+
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    lengths = [8192, 16384, 24576]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]
+
+    test_case1 = TestCase(
+        experiments = experiments,
+        engines = [config1])
+
+    test_case2 = TestCase(
+        experiments = experiments,
+        engines = [config2])
+
+    # Run test case
+    final_result1 = run_test_case(test_case1)
+    final_result2 = run_test_case(test_case2)
+    final_result1["engine_id"] = 0
+    final_result2["engine_id"] = 1
+    return pd.concat([final_result1, final_result2])
 
 if __name__ == "__main__":
     print("Start running test cases")
     #wrapped_runner(test_lmcache_local_gpu, "outputs/test_lmcache_local_gpu.csv")
     #wrapped_runner(test_lmcache_local_cpu, "outputs/test_lmcache_local_cpu.csv")
-    wrapped_runner(test_lmcache_local_disk, "outputs/test_lmcache_local_disk.csv")
-    wrapped_runner(test_lmcache_remote_cachegen, "outputs/test_lmcache_remote_cachegen.csv")
-    wrapped_runner(test_lmcache_remote_cachegen, "outputs/test_lmcache_remote_safetensor.csv")
+    #wrapped_runner(test_lmcache_local_disk, "outputs/test_lmcache_local_disk.csv")
+    #wrapped_runner(test_lmcache_remote_safetensor, "outputs/test_lmcache_remote_safetensor.csv")
+    #wrapped_runner(test_lmcache_remote_cachegen, "outputs/test_lmcache_remote_cachegen.csv")
+    #wrapped_runner(test_lmcache_remote_disk, "outputs/test_lmcache_remote_disk.csv")
+    wrapped_runner(test_lmcache_chatglm, "outputs/test_lmcache_chatglm.csv")
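With this commit only test_lmcache_chatglm is active in __main__; the other cases stay available behind comments. For orientation, a hedged sketch of what a wrapped_runner-style helper is assumed to do here (the real wrapped_runner lives elsewhere in this repo and may differ): call one test function that returns a pandas DataFrame and persist it to the given CSV path.

import traceback
import pandas as pd

def wrapped_runner_sketch(test_fn, output_csv: str) -> None:
    # Assumed behaviour only: execute the test case end-to-end and save its results
    try:
        result: pd.DataFrame = test_fn()
        result.to_csv(output_csv, index=False)
        print(f"Saved results of {test_fn.__name__} to {output_csv}")
    except Exception:
        traceback.print_exc()
        print(f"Test {test_fn.__name__} failed, no results written")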
