
Commit 800a47f

Shaoting Feng authored
Enhance Local GPU Caching, Disk Management, and Redis Sentinel Testing (LMCache#34)
* add max_local_cache_size
* fix remote disk directory
* clean up after remote disk tests
* clean up after local disk tests
* remove remote disk cache
* modify local disk configuration
* custom setting for redis test
* minor fix redis test
* add processing script for redis test
* Update driver.py
* make the query times same

Co-authored-by: Shaoting Feng <[email protected]>
1 parent 70ed161 commit 800a47f

5 files changed, +86 -20 lines changed

configs/lmcache_local_disk.yaml (+1 -1)

@@ -1,5 +1,5 @@
 chunk_size: 256
-local_device: "file:///local/lmcache-tests/"
+local_device: "file:///local/end-to-end-tests/local/"
 remote_url: null
 remote_serde: "cachegen"

configs/lmcache_local_gpu.yaml (+1)

@@ -4,3 +4,4 @@ remote_url: null

 # Whether retrieve() is pipelined or not
 pipelined_backend: False
+max_local_cache_size: 4
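
For context, a minimal sketch of reading the new knob back with PyYAML, mirroring the yaml.safe_load pattern that tests.py adopts below. This is not LMCache's own config loader, and the unit of the value is not stated in the diff; the snippet only shows where the key lives:

import yaml

with open("configs/lmcache_local_gpu.yaml") as f:
    cfg = yaml.safe_load(f)

# Key added by this commit; presumably an upper bound on the local cache size.
print(cfg.get("max_local_cache_size"))  # -> 4 for the patched file above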

driver.py (+6 -3)

@@ -21,6 +21,7 @@ class ExperimentResult:
     request_id: int
     TTFT: float
     throughput: float
+    latency: float

 @dataclass
 class ExperimentResultWithOutput:
@@ -57,9 +58,9 @@ def execute_one_request(
         """
         Execute the request and put the result into the queue
         """
-        ttft, thp = execute_openai_request(request, model, client)
+        ttft, thp, latency = execute_openai_request(request, model, client)
         logger.info(f"Request completed, TTFT = {ttft}, throughput = {thp}")
-        queue.put(ExperimentResult(request.timestamp, client_id, request_id, ttft, thp))
+        queue.put(ExperimentResult(request.timestamp, client_id, request_id, ttft, thp, latency))

     def execute_one_request_with_output(
         self,
@@ -211,12 +212,14 @@ def execute_openai_request(request: Request, model: str, client: openai.Client)

         ttft = first_token_time - start_time
         throughput = ntokens / (end_time - first_token_time)
+        latency = end_time - start_time
+
         logger.debug(f"Response: {''.join(messages)}")
     except Exception as e:
         logger.error(f"OpenAI request failed: {e}")
         return -1, -1

-    return ttft, throughput
+    return ttft, throughput, latency

 def execute_openai_request_with_output(request: Request, model: str, client: openai.Client) -> Tuple[float, float, str]:
     """
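
Given the definitions above (ttft = first_token_time - start_time, throughput = ntokens / (end_time - first_token_time), latency = end_time - start_time), the new latency field equals TTFT plus the decode time. A small sanity check with made-up numbers, not part of the commit:

# Hypothetical values: 0.5 s to first token, 40 tok/s, 100 generated tokens.
ttft, throughput, ntokens = 0.5, 40.0, 100
decode_time = ntokens / throughput        # end_time - first_token_time
latency = ttft + decode_time              # end_time - start_time
print(latency)                            # 3.0 seconds
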
outputs/process_redis.py (+45)

@@ -0,0 +1,45 @@
+import pandas as pd
+import numpy as np
+
+# Read CSV
+file_path = 'test_lmcache_redis_sentinel.csv'
+data = pd.read_csv(file_path)
+
+# Calculate ITL (1 / throughput)
+data['ITL'] = 1 / data['throughput']
+
+# Separate data for engine_id = 0 and engine_id = 1
+engine_0 = data[data['engine_id'] == 0]
+engine_1 = data[data['engine_id'] == 1]
+
+# Calculate aggregated metrics for engine_id = 0
+cache_avg_latency = engine_0['latency'].mean()
+cache_p90_latency = np.percentile(engine_0['latency'], 90)
+cache_avg_ttft = engine_0['TTFT'].mean()
+cache_avg_itl = engine_0['ITL'].mean()
+
+# Calculate aggregated metrics for engine_id = 1
+wocache_avg_latency = engine_1['latency'].mean()
+wocache_p90_latency = np.percentile(engine_1['latency'], 90)
+wocache_avg_ttft = engine_1['TTFT'].mean()
+wocache_avg_itl = engine_1['ITL'].mean()
+
+# Create final summarized row
+summary = {
+    "Concurrency": 1,  # Assume constant concurrency for this scenario
+    "Cache Avg. Latency": cache_avg_latency,
+    "Cache P90 Latency": cache_p90_latency,
+    "Cache Avg. TTFT": cache_avg_ttft,
+    "Cache Avg. ITL": cache_avg_itl,
+    "W/o Cache Avg. Latency": wocache_avg_latency,
+    "W/o Cache P90 Latency": wocache_p90_latency,
+    "W/o Cache Avg. TTFT": wocache_avg_ttft,
+    "W/o Cache Avg. ITL": wocache_avg_itl,
+}
+
+# Convert to DataFrame and save as CSV
+summary_df = pd.DataFrame([summary])
+output_path = 'redis_summary_one_row.csv'
+summary_df.to_csv(output_path, index=False)
+
+print("Table generated and saved to:", output_path)
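
In the redis sentinel test below, engine 0 is presumably the vLLM instance launched with the LMCache Sentinel config and engine 1 the baseline without LMCache, which is why the script labels them "Cache" and "W/o Cache". A hypothetical input layout, inferred from the ExperimentResult fields in driver.py plus an engine_id column; the values are made up:

import pandas as pd

sample = pd.DataFrame([
    {"engine_id": 0, "timestamp": 0.0, "client_id": 0, "request_id": 0,
     "TTFT": 0.4, "throughput": 35.0, "latency": 3.1},   # with LMCache
    {"engine_id": 1, "timestamp": 0.0, "client_id": 0, "request_id": 0,
     "TTFT": 1.9, "throughput": 34.0, "latency": 4.8},   # without LMCache
])
sample.to_csv("test_lmcache_redis_sentinel.csv", index=False)
# Running process_redis.py on such a file yields redis_summary_one_row.csv.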

tests/tests.py (+33 -16)

@@ -8,6 +8,7 @@
 from configs import BootstrapConfig, WorkloadConfig, Usecase
 from configs import VLLMConfig, VLLMOptionalConfig, LMCacheConfig, EngineType
 from utils import run_command, get_max_context_length
+import yaml

 ##### Helper functions #####
 def CreateSingleLocalBootstrapConfig(
@@ -133,9 +134,9 @@ def test_chunk_prefill(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.DataF
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length ) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length ) for length in lengths]

     test_case = TestCase(
         experiments = experiments,
@@ -206,7 +207,7 @@ def test_lmcache_local_gpu(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.D

     # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]

     test_case = TestCase(
         experiments = experiments,
@@ -245,14 +246,15 @@ def test_lmcache_local_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.
     This function tests local disk storage backend by comparing scenarios with and without lmcache.
     """
     # Start two servers: with lmcache and without lmcache
-    config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, "configs/lmcache_local_disk.yaml")
+    yaml_config = "configs/lmcache_local_disk.yaml"
+    config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, yaml_config)
     config2 = CreateSingleLocalBootstrapConfig(8001, 1, model, None)

     # Set vllm configuration for different models
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -262,6 +264,13 @@ def test_lmcache_local_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.

     # Run test case
     final_result = run_test_case(test_case)
+
+    # Clean up
+    with open(yaml_config, 'r') as file:
+        data = yaml.safe_load(file)
+    local_device = data.get('local_device') + "*"
+    os.system(f"rm -rf {local_device}")
+
     return final_result

 def test_lmcache_local_distributed(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.DataFrame:
@@ -277,7 +286,7 @@ def test_lmcache_local_distributed(model = "mistralai/Mistral-7B-Instruct-v0.2")
     # Set vllm configuration for different models
     ModelConfig(model, config)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -302,7 +311,7 @@ def test_lmcache_remote_cachegen(model = "mistralai/Mistral-7B-Instruct-v0.2") -
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -354,7 +363,7 @@ def test_lmcache_remote_safetensor(model = "mistralai/Mistral-7B-Instruct-v0.2")
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -379,7 +388,7 @@ def test_lmcache_safetensor_distributed(model = "mistralai/Mistral-7B-Instruct-v
     # Set vllm configuration for different models
     ModelConfig(model, config)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -399,13 +408,13 @@ def test_lmcache_remote_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd
     config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, "configs/lmcache_remote_cachegen.yaml")
     config2 = CreateSingleLocalBootstrapConfig(8001, 1, model, None)

-    config1.lmcache_config.remote_device = "/local/end-to-end-tests/lmcache-server"
+    config1.lmcache_config.remote_device = "/local/end-to-end-tests/lmcache-server/"

     # Set vllm configuration for different models
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -415,22 +424,30 @@ def test_lmcache_remote_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd

     # Run test case
     final_result = run_test_case(test_case)
+
+    # Clean up
+    os.system(f"rm -rf {config1.lmcache_config.remote_device}*")
+
     return final_result

 def test_lmcache_redis_sentinel(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.DataFrame:
-    config1 = CreateSingleLocalBootstrapConfig(8000, 1, model, "configs/lmcache_redis_sentinel_cachegen.yaml")
+    # Set up the master node
+    os.environ["REDIS_SERVICE_NAME"] = "redismaster"
+
+    config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, "configs/lmcache_redis_sentinel_cachegen.yaml")
+    config2 = CreateSingleLocalBootstrapConfig(8001, 1, model, None)

     # Set vllm configuration for different models
     ModelConfig(model, config1)
+    ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
-    #lengths = [8192, 16384, 24576]
-    lengths = [24576]
+    # Experiments: 10375 shared context, each experiments has 10 queries
+    lengths = [10375]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

     test_case = TestCase(
         experiments = experiments,
-        engines = [config1])
+        engines = [config1, config2])

     # Run test case
     final_result = run_test_case(test_case)
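
The REDIS_SERVICE_NAME value set in this test is the Sentinel service (master group) name that the cache backend is expected to resolve. A quick, hypothetical way to confirm a local Sentinel actually knows a master under that name, using redis-py; the localhost:26379 address is an assumption, not part of this repo:

from redis.sentinel import Sentinel

# Assumes a Sentinel listening on localhost:26379 that was started with
# `sentinel monitor redismaster <master-host> <master-port> <quorum>`.
sentinel = Sentinel([("localhost", 26379)], socket_timeout=0.5)
print(sentinel.discover_master("redismaster"))  # -> (master_host, master_port)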
