
Commit 800a47f

Shaoting Feng authored
Enhance Local GPU Caching, Disk Management, and Redis Sentinel Testing (LMCache#34)
* add max_local_cache_size
* fix remote disk directory
* clean up after remote disk tests
* clean up after local disk tests
* remove remote disk cache
* modify local disk configuration
* custom setting for redis test
* minor fix redis test
* add processing script for redis test
* Update driver.py
* make the query times same

Co-authored-by: Shaoting Feng <[email protected]>
1 parent 70ed161 commit 800a47f

5 files changed, +86 -20 lines changed

configs/lmcache_local_disk.yaml (+1 -1)

@@ -1,5 +1,5 @@
 chunk_size: 256
-local_device: "file:///local/lmcache-tests/"
+local_device: "file:///local/end-to-end-tests/local/"
 remote_url: null
 remote_serde: "cachegen"

configs/lmcache_local_gpu.yaml (+1)

@@ -4,3 +4,4 @@ remote_url: null

 # Whether retrieve() is pipelined or not
 pipelined_backend: False
+max_local_cache_size: 4
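
For context, a minimal sketch of reading the new knob back with PyYAML, mirroring the yaml.safe_load pattern that tests.py adopts below. This is not LMCache's own config loader, and the unit of the value is not stated in the diff; the snippet only shows where the key lives:

import yaml

with open("configs/lmcache_local_gpu.yaml") as f:
    cfg = yaml.safe_load(f)

# Key added by this commit; presumably an upper bound on the local cache size.
print(cfg.get("max_local_cache_size"))  # -> 4 for the patched file above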

driver.py (+6 -3)

@@ -21,6 +21,7 @@ class ExperimentResult:
     request_id: int
     TTFT: float
     throughput: float
+    latency: float

 @dataclass
 class ExperimentResultWithOutput:
@@ -57,9 +58,9 @@ def execute_one_request(
         """
         Execute the request and put the result into the queue
         """
-        ttft, thp = execute_openai_request(request, model, client)
+        ttft, thp, latency = execute_openai_request(request, model, client)
         logger.info(f"Request completed, TTFT = {ttft}, throughput = {thp}")
-        queue.put(ExperimentResult(request.timestamp, client_id, request_id, ttft, thp))
+        queue.put(ExperimentResult(request.timestamp, client_id, request_id, ttft, thp, latency))

     def execute_one_request_with_output(
         self,
@@ -211,12 +212,14 @@ def execute_openai_request(request: Request, model: str, client: openai.Client)

         ttft = first_token_time - start_time
         throughput = ntokens / (end_time - first_token_time)
+        latency = end_time - start_time
+
         logger.debug(f"Response: {''.join(messages)}")
     except Exception as e:
         logger.error(f"OpenAI request failed: {e}")
         return -1, -1

-    return ttft, throughput
+    return ttft, throughput, latency

 def execute_openai_request_with_output(request: Request, model: str, client: openai.Client) -> Tuple[float, float, str]:
     """
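
Given the definitions above (ttft = first_token_time - start_time, throughput = ntokens / (end_time - first_token_time), latency = end_time - start_time), the new latency field equals TTFT plus the decode time. A small sanity check with made-up numbers, not part of the commit:

# Hypothetical values: 0.5 s to first token, 40 tok/s, 100 generated tokens.
ttft, throughput, ntokens = 0.5, 40.0, 100
decode_time = ntokens / throughput        # end_time - first_token_time
latency = ttft + decode_time              # end_time - start_time
print(latency)                            # 3.0 seconds
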
outputs/process_redis.py (+45)

@@ -0,0 +1,45 @@
+import pandas as pd
+import numpy as np
+
+# Read CSV
+file_path = 'test_lmcache_redis_sentinel.csv'
+data = pd.read_csv(file_path)
+
+# Calculate ITL (1 / throughput)
+data['ITL'] = 1 / data['throughput']
+
+# Separate data for engine_id = 0 and engine_id = 1
+engine_0 = data[data['engine_id'] == 0]
+engine_1 = data[data['engine_id'] == 1]
+
+# Calculate aggregated metrics for engine_id = 0
+cache_avg_latency = engine_0['latency'].mean()
+cache_p90_latency = np.percentile(engine_0['latency'], 90)
+cache_avg_ttft = engine_0['TTFT'].mean()
+cache_avg_itl = engine_0['ITL'].mean()
+
+# Calculate aggregated metrics for engine_id = 1
+wocache_avg_latency = engine_1['latency'].mean()
+wocache_p90_latency = np.percentile(engine_1['latency'], 90)
+wocache_avg_ttft = engine_1['TTFT'].mean()
+wocache_avg_itl = engine_1['ITL'].mean()
+
+# Create final summarized row
+summary = {
+    "Concurrency": 1,  # Assume constant concurrency for this scenario
+    "Cache Avg. Latency": cache_avg_latency,
+    "Cache P90 Latency": cache_p90_latency,
+    "Cache Avg. TTFT": cache_avg_ttft,
+    "Cache Avg. ITL": cache_avg_itl,
+    "W/o Cache Avg. Latency": wocache_avg_latency,
+    "W/o Cache P90 Latency": wocache_p90_latency,
+    "W/o Cache Avg. TTFT": wocache_avg_ttft,
+    "W/o Cache Avg. ITL": wocache_avg_itl,
+}
+
+# Convert to DataFrame and save as CSV
+summary_df = pd.DataFrame([summary])
+output_path = 'redis_summary_one_row.csv'
+summary_df.to_csv(output_path, index=False)
+
+print("Table generated and saved to:", output_path)
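
In the redis sentinel test below, engine 0 is presumably the vLLM instance launched with the LMCache Sentinel config and engine 1 the baseline without LMCache, which is why the script labels them "Cache" and "W/o Cache". A hypothetical input layout, inferred from the ExperimentResult fields in driver.py plus an engine_id column; the values are made up:

import pandas as pd

sample = pd.DataFrame([
    {"engine_id": 0, "timestamp": 0.0, "client_id": 0, "request_id": 0,
     "TTFT": 0.4, "throughput": 35.0, "latency": 3.1},   # with LMCache
    {"engine_id": 1, "timestamp": 0.0, "client_id": 0, "request_id": 0,
     "TTFT": 1.9, "throughput": 34.0, "latency": 4.8},   # without LMCache
])
sample.to_csv("test_lmcache_redis_sentinel.csv", index=False)
# Running process_redis.py on such a file yields redis_summary_one_row.csv.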

tests/tests.py (+33 -16)

@@ -8,6 +8,7 @@
 from configs import BootstrapConfig, WorkloadConfig, Usecase
 from configs import VLLMConfig, VLLMOptionalConfig, LMCacheConfig, EngineType
 from utils import run_command, get_max_context_length
+import yaml

 ##### Helper functions #####
 def CreateSingleLocalBootstrapConfig(
@@ -133,9 +134,9 @@ def test_chunk_prefill(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.DataF
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length ) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length ) for length in lengths]

     test_case = TestCase(
         experiments = experiments,
@@ -206,7 +207,7 @@ def test_lmcache_local_gpu(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.D

     # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
-    experiments = [CreateDummyExperiment(5, length) for length in lengths]
+    experiments = [CreateDummyExperiment(10, length) for length in lengths]

     test_case = TestCase(
         experiments = experiments,
@@ -245,14 +246,15 @@ def test_lmcache_local_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.
     This function tests local disk storage backend by comparing scenarios with and without lmcache.
     """
     # Start two servers: with lmcache and without lmcache
-    config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, "configs/lmcache_local_disk.yaml")
+    yaml_config = "configs/lmcache_local_disk.yaml"
+    config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, yaml_config)
     config2 = CreateSingleLocalBootstrapConfig(8001, 1, model, None)

     # Set vllm configuration for different models
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -262,6 +264,13 @@ def test_lmcache_local_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.

     # Run test case
     final_result = run_test_case(test_case)
+
+    # Clean up
+    with open(yaml_config, 'r') as file:
+        data = yaml.safe_load(file)
+    local_device = data.get('local_device') + "*"
+    os.system(f"rm -rf {local_device}")
+
     return final_result

 def test_lmcache_local_distributed(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.DataFrame:
@@ -277,7 +286,7 @@ def test_lmcache_local_distributed(model = "mistralai/Mistral-7B-Instruct-v0.2")
     # Set vllm configuration for different models
     ModelConfig(model, config)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -302,7 +311,7 @@ def test_lmcache_remote_cachegen(model = "mistralai/Mistral-7B-Instruct-v0.2") -
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -354,7 +363,7 @@ def test_lmcache_remote_safetensor(model = "mistralai/Mistral-7B-Instruct-v0.2")
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -379,7 +388,7 @@ def test_lmcache_safetensor_distributed(model = "mistralai/Mistral-7B-Instruct-v
     # Set vllm configuration for different models
     ModelConfig(model, config)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -399,13 +408,13 @@ def test_lmcache_remote_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd
     config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, "configs/lmcache_remote_cachegen.yaml")
     config2 = CreateSingleLocalBootstrapConfig(8001, 1, model, None)

-    config1.lmcache_config.remote_device = "/local/end-to-end-tests/lmcache-server"
+    config1.lmcache_config.remote_device = "/local/end-to-end-tests/lmcache-server/"

     # Set vllm configuration for different models
     ModelConfig(model, config1)
     ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
+    # Experiments: 8K, 16K, 24K shared context, each experiments has 10 queries
     lengths = [8192, 16384, 24576]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

@@ -415,22 +424,30 @@ def test_lmcache_remote_disk(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd

     # Run test case
     final_result = run_test_case(test_case)
+
+    # Clean up
+    os.system(f"rm -rf {config1.lmcache_config.remote_device}*")
+
     return final_result

 def test_lmcache_redis_sentinel(model = "mistralai/Mistral-7B-Instruct-v0.2") -> pd.DataFrame:
-    config1 = CreateSingleLocalBootstrapConfig(8000, 1, model, "configs/lmcache_redis_sentinel_cachegen.yaml")
+    # Set up the master node
+    os.environ["REDIS_SERVICE_NAME"] = "redismaster"
+
+    config1 = CreateSingleLocalBootstrapConfig(8000, 0, model, "configs/lmcache_redis_sentinel_cachegen.yaml")
+    config2 = CreateSingleLocalBootstrapConfig(8001, 1, model, None)

     # Set vllm configuration for different models
     ModelConfig(model, config1)
+    ModelConfig(model, config2)

-    # Experiments: 8K, 16K, 24K shared context, each experiments has 5 queries
-    #lengths = [8192, 16384, 24576]
-    lengths = [24576]
+    # Experiments: 10375 shared context, each experiments has 10 queries
+    lengths = [10375]
     experiments = [CreateDummyExperiment(10, length) for length in lengths]

     test_case = TestCase(
         experiments = experiments,
-        engines = [config1])
+        engines = [config1, config2])

     # Run test case
     final_result = run_test_case(test_case)
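
The REDIS_SERVICE_NAME value set in this test is the Sentinel service (master group) name that the cache backend is expected to resolve. A quick, hypothetical way to confirm a local Sentinel actually knows a master under that name, using redis-py; the localhost:26379 address is an assumption, not part of this repo:

from redis.sentinel import Sentinel

# Assumes a Sentinel listening on localhost:26379 that was started with
# `sentinel monitor redismaster <master-host> <master-port> <quorum>`.
sentinel = Sentinel([("localhost", 26379)], socket_timeout=0.5)
print(sentinel.discover_master("redismaster"))  # -> (master_host, master_port)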
