Skip to content

Commit c1a298d

Browse files
authored
add remote disk support (#2)
* add remote disk support * add simple readme * Improve disk loading speed by using naive open & read/write * change comment
1 parent e6975c1 commit c1a298d

File tree

5 files changed

+326
-9
lines changed

5 files changed

+326
-9
lines changed

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
11
# lmcache-server
2+
## Start lmcache-server
3+
```
4+
python3 -m lmcache_server.server localhost <port> <storage>
5+
<port>: an arbitrary port
6+
<storage>: "cpu" for in-memory storage; any other value is treated as a disk path (e.g., remote_disk/)
7+
```

lmcache_server/server.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
import torch
55
from io import BytesIO
66
from lmcache.protocol import ClientMetaMessage, ServerMetaMessage, Constants
7+
from lmcache_server.storage_backend import CreateStorageBackend
78

89
class LMCacheServer:
9-
def __init__(self, host, port):
10+
def __init__(self, host, port, device):
1011
self.host = host
1112
self.port = port
12-
self.data_store = {}
13+
#self.data_store = {}
14+
self.data_store = CreateStorageBackend(device)
1315
self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1416
self.server_socket.bind((host, port))
1517
self.server_socket.listen()
@@ -36,15 +38,17 @@ def handle_client(self, client_socket):
3638
t0 = time.perf_counter()
3739
s = self.receive_all(client_socket, meta.length)
3840
t1 = time.perf_counter()
39-
self.data_store[meta.key] = s
41+
#self.data_store[meta.key] = s
42+
self.data_store.put(meta.key, s)
4043
t2 = time.perf_counter()
4144
#client_socket.sendall(ServerMetaMessage(Constants.SERVER_SUCCESS, 0).serialize())
4245
#t3 = time.perf_counter()
4346
print(f"Time to receive data: {t1 - t0}, time to store data: {t2 - t1}")
4447

4548
case Constants.CLIENT_GET:
4649
t0 = time.perf_counter()
47-
data_string = self.data_store.get(meta.key, None)
50+
#data_string = self.data_store.get(meta.key, None)
51+
data_string = self.data_store.get(meta.key)
4852
t1 = time.perf_counter()
4953
if data_string is not None:
5054
client_socket.sendall(ServerMetaMessage(Constants.SERVER_SUCCESS, len(data_string)).serialize())
@@ -56,11 +60,12 @@ def handle_client(self, client_socket):
5660
client_socket.sendall(ServerMetaMessage(Constants.SERVER_FAIL, 0).serialize())
5761

5862
case Constants.CLIENT_EXIST:
59-
code = Constants.SERVER_SUCCESS if meta.key in self.data_store else Constants.SERVER_FAIL
63+
#code = Constants.SERVER_SUCCESS if meta.key in self.data_store else Constants.SERVER_FAIL
64+
code = Constants.SERVER_SUCCESS if meta.key in self.data_store.list_keys() else Constants.SERVER_FAIL
6065
client_socket.sendall(ServerMetaMessage(code, 0).serialize())
6166

6267
case Constants.CLIENT_LIST:
63-
keys = list(self.data_store.keys())
68+
keys = list(self.data_store.list_keys())
6469
data = "\n".join(keys).encode()
6570
client_socket.sendall(ServerMetaMessage(Constants.SERVER_SUCCESS, len(data)).serialize())
6671
client_socket.sendall(data)
@@ -80,13 +85,17 @@ def run(self):
8085

8186
if __name__ == "__main__":
    import sys

    # Usage: <host> <port> [<storage>] — storage defaults to "cpu".
    if len(sys.argv) not in [3, 4]:
        print(f"Usage: {sys.argv[0]} <host> <port> <storage>(default:cpu)")
        sys.exit(1)  # exit() relies on the site module; sys.exit is always available

    host = sys.argv[1]
    port = int(sys.argv[2])
    # Optional third argument selects the storage backend (see CreateStorageBackend).
    device = sys.argv[3] if len(sys.argv) == 4 else "cpu"

    server = LMCacheServer(host, port, device)
    server.run()
92101

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from lmcache_server.storage_backend.abstract_backend import LMSBackendInterface
2+
from lmcache_server.storage_backend.local_backend import LMSLocalBackend, LMSLocalDiskBackend
3+
from lmcache.logging import init_logger
4+
5+
logger = init_logger(__name__)
6+
7+
8+
def CreateStorageBackend(
9+
device: str
10+
) -> LMSBackendInterface:
11+
match device:
12+
case "cpu":
13+
# cpu only
14+
logger.info("Initializing cpu-only cache server")
15+
return LMSLocalBackend()
16+
17+
case _:
18+
# cpu only
19+
logger.info("Initializing disk-only cache server")
20+
return LMSLocalDiskBackend(path=device)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import abc
2+
import torch
3+
from lmcache.logging import init_logger
4+
from typing import Tuple, Optional, Iterator, List
5+
6+
logger = init_logger(__name__)
7+
8+
class LMSBackendInterface(metaclass=abc.ABCMeta):
    """
    Abstract interface that every storage backend of the cache server
    implements (in-memory, disk, etc.).
    """

    @abc.abstractmethod
    def put(
            self,
            key: str,
            kv_chunk_bytes: bytes,
            blocking = True,
        ) -> None:
        """
        Store the KV cache of the tokens into the cache server.

        Input:
            key: the key of the token chunk, in the format of str
            kv_chunk_bytes: the serialized kv cache (bytes) of the token chunk
            blocking: whether to block the call before the operation is completed

        Returns:
            None

        Note:
            The KV cache should NOT have the "batch" dimension.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def contains(
            self,
            key: str,
        ) -> bool:
        """
        Query whether a key is present in the cache.

        Input:
            key: the key of the token chunk

        Returns:
            True if the key is in the cache, False otherwise
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get(
            self,
            key: str,
        ) -> Optional[torch.Tensor]:
        """
        Retrieve the KV cache chunk by the given key.

        Input:
            key: the key of the token chunk, including prefix hash and format

        Output:
            the kv cache of the token chunk, in the format of a big tensor
            None if the key is not found
        """
        raise NotImplementedError

    @abc.abstractmethod
    def list_keys(
            self,
        ) -> List[str]:
        """
        List all keys currently stored in the backend.

        Output:
            a list of all stored keys
            (the original docstring here was a copy-paste of get()'s)
        """
        raise NotImplementedError


    def close(self):
        """
        Do the cleanup things
        Children classes should override this method if necessary
        """
        pass
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
from typing import Tuple, Optional, Iterator, List
2+
from safetensors import safe_open
3+
from safetensors.torch import save_file
4+
import re
5+
import io
6+
import torch
7+
import redis
8+
import os
9+
import pickle
10+
11+
from lmcache_server.storage_backend.abstract_backend import LMSBackendInterface
12+
from lmcache.logging import init_logger
13+
from lmcache.utils import _lmcache_nvtx_annotate
14+
15+
logger = init_logger(__name__)
16+
17+
class LMSLocalBackend(LMSBackendInterface):
    """
    Cache engine for storing the KV cache of the tokens in the local cpu/gpu memory.
    """
    def __init__(
            self,
        ):
        """
        Initialize an empty in-memory key -> bytes store.
        """
        super().__init__()

        # key -> serialized KV chunk bytes
        self.dict = {}

    def list_keys(
            self
        ) -> List[str]:
        """
        List all keys currently stored in the backend.
        """
        return list(self.dict.keys())

    def contains(
            self,
            key: str,
        ) -> bool:
        """
        Check if the cache engine contains the key.

        Input:
            key: the key of the token chunk, including prefix hash and format

        Returns:
            True if the cache engine contains the key, False otherwise
        """
        return key in self.dict

    def put(
            self,
            key: str,
            kv_chunk_bytes: bytes,
            blocking: bool = True,
        ) -> None:
        """
        Store the KV cache of the tokens into the cache engine.

        Input:
            key: the key of the token chunk, including prefix hash and format
            kv_chunk_bytes: the serialized kv cache of the token chunk
            blocking: ignored here — this backend always stores synchronously

        Returns:
            None

        Note:
            The KV cache should NOT have the "batch" dimension.
        """
        if not blocking:
            # logger.warn is deprecated; warning is the supported spelling.
            logger.warning("Non-blocking is not implemented for local backend")
        self.dict[key] = kv_chunk_bytes


    @_lmcache_nvtx_annotate
    def get(
            self,
            key: str,
        ) -> Optional[bytes]:
        """
        Retrieve the KV cache chunk by the given key.

        Input:
            key: the key of the token chunk, including prefix hash and format
        Output:
            the serialized kv cache of the token chunk
            None if the key is not found
        """
        return self.dict.get(key, None)
92+
93+
94+
# TODO(Jiayi): need to optimize disk loading
# current impl. with "naive open read/write" might not be efficient (better than torch.load)
class LMSLocalDiskBackend(LMSBackendInterface):
    """
    Cache engine for storing the KV cache of the tokens in the local disk.
    """
    def __init__(
            self,
            path: str,
        ):
        """
        Input:
            path: directory in which cache files are stored; created if missing.
        """
        super().__init__()

        self.path = path
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(self.path, exist_ok=True)
        # Keys whose chunks have been written to disk by this process.
        self.filenames = set()

    def list_keys(
            self
        ) -> List[str]:
        """
        List all keys currently stored in the backend.
        """
        return list(self.filenames)

    def contains(
            self,
            key: str,
        ) -> bool:
        """
        Check if the cache engine contains the key.

        Input:
            key: the key of the token chunk, including prefix hash and format

        Returns:
            True if the cache engine contains the key, False otherwise
        """
        return key in self.filenames

    def _key_to_path(
            self,
            key: str,
        ) -> str:
        """
        Convert key to path_name

        Input:
            key: the key of the token chunk, including prefix hash and format

        Returns:
            returns the path name
        """
        # os.path.join handles a missing trailing separator on self.path;
        # plain concatenation silently wrote files outside the directory.
        return os.path.join(self.path, key.replace("/", "-") + ".bin")


    def put(
            self,
            key: str,
            kv_chunk_bytes: bytes,
            blocking: bool = True,
        ) -> None:
        """
        Store the KV cache of the tokens into the cache engine.

        Input:
            key: the key of the token chunk, including prefix hash and format
            kv_chunk_bytes: the serialized kv cache of the token chunk
            blocking: ignored here — this backend always writes synchronously

        Returns:
            None

        Note:
            The KV cache should NOT have the "batch" dimension.
        """
        if not blocking:
            # logger.warn is deprecated; warning is the supported spelling.
            logger.warning("Non-blocking is not implemented for local backend")
        self.filenames.add(key)
        logger.info(f"Saving cache to {self._key_to_path(key)}")
        # Naive open/write is faster here than torch.save (see class TODO).
        with open(self._key_to_path(key), "wb") as binary_file:
            binary_file.write(kv_chunk_bytes)


    @_lmcache_nvtx_annotate
    def get(
            self,
            key: str,
        ) -> Optional[bytes]:
        """
        Retrieve the KV cache chunk by the given key.

        Input:
            key: the key of the token chunk, including prefix hash and format
        Output:
            the serialized kv cache of the token chunk
            None if the key is not found
        """
        if key not in self.filenames:
            return None

        with open(self._key_to_path(key), "rb") as binary_file:
            return binary_file.read()

0 commit comments

Comments
 (0)