Skip to content

Commit dcad9f7

Browse files
aayush3011awharrison-28markwallace-microsoft
authored
Python: Adding Azure CosmosDB Mongo vCore as a datastore. (#2990)
### Motivation and Context I have added Azure CosmosDB MongoDB vCore as a data store. MongoDB vCore now supports vector search on embeddings, and it could be used to seamlessly integrate your AI-based applications with your data stored in the Azure CosmosDB. More details about Mongo vCore can be found here: https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search. Issue #2375 <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, by providing the following information: --> 1. Why is this change required? **MongoDB vCore now supports vector search on embeddings, and it could be used to seamlessly integrate your AI-based applications with your data stored in the Azure CosmosDB.** 2. What problem does it solve? This adds a new memory store MongoDB vCore(Azure CosmosDB) ### Description <!-- Describe your changes, the overall approach, and the underlying design. These notes will help you understand how your code works. Thanks! --> ### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [ ] The code builds clean without any errors or warnings - [ ] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [ ] All unit tests pass, and I have added new tests where possible - [ ] I didn't break anyone 😄 --------- Co-authored-by: Abby Harrison <[email protected]> Co-authored-by: Abby Harrison <[email protected]> Co-authored-by: Mark Wallace <[email protected]>
1 parent 6f7cd6a commit dcad9f7

File tree

10 files changed

+822
-4
lines changed

10 files changed

+822
-4
lines changed

python/.env.example

+4
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@ WEAVIATE_API_KEY=""
1414
GOOGLE_PALM_API_KEY=""
1515
GOOGLE_SEARCH_ENGINE_ID=""
1616
REDIS_CONNECTION_STRING=""
17+
AZCOSMOS_API = "" # should be mongo-vcore for now; CosmosDB currently supports vector search only in mongo-vcore.
18+
AZCOSMOS_CONNSTR = ""
19+
AZCOSMOS_DATABASE_NAME = ""
20+
AZCOSMOS_CONTAINER_NAME = ""

python/semantic_kernel/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
)
1616
from semantic_kernel.utils.null_logger import NullLogger
1717
from semantic_kernel.utils.settings import (
18+
azure_cosmos_db_settings_from_dot_env,
1819
azure_openai_settings_from_dot_env,
1920
bing_search_settings_from_dot_env,
2021
google_palm_settings_from_dot_env,
@@ -35,6 +36,7 @@
3536
"bing_search_settings_from_dot_env",
3637
"mongodb_atlas_settings_from_dot_env",
3738
"google_palm_settings_from_dot_env",
39+
"azure_cosmos_db_settings_from_dot_env",
3840
"redis_settings_from_dot_env",
3941
"PromptTemplateConfig",
4042
"PromptTemplate",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from semantic_kernel.connectors.memory.azure_cosmosdb.azure_cosmos_db_memory_store import (
4+
AzureCosmosDBMemoryStore,
5+
)
6+
7+
__all__ = ["AzureCosmosDBMemoryStore"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from typing import List, Tuple
4+
5+
from numpy import ndarray
6+
7+
from semantic_kernel.connectors.memory.azure_cosmosdb.azure_cosmos_db_store_api import (
8+
AzureCosmosDBStoreApi,
9+
)
10+
from semantic_kernel.connectors.memory.azure_cosmosdb.cosmosdb_utils import (
11+
get_mongodb_resources,
12+
)
13+
from semantic_kernel.connectors.memory.azure_cosmosdb.mongo_vcore_store_api import (
14+
MongoStoreApi,
15+
)
16+
from semantic_kernel.memory.memory_record import MemoryRecord
17+
from semantic_kernel.memory.memory_store_base import MemoryStoreBase
18+
from semantic_kernel.utils.settings import azure_cosmos_db_settings_from_dot_env
19+
20+
# Load environment variables
21+
# Resolved once at import time; AzureCosmosDBMemoryStore.create() reads both
# values to pick the backing API and to open the connection.
cosmos_api, cosmos_connstr = azure_cosmos_db_settings_from_dot_env()
22+
23+
24+
class AzureCosmosDBMemoryStore(MemoryStoreBase):
    """A memory store that uses AzureCosmosDB for MongoDB vCore, to perform vector similarity search on a fully
    managed MongoDB compatible database service.
    https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search

    Every data operation is forwarded to the ``AzureCosmosDBStoreApi``
    implementation held in ``cosmosStore``; this class validates the
    construction arguments and delegates.
    """

    # Right now this only supports Mongo, but set up to support more later.
    apiStore: AzureCosmosDBStoreApi = None
    cosmosStore: AzureCosmosDBStoreApi = None
    mongodb_client = None
    database = None
    index_name = None
    vector_dimensions = None
    num_lists = None
    similarity = None
    collection_name = None

    def __init__(
        self,
        cosmosStore: AzureCosmosDBStoreApi,
        database_name: str,
        index_name: str,
        vector_dimensions: int,
        num_lists: int,
        similarity: str,
    ):
        """Wraps an API-specific store.

        Arguments:
            cosmosStore {AzureCosmosDBStoreApi} -- The API-specific store every call is delegated to.
            database_name {str} -- The name of the database; only validated here.
            index_name {str} -- The name of the vector index.
            vector_dimensions {int} -- The number of dimensions of the stored embeddings.
            num_lists {int} -- The ``num_lists`` setting of the vector index.
            similarity {str} -- The similarity metric of the vector index.

        Raises:
            ValueError -- If vector_dimensions is not positive, or if
                database_name or index_name is None.
        """
        if vector_dimensions <= 0:
            raise ValueError("Vector dimensions must be a positive number.")
        if database_name is None:
            raise ValueError("Database Name cannot be empty.")
        if index_name is None:
            raise ValueError("Index Name cannot be empty.")

        self.cosmosStore = cosmosStore
        self.index_name = index_name
        # Validated above; keep it so the declared attribute is not left as None.
        self.vector_dimensions = vector_dimensions
        self.num_lists = num_lists
        self.similarity = similarity

    @staticmethod
    async def create(
        database_name: str,
        collection_name: str,
        index_name: str,
        vector_dimensions: int,
        num_lists: int,
        similarity: str,
    ) -> MemoryStoreBase:
        """Creates the underlying data store based on the API definition.

        The API kind and the connection string are the module-level values
        loaded from the environment settings.

        Arguments:
            database_name {str} -- The name of the database to use.
            collection_name {str} -- The name of the collection to create.
            index_name {str} -- The name of the vector index.
            vector_dimensions {int} -- The number of dimensions of the stored embeddings.
            num_lists {int} -- The ``num_lists`` setting of the vector index.
            similarity {str} -- The similarity metric of the vector index.

        Returns:
            MemoryStoreBase -- The store, with ``collection_name`` already created.

        Raises:
            NotImplementedError -- If the configured API is not "mongo-vcore".
        """
        # Right now this only supports Mongo, but set up to support more later.
        if cosmos_api != "mongo-vcore":
            raise NotImplementedError(
                f"Azure Cosmos DB API '{cosmos_api}' is not supported; "
                "only 'mongo-vcore' is available."
            )

        mongodb_client, database = get_mongodb_resources(cosmos_connstr, database_name)
        api_store: AzureCosmosDBStoreApi = MongoStoreApi(
            collection_name,
            index_name,
            vector_dimensions,
            num_lists,
            similarity,
            database,
        )

        store = AzureCosmosDBMemoryStore(
            api_store,
            database_name,
            index_name,
            vector_dimensions,
            num_lists,
            similarity,
        )
        # Expose the resources through the attributes the class declares for them.
        store.mongodb_client = mongodb_client
        store.database = database
        store.collection_name = collection_name

        await store.create_collection_async(collection_name)
        return store

    async def create_collection_async(self, collection_name: str) -> None:
        """Creates a new collection in the data store.

        Arguments:
            collection_name {str} -- The name associated with a collection of embeddings.

        Returns:
            None
        """
        return await self.cosmosStore.create_collection(collection_name)

    async def get_collections_async(self) -> List[str]:
        """Gets the list of collections.

        Returns:
            List[str] -- The list of collections.
        """
        # AzureCosmosDBStoreApi declares get_collections(); there is no
        # get_collections_async() on the store API.
        return await self.cosmosStore.get_collections()

    async def delete_collection_async(self, collection_name: str) -> None:
        """Deletes a collection.

        Arguments:
            collection_name {str} -- The name of the collection to delete.

        Returns:
            None
        """
        return await self.cosmosStore.delete_collection(collection_name)

    async def does_collection_exist_async(self, collection_name: str) -> bool:
        """Checks if a collection exists.

        Arguments:
            collection_name {str} -- The name of the collection to check.

        Returns:
            bool -- True if the collection exists; otherwise, False.
        """
        return await self.cosmosStore.does_collection_exist(collection_name)

    async def upsert_async(self, collection_name: str, record: MemoryRecord) -> str:
        """Upsert a record.

        Arguments:
            collection_name {str} -- The name of the collection to upsert the record into.
            record {MemoryRecord} -- The record to upsert.

        Returns:
            str -- The unique record id of the record.
        """
        return await self.cosmosStore.upsert(collection_name, record)

    async def upsert_batch_async(
        self, collection_name: str, records: List[MemoryRecord]
    ) -> List[str]:
        """Upsert a batch of records.

        Arguments:
            collection_name {str} -- The name of the collection to upsert the records into.
            records {List[MemoryRecord]} -- The records to upsert.

        Returns:
            List[str] -- The unique database keys of the records.
        """
        return await self.cosmosStore.upsert_batch(collection_name, records)

    async def get_async(
        self, collection_name: str, key: str, with_embedding: bool
    ) -> MemoryRecord:
        """Gets a record.

        Arguments:
            collection_name {str} -- The name of the collection to get the record from.
            key {str} -- The unique database key of the record.
            with_embedding {bool} -- Whether to include the embedding in the result.

        Returns:
            MemoryRecord -- The record.
        """
        return await self.cosmosStore.get(collection_name, key, with_embedding)

    async def get_batch_async(
        self, collection_name: str, keys: List[str], with_embeddings: bool
    ) -> List[MemoryRecord]:
        """Gets a batch of records.

        Arguments:
            collection_name {str} -- The name of the collection to get the records from.
            keys {List[str]} -- The unique database keys of the records.
            with_embeddings {bool} -- Whether to include the embeddings in the results.

        Returns:
            List[MemoryRecord] -- The records.
        """
        return await self.cosmosStore.get_batch(collection_name, keys, with_embeddings)

    async def remove_async(self, collection_name: str, key: str) -> None:
        """Removes a record.

        Arguments:
            collection_name {str} -- The name of the collection to remove the record from.
            key {str} -- The unique database key of the record to remove.

        Returns:
            None
        """
        return await self.cosmosStore.remove(collection_name, key)

    async def remove_batch_async(self, collection_name: str, keys: List[str]) -> None:
        """Removes a batch of records.

        Arguments:
            collection_name {str} -- The name of the collection to remove the records from.
            keys {List[str]} -- The unique database keys of the records to remove.

        Returns:
            None
        """
        return await self.cosmosStore.remove_batch(collection_name, keys)

    async def get_nearest_matches_async(
        self,
        collection_name: str,
        embedding: ndarray,
        limit: int,
        min_relevance_score: float,
        with_embeddings: bool,
    ) -> List[Tuple[MemoryRecord, float]]:
        """Gets the nearest matches to an embedding using vector configuration.

        Arguments:
            collection_name {str} -- The name of the collection to get the nearest matches from.
            embedding {ndarray} -- The embedding to find the nearest matches to.
            limit {int} -- The maximum number of matches to return.
            min_relevance_score {float} -- The minimum relevance score of the matches.
            with_embeddings {bool} -- Whether to include the embeddings in the results.

        Returns:
            List[Tuple[MemoryRecord, float]] -- The records and their relevance scores.
        """
        return await self.cosmosStore.get_nearest_matches(
            collection_name, embedding, limit, min_relevance_score, with_embeddings
        )

    async def get_nearest_match_async(
        self,
        collection_name: str,
        embedding: ndarray,
        min_relevance_score: float,
        with_embedding: bool,
    ) -> Tuple[MemoryRecord, float]:
        """Gets the nearest match to an embedding using vector configuration parameters.

        Arguments:
            collection_name {str} -- The name of the collection to get the nearest match from.
            embedding {ndarray} -- The embedding to find the nearest match to.
            min_relevance_score {float} -- The minimum relevance score of the match.
            with_embedding {bool} -- Whether to include the embedding in the result.

        Returns:
            Tuple[MemoryRecord, float] -- The record and the relevance score.
        """
        return await self.cosmosStore.get_nearest_match(
            collection_name, embedding, min_relevance_score, with_embedding
        )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
4+
from abc import ABC, abstractmethod
5+
from typing import List, Tuple
6+
7+
from numpy import ndarray
8+
9+
from semantic_kernel.memory.memory_record import MemoryRecord
10+
11+
12+
# Abstract class similar to the original data store that allows API level abstraction
13+
class AzureCosmosDBStoreApi(ABC):
    """Interface each Azure Cosmos DB API-specific store must implement.

    Every method is abstract; the base implementations only raise
    ``NotImplementedError``.
    """

    @abstractmethod
    async def create_collection(self, collection_name: str) -> None:
        """Create the collection called ``collection_name``."""
        raise NotImplementedError

    @abstractmethod
    async def get_collections(self) -> List[str]:
        """Return the names of the collections."""
        raise NotImplementedError

    @abstractmethod
    async def delete_collection(self, collection_name: str) -> None:
        """Delete the collection called ``collection_name``."""
        raise NotImplementedError

    @abstractmethod
    async def does_collection_exist(self, collection_name: str) -> bool:
        """Return whether the collection called ``collection_name`` exists."""
        raise NotImplementedError

    @abstractmethod
    async def upsert(
        self,
        collection_name: str,
        record: MemoryRecord,
    ) -> str:
        """Upsert ``record`` and return its key."""
        raise NotImplementedError

    @abstractmethod
    async def upsert_batch(
        self,
        collection_name: str,
        records: List[MemoryRecord],
    ) -> List[str]:
        """Upsert ``records`` and return their keys."""
        raise NotImplementedError

    @abstractmethod
    async def get(
        self,
        collection_name: str,
        key: str,
        with_embedding: bool,
    ) -> MemoryRecord:
        """Return the record stored under ``key``."""
        raise NotImplementedError

    @abstractmethod
    async def get_batch(
        self,
        collection_name: str,
        keys: List[str],
        with_embeddings: bool,
    ) -> List[MemoryRecord]:
        """Return the records stored under ``keys``."""
        raise NotImplementedError

    @abstractmethod
    async def remove(self, collection_name: str, key: str) -> None:
        """Remove the record stored under ``key``."""
        raise NotImplementedError

    @abstractmethod
    async def remove_batch(self, collection_name: str, keys: List[str]) -> None:
        """Remove the records stored under ``keys``."""
        raise NotImplementedError

    @abstractmethod
    async def get_nearest_matches(
        self,
        collection_name: str,
        embedding: ndarray,
        limit: int,
        min_relevance_score: float,
        with_embeddings: bool,
    ) -> List[Tuple[MemoryRecord, float]]:
        """Return up to ``limit`` (record, relevance score) pairs nearest to ``embedding``."""
        raise NotImplementedError

    @abstractmethod
    async def get_nearest_match(
        self,
        collection_name: str,
        embedding: ndarray,
        min_relevance_score: float,
        with_embedding: bool,
    ) -> Tuple[MemoryRecord, float]:
        """Return the single (record, relevance score) pair nearest to ``embedding``."""
        raise NotImplementedError
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from pymongo import MongoClient
4+
5+
6+
def get_mongodb_resources(connection_string: str, database_name: str):
    """Build a MongoClient and look up a database on it.

    Arguments:
        connection_string {str} -- Passed to ``MongoClient``.
        database_name {str} -- The key used to index the client.

    Returns:
        The ``(client, database)`` pair.

    Raises:
        Exception -- Wraps any error raised while creating the client or
            looking up the database; the original error is chained as the cause.
    """
    try:
        mongo_client = MongoClient(connection_string)
        mongo_database = mongo_client[database_name]
    except Exception as error:
        message = f"Error while connecting to Azure Cosmos MongoDb vCore: {error}"
        raise Exception(message) from error
    else:
        return mongo_client, mongo_database

0 commit comments

Comments
 (0)