Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add InMemory and HnswLib vector stores #1

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions langchain/vectorstores/__init__.py
Original file line number Diff line number Diff line change
@@ -7,6 +7,8 @@
from langchain.vectorstores.deeplake import DeepLake
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores.hnsw_lib import HnswLib
from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch
from langchain.vectorstores.milvus import Milvus
from langchain.vectorstores.myscale import MyScale, MyScaleSettings
from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
@@ -34,4 +36,6 @@
"MyScaleSettings",
"SupabaseVectorStore",
"AnalyticDB",
"HnswLib",
"InMemoryExactSearch",
]
141 changes: 141 additions & 0 deletions langchain/vectorstores/hnsw_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Wrapper around HnswLib store."""
from __future__ import annotations

from typing import List, Optional, Type

from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VST
from langchain.vectorstores.vector_store_from_doc_index import (
VecStoreFromDocIndex,
_check_docarray_import,
)


class HnswLib(VecStoreFromDocIndex):
    """Wrapper around HnswLib storage.

    To use it, you should have the ``docarray[hnswlib]`` package with version
    >=0.31.0 installed.
    You can install it with `pip install "langchain[hnswlib]"`.
    """

    def __init__(
        self,
        embedding: Embeddings,
        work_dir: str,
        n_dim: int,
        dist_metric: str = "cosine",
        max_elements: int = 1024,
        index: bool = True,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
    ) -> None:
        """Initialize HnswLib store.

        Args:
            embedding (Embeddings): Embedding function.
            work_dir (str): path to the location where all the data will be stored.
            n_dim (int): dimension of an embedding.
            dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
                "ip", and "l2". Defaults to "cosine".
            max_elements (int): Maximum number of vectors that can be stored.
                Defaults to 1024.
            index (bool): Whether an index should be built for this field.
                Defaults to True.
            ef_construction (int): defines a construction time/accuracy trade-off.
                Defaults to 200.
            ef (int): parameter controlling query time/accuracy trade-off.
                Defaults to 10.
            M (int): parameter that defines the maximum number of outgoing
                connections in the graph. Defaults to 16.
            allow_replace_deleted (bool): Enables replacing of deleted elements
                with new added ones. Defaults to True.
            num_threads (int): Sets the number of cpu threads to use. Defaults to 1.

        Raises:
            ImportError: If ``docarray``, its ``HnswDocumentIndex`` or
                ``google.protobuf`` cannot be imported.
            ValueError: If the installed ``docarray`` version is too old.
        """
        _check_docarray_import()
        from docarray.index import HnswDocumentIndex

        try:
            # Imported only to verify that the dependency is available.
            import google.protobuf  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "Could not import all required packages. "
                "Please install it with `pip install \"langchain[hnswlib]\"`."
            ) from exc

        # These settings become the field configuration of the `embedding`
        # column in the document schema handed to the index.
        doc_cls = self._get_doc_cls(
            {
                "dim": n_dim,
                "space": dist_metric,
                "max_elements": max_elements,
                "index": index,
                "ef_construction": ef_construction,
                "ef": ef,
                "M": M,
                "allow_replace_deleted": allow_replace_deleted,
                "num_threads": num_threads,
            }
        )
        doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir)
        super().__init__(doc_index, embedding)

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        work_dir: Optional[str] = None,
        n_dim: Optional[int] = None,
        dist_metric: str = "l2",
        max_elements: int = 1024,
        index: bool = True,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
    ) -> HnswLib:
        """Create an HnswLib store and insert data.

        Note that the default ``dist_metric`` here is "l2", whereas the
        constructor defaults to "cosine".

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            work_dir (str): path to the location where all the data will be stored.
                Required, despite the ``None`` default.
            n_dim (int): dimension of an embedding. Required, despite the
                ``None`` default.
            dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
                "ip", and "l2". Defaults to "l2".
            max_elements (int): Maximum number of vectors that can be stored.
                Defaults to 1024.
            index (bool): Whether an index should be built for this field.
                Defaults to True.
            ef_construction (int): defines a construction time/accuracy trade-off.
                Defaults to 200.
            ef (int): parameter controlling query time/accuracy trade-off.
                Defaults to 10.
            M (int): parameter that defines the maximum number of outgoing
                connections in the graph. Defaults to 16.
            allow_replace_deleted (bool): Enables replacing of deleted elements
                with new added ones. Defaults to True.
            num_threads (int): Sets the number of cpu threads to use. Defaults to 1.

        Returns:
            HnswLib Vector Store

        Raises:
            ValueError: If ``work_dir`` or ``n_dim`` is not provided.
        """
        if work_dir is None:
            raise ValueError("`work_dir` parameter has not been set.")
        if n_dim is None:
            raise ValueError("`n_dim` parameter has not been set.")

        # Forward every index setting; otherwise the caller's values would be
        # silently replaced by the constructor defaults.
        store = cls(
            embedding=embedding,
            work_dir=work_dir,
            n_dim=n_dim,
            dist_metric=dist_metric,
            max_elements=max_elements,
            index=index,
            ef_construction=ef_construction,
            ef=ef,
            M=M,
            allow_replace_deleted=allow_replace_deleted,
            num_threads=num_threads,
        )
        store.add_texts(texts=texts, metadatas=metadatas)
        return store
68 changes: 68 additions & 0 deletions langchain/vectorstores/in_memory_exact_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Wrapper around in-memory storage."""
from __future__ import annotations

from typing import List, Optional, Type

from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VST
from langchain.vectorstores.vector_store_from_doc_index import (
VecStoreFromDocIndex,
_check_docarray_import,
)


class InMemoryExactSearch(VecStoreFromDocIndex):
    """Vector store that keeps documents in memory and searches them exactly.

    Requires the ``docarray`` package with version >=0.31.0.
    Install it with `pip install "langchain[in_memory_store]"`.
    """

    def __init__(
        self,
        embedding: Embeddings,
        metric: str = "cosine_sim",
    ) -> None:
        """Build an empty in-memory exact nearest-neighbor store.

        Args:
            embedding (Embeddings): Embedding function.
            metric (str): Metric used for the exact nearest-neighbor search;
                one of "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
                Defaults to "cosine_sim".
        """
        _check_docarray_import()
        from docarray.index import InMemoryExactNNIndex

        # The metric is attached to the embedding field of the schema.
        schema = self._get_doc_cls({"space": metric})
        super().__init__(InMemoryExactNNIndex[schema](), embedding)

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        metric: str = "cosine_sim",
    ) -> InMemoryExactSearch:
        """Construct a store and populate it with the given texts.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text, if any.
                Defaults to None.
            metric (str): Metric used for the exact nearest-neighbor search;
                one of "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
                Defaults to "cosine_sim".

        Returns:
            InMemoryExactSearch Vector Store
        """
        instance = cls(metric=metric, embedding=embedding)
        instance.add_texts(metadatas=metadatas, texts=texts)
        return instance
189 changes: 189 additions & 0 deletions langchain/vectorstores/vector_store_from_doc_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
from operator import itemgetter
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type

try:
from docarray import BaseDoc
from docarray.index.abstract import BaseDocIndex
from docarray.typing import NdArray
except ImportError:
BaseDoc = None
BaseDocIndex = None
NdArray = None

from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain.vectorstores import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance


def _check_docarray_import() -> None:
    """Verify that a sufficiently recent ``docarray`` is installed.

    Raises:
        ImportError: If ``docarray`` cannot be imported.
        ValueError: If the installed ``docarray`` version is 0.30.x or older.
    """
    # Keep the try body limited to the import so only a missing package is
    # reported as an ImportError.
    try:
        import docarray
    except ImportError as exc:
        raise ImportError(
            "Could not import docarray python package. "
            "Please install it with `pip install \"langchain[in_memory_store]\"` "
            "or `pip install \"langchain[hnswlib]\"`."
        ) from exc

    da_version = docarray.__version__.split(".")
    # Only 0.x releases up to and including 0.30 are rejected.
    if int(da_version[0]) == 0 and int(da_version[1]) <= 30:
        raise ValueError(
            "To use the docarray-based vector stores, docarray version "
            f">=0.31.0 is expected, received: {docarray.__version__}. "
            "To upgrade, please run: `pip install -U docarray`."
        )


class VecStoreFromDocIndex(VectorStore):
    """Vector store backed by a DocArray document index.

    Texts are embedded with ``embedding`` and stored in ``doc_index`` as
    documents with ``text``, ``embedding`` and ``metadata`` fields.
    """

    doc_index: BaseDocIndex
    doc_cls: Type[BaseDoc]
    embedding: Embeddings

    def __init__(
        self,
        doc_index: BaseDocIndex,
        embedding: Embeddings,
    ):
        """Initialize a vector store from DocArray's DocIndex.

        Args:
            doc_index: Document index used for storage and search.
            embedding: Embedding function.
        """
        self.doc_index = doc_index
        # The document class is taken from the schema the index was built with.
        self.doc_cls = doc_index._schema
        self.embedding = embedding

    @staticmethod
    def _get_doc_cls(embeddings_params: Dict[str, Any]) -> Type[BaseDoc]:
        """Get docarray Document class describing the schema of DocIndex.

        Args:
            embeddings_params: Keyword arguments passed to the pydantic
                ``Field`` of the ``embedding`` attribute.

        Returns:
            A ``BaseDoc`` subclass with ``text``, ``embedding`` and ``metadata``.
        """
        from docarray import BaseDoc
        from pydantic import Field

        class DocArrayDoc(BaseDoc):
            text: Optional[str]
            embedding: Optional[NdArray] = Field(**embeddings_params)
            metadata: Optional[dict]

        return DocArrayDoc

    @staticmethod
    def _to_document(doc: Any) -> Document:
        """Convert a stored docarray document into a langchain Document."""
        # Fall back to an empty dict when no metadata is stored on the doc.
        return Document(page_content=doc.text, metadata=doc.metadata or {})

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.

        Raises:
            ValueError: If ``metadatas`` is given but its length differs from
                the number of texts.
        """
        # Materialize once: `texts` may be a one-shot iterator, and it is
        # needed for counting, embedding and building the documents.
        texts = list(texts)
        if not metadatas:
            metadatas = [{} for _ in texts]
        elif len(metadatas) != len(texts):
            # zip() would otherwise silently drop the unmatched entries.
            raise ValueError(
                f"Number of metadatas ({len(metadatas)}) does not match "
                f"number of texts ({len(texts)})."
            )

        ids: List[str] = []
        embeddings = self.embedding.embed_documents(texts)
        for t, m, e in zip(texts, metadatas, embeddings):
            doc = self.doc_cls(text=t, embedding=e, metadata=m)
            self.doc_index.index([doc])
            ids.append(str(doc.id))

        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query and score for each.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)
        docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k)

        return [(self._to_document(doc), score) for doc, score in zip(docs, scores)]

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        results = self.similarity_search_with_score(query, k)
        return [doc for doc, _ in results]

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar.

        Raises:
            NotImplementedError: Always; not implemented for this store.
        """
        raise NotImplementedError

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        query_doc = self.doc_cls(embedding=embedding)
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        ).documents

        return [self._to_document(doc) for doc in docs]

    def max_marginal_relevance_search(
        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)

        # Fetch a larger candidate pool, then let MMR pick k of them.
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=fetch_k
        ).documents

        mmr_selected = maximal_marginal_relevance(query_embedding, docs.embedding, k=k)
        return [self._to_document(docs[idx]) for idx in mmr_selected]
236 changes: 155 additions & 81 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -69,7 +69,9 @@ pytesseract = {version = "^0.3.10", optional=true}
html2text = {version="^2020.1.16", optional=true}
numexpr = "^2.8.4"
duckduckgo-search = {version="^2.8.6", optional=true}

docarray = {version="^0.31.0.dev35", optional=true}
protobuf = {version="3.19", optional=true}
hnswlib = {version="^0.7.0", optional=true}

[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@@ -145,8 +147,10 @@ llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifes
qdrant = ["qdrant-client"]
openai = ["openai"]
cohere = ["cohere"]
in_memory_store = ["docarray"]
hnswlib = ["docarray", "protobuf", "hnswlib"]
embeddings = ["sentence-transformers"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf", "hnswlib"]

[tool.ruff]
select = [
149 changes: 149 additions & 0 deletions tests/integration_tests/vectorstores/test_hnsw_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import numpy as np
import pytest

from langchain.schema import Document
from langchain.vectorstores.hnsw_lib import HnswLib
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


def test_hnswlib_vec_store_from_texts(tmp_path) -> None:
    """Building a store via ``from_texts`` indexes every given text."""
    store = HnswLib.from_texts(
        ["foo", "bar", "baz"],
        FakeEmbeddings(),
        dist_metric="cosine",
        n_dim=10,
        work_dir=str(tmp_path),
    )
    assert isinstance(store, HnswLib)
    assert store.doc_index.num_docs() == 3


def test_hnswlib_vec_store_add_texts(tmp_path) -> None:
    """A freshly constructed store is empty and grows through ``add_texts``."""
    store = HnswLib(
        embedding=FakeEmbeddings(),
        dist_metric="cosine",
        n_dim=10,
        work_dir=str(tmp_path),
    )
    assert isinstance(store, HnswLib)
    assert store.doc_index.num_docs() == 0

    store.add_texts(texts=["foo", "bar", "baz"])
    assert store.doc_index.num_docs() == 3


@pytest.mark.parametrize("metric", ["cosine", "l2"])
def test_sim_search(metric, tmp_path) -> None:
    """Similarity search for an indexed text returns that text first."""
    store = HnswLib.from_texts(
        ["foo", "bar", "baz"],
        FakeEmbeddings(),
        dist_metric=metric,
        n_dim=10,
        work_dir=str(tmp_path),
    )
    found = store.similarity_search("foo", k=1)
    assert found == [Document(page_content="foo")]


@pytest.mark.parametrize('metric', ['cosine', 'l2'])
def test_sim_search_all_configurations(metric, tmp_path) -> None:
    """Test similarity search on a store built with non-default HNSW settings."""
    texts = ["foo", "bar", "baz"]
    # `index` is deliberately left at its default (True): this test searches
    # the embedding field, so an index must be built for that field.
    hnswlib_vec_store = HnswLib.from_texts(
        texts,
        FakeEmbeddings(),
        work_dir=str(tmp_path),
        dist_metric=metric,
        n_dim=10,
        max_elements=8,
        ef_construction=300,
        ef=20,
        M=8,
        allow_replace_deleted=False,
        num_threads=2,
    )
    output = hnswlib_vec_store.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


@pytest.mark.parametrize("metric", ["cosine", "l2"])
def test_sim_search_by_vector(metric, tmp_path) -> None:
    """Searching with a raw query vector returns the closest stored text."""
    store = HnswLib.from_texts(
        ["foo", "bar", "baz"],
        FakeEmbeddings(),
        dist_metric=metric,
        n_dim=10,
        work_dir=str(tmp_path),
    )
    query_vector = [1.0] * 10
    found = store.similarity_search_by_vector(query_vector, k=1)

    assert found == [Document(page_content="bar")]


@pytest.mark.parametrize("metric", ["cosine", "l2"])
def test_sim_search_with_score(metric, tmp_path) -> None:
    """The best match for an indexed text is that text, with a score of ~0."""
    store = HnswLib.from_texts(
        ["foo", "bar", "baz"],
        FakeEmbeddings(),
        dist_metric=metric,
        n_dim=10,
        work_dir=str(tmp_path),
    )
    hits = store.similarity_search_with_score("foo", k=1)
    assert len(hits) == 1

    best_doc, best_score = hits[0]
    assert best_doc == Document(page_content="foo")
    assert np.isclose(best_score, 0.0, atol=1.e-6)


def test_sim_search_with_score_for_ip_metric(tmp_path) -> None:
    """
    With the ip (inner-product) metric, all three stored texts are returned
    for the query and each carries the same score.
    """
    store = HnswLib.from_texts(
        ["foo", "bar", "baz"],
        FakeEmbeddings(),
        dist_metric="ip",
        n_dim=10,
        work_dir=str(tmp_path),
    )
    hits = store.similarity_search_with_score("foo", k=3)
    assert len(hits) == 3

    for hit in hits:
        score = hit[1]
        assert score == -8.0


@pytest.mark.parametrize("metric", ["cosine", "l2"])
def test_max_marginal_relevance_search(metric, tmp_path) -> None:
    """MMR (maximal marginal relevance) search returns texts with metadata."""
    texts = ["foo", "bar", "baz"]
    page_metadata = [{"page": page} for page, _ in enumerate(texts)]
    store = HnswLib.from_texts(
        texts,
        FakeEmbeddings(),
        metadatas=page_metadata,
        work_dir=str(tmp_path),
        n_dim=10,
        dist_metric=metric,
    )
    selected = store.max_marginal_relevance_search("foo", k=2, fetch_k=3)
    expected = [
        Document(page_content="foo", metadata={"page": 0}),
        Document(page_content="bar", metadata={"page": 1}),
    ]
    assert selected == expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np
import pytest

from langchain.schema import Document
from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


def test_in_memory_vec_store_from_texts() -> None:
    """Building a store via ``from_texts`` indexes every given text."""
    store = InMemoryExactSearch.from_texts(["foo", "bar", "baz"], FakeEmbeddings())
    assert isinstance(store, InMemoryExactSearch)
    assert store.doc_index.num_docs() == 3


def test_in_memory_vec_store_add_texts(tmp_path) -> None:
    """A freshly constructed store is empty and grows through ``add_texts``."""
    store = InMemoryExactSearch(embedding=FakeEmbeddings())
    assert isinstance(store, InMemoryExactSearch)
    assert store.doc_index.num_docs() == 0

    store.add_texts(texts=["foo", "bar", "baz"])
    assert store.doc_index.num_docs() == 3


@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
def test_sim_search(metric) -> None:
    """Similarity search for an indexed text returns that text first."""
    store = InMemoryExactSearch.from_texts(
        metric=metric,
        embedding=FakeEmbeddings(),
        texts=["foo", "bar", "baz"],
    )

    found = store.similarity_search("foo", k=1)
    assert found == [Document(page_content="foo")]


@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
def test_sim_search_with_score(metric) -> None:
    """The best match for an indexed text is that text, with the ideal score."""
    store = InMemoryExactSearch.from_texts(
        metric=metric,
        embedding=FakeEmbeddings(),
        texts=["foo", "bar", "baz"],
    )

    hits = store.similarity_search_with_score("foo", k=1)

    best_doc, best_score = hits[0]
    assert best_doc == Document(page_content="foo")

    # Distance metrics bottom out at 0 for an exact match; similarity peaks at 1.
    if "dist" in metric:
        ideal_score = 0.0
    else:
        ideal_score = 1.0
    assert np.isclose(best_score, ideal_score, atol=1.e-6)


@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
def test_sim_search_by_vector(metric) -> None:
    """Searching with a raw query vector returns the closest stored text."""
    store = InMemoryExactSearch.from_texts(
        metric=metric,
        embedding=FakeEmbeddings(),
        texts=["foo", "bar", "baz"],
    )

    query_vector = [1.0] * 10
    found = store.similarity_search_by_vector(query_vector, k=1)

    assert found == [Document(page_content="bar")]


@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
def test_max_marginal_relevance_search(metric) -> None:
    """MMR (maximal marginal relevance) search returns texts with metadata."""
    texts = ["foo", "bar", "baz"]
    page_metadata = [{"page": page} for page, _ in enumerate(texts)]
    store = InMemoryExactSearch.from_texts(
        texts,
        FakeEmbeddings(),
        metric=metric,
        metadatas=page_metadata,
    )
    selected = store.max_marginal_relevance_search("foo", k=2, fetch_k=3)
    expected = [
        Document(page_content="foo", metadata={"page": 0}),
        Document(page_content="bar", metadata={"page": 1}),
    ]
    assert selected == expected