docarray · anna-charlotte · Apr 24, 2023 · Apr 27, 2023 · Apr 27, 2023 · Apr 27, 2023
diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py
@@ -7,6 +7,8 @@
 from langchain.vectorstores.deeplake import DeepLake
 from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
 from langchain.vectorstores.faiss import FAISS
+from langchain.vectorstores.hnsw_lib import HnswLib
+from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch
 from langchain.vectorstores.milvus import Milvus
 from langchain.vectorstores.myscale import MyScale, MyScaleSettings
 from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
@@ -34,4 +36,6 @@
     "MyScaleSettings",
     "SupabaseVectorStore",
     "AnalyticDB",
+    "HnswLib",
+    "InMemoryExactSearch",
 ]
diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py
@@ -0,0 +1,141 @@
+"""Wrapper around HnswLib store."""
+from __future__ import annotations
+
+from typing import List, Optional, Type
+
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores.base import VST
+from langchain.vectorstores.vector_store_from_doc_index import (
+    VecStoreFromDocIndex,
+    _check_docarray_import,
+)
+
+
+class HnswLib(VecStoreFromDocIndex):
+    """Wrapper around HnswLib storage.
+
+    To use it, you should have the ``docarray[hnswlib]`` package with version
+    >=0.31.0 installed. You can install it with `pip install "langchain[hnswlib]"`.
+    """
+
+    def __init__(
+        self,
+        embedding: Embeddings,
+        work_dir: str,
+        n_dim: int,
+        dist_metric: str = "cosine",
+        max_elements: int = 1024,
+        index: bool = True,
+        ef_construction: int = 200,
+        ef: int = 10,
+        M: int = 16,
+        allow_replace_deleted: bool = True,
+        num_threads: int = 1,
+    ) -> None:
+        """Initialize HnswLib store.
+
+        Args:
+            embedding (Embeddings): Embedding function.
+            work_dir (str): path to the location where all the data will be stored.
+            n_dim (int): dimension of an embedding.
+            dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
+                "ip", and "l2". Defaults to "cosine".
+            max_elements (int): Maximum number of vectors that can be stored.
+                Defaults to 1024.
+            index (bool): Whether an index should be built for this field.
+                Defaults to True.
+            ef_construction (int): defines a construction time/accuracy trade-off.
+                Defaults to 200.
+            ef (int): parameter controlling query time/accuracy trade-off.
+                Defaults to 10.
+            M (int): parameter that defines the maximum number of outgoing
+                connections in the graph. Defaults to 16.
+            allow_replace_deleted (bool): Enables replacing of deleted elements
+                with new added ones. Defaults to True.
+            num_threads (int): Sets the number of cpu threads to use. Defaults to 1.
+        """
+        _check_docarray_import()
+        from docarray.index import HnswDocumentIndex
+
+        try:
+            import google.protobuf  # noqa: F401  (availability check only)
+        except ImportError as exc:
+            raise ImportError(
+                "Could not import all required packages. "
+                "Please install it with `pip install \"langchain[hnswlib]\"`."
+            ) from exc
+
+        doc_cls = self._get_doc_cls(
+            {
+                "dim": n_dim,
+                "space": dist_metric,
+                "max_elements": max_elements,
+                "index": index,
+                "ef_construction": ef_construction,
+                "ef": ef,
+                "M": M,
+                "allow_replace_deleted": allow_replace_deleted,
+                "num_threads": num_threads,
+            }
+        )
+        doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir)
+        super().__init__(doc_index, embedding)
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        work_dir: Optional[str] = None,
+        n_dim: Optional[int] = None,
+        dist_metric: str = "l2",
+        max_elements: int = 1024,
+        index: bool = True,
+        ef_construction: int = 200,
+        ef: int = 10,
+        M: int = 16,
+        allow_replace_deleted: bool = True,
+        num_threads: int = 1,
+    ) -> HnswLib:
+        """Create an HnswLib store and insert data.
+
+        Args:
+            texts (List[str]): Text data.
+            embedding (Embeddings): Embedding function.
+            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
+                Defaults to None.
+            work_dir (str): path to the location where all the data will be stored.
+            n_dim (int): dimension of an embedding.
+            dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
+                "ip", and "l2". Defaults to "l2" (``__init__`` uses "cosine").
+            max_elements, index, ef_construction, ef, M, allow_replace_deleted,
+                num_threads: HNSW index settings, forwarded unchanged to the
+                constructor; same meaning and defaults as in ``__init__``.
+
+        Returns:
+            HnswLib Vector Store
+
+        Raises:
+            ValueError: If ``work_dir`` or ``n_dim`` is left as None.
+        """
+        if work_dir is None:
+            raise ValueError("`work_dir` parameter has not been set.")
+        if n_dim is None:
+            raise ValueError("`n_dim` parameter has not been set.")
+        # Forward every index setting so none silently falls back to its default.
+        store = cls(
+            embedding=embedding,
+            work_dir=work_dir,
+            n_dim=n_dim,
+            dist_metric=dist_metric,
+            max_elements=max_elements,
+            index=index,
+            ef_construction=ef_construction,
+            ef=ef,
+            M=M,
+            allow_replace_deleted=allow_replace_deleted,
+            num_threads=num_threads,
+        )
+        store.add_texts(texts=texts, metadatas=metadatas)
+        return store
diff --git a/langchain/vectorstores/in_memory_exact_search.py b/langchain/vectorstores/in_memory_exact_search.py
@@ -0,0 +1,68 @@
+"""Wrapper around in-memory storage."""
+from __future__ import annotations
+
+from typing import List, Optional, Type
+
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores.base import VST
+from langchain.vectorstores.vector_store_from_doc_index import (
+    VecStoreFromDocIndex,
+    _check_docarray_import,
+)
+
+
+class InMemoryExactSearch(VecStoreFromDocIndex):
+    """In-memory vector store performing exact nearest-neighbor search.
+
+    Requires the ``docarray`` package, version >=0.31.0, which can be installed
+    with `pip install "langchain[in_memory_store]"`.
+    """
+
+    def __init__(
+        self,
+        embedding: Embeddings,
+        metric: str = "cosine_sim",
+    ) -> None:
+        """Set up an empty InMemoryExactSearch store.
+
+        Args:
+            embedding (Embeddings): Function used to embed texts and queries.
+            metric (str): Metric used for the exact nearest-neighbor search,
+                one of "cosine_sim", "euclidean_dist" or "sqeuclidean_dist".
+                Defaults to "cosine_sim".
+        """
+        _check_docarray_import()
+        from docarray.index import InMemoryExactNNIndex
+
+        schema = self._get_doc_cls({"space": metric})
+        index_cls = InMemoryExactNNIndex[schema]
+        super().__init__(index_cls(), embedding)
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        metric: str = "cosine_sim",
+    ) -> InMemoryExactSearch:
+        """Build an InMemoryExactSearch store and fill it with the given texts.
+
+        Args:
+            texts (List[str]): Texts to embed and store.
+            embedding (Embeddings): Function used to embed texts and queries.
+            metadatas (Optional[List[dict]]): Per-text metadata, if any.
+                Defaults to None.
+            metric (str): Metric used for the exact nearest-neighbor search,
+                one of "cosine_sim", "euclidean_dist" or "sqeuclidean_dist".
+                Defaults to "cosine_sim".
+
+        Returns:
+            The populated InMemoryExactSearch vector store.
+        """
+        vec_store = cls(
+            metric=metric,
+            embedding=embedding,
+        )
+        vec_store.add_texts(metadatas=metadatas, texts=texts)
+        return vec_store
diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/vector_store_from_doc_index.py
@@ -0,0 +1,189 @@
+from operator import itemgetter
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
+
+try:
+    from docarray import BaseDoc
+    from docarray.index.abstract import BaseDocIndex
+    from docarray.typing import NdArray
+except ImportError:
+    BaseDoc = None
+    BaseDocIndex = None
+    NdArray = None
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema import Document
+from langchain.vectorstores import VectorStore
+from langchain.vectorstores.utils import maximal_marginal_relevance
+
+
+def _check_docarray_import() -> None:
+    """Raise ImportError if docarray is missing, ValueError if older than 0.31.0."""
+    try:
+        import docarray
+    except ImportError as exc:
+        raise ImportError(
+            "Could not import docarray python package. "
+            'Please install it with `pip install "langchain[in_memory_store]"`.'
+        ) from exc
+    da_version = docarray.__version__.split(".")
+    if int(da_version[0]) == 0 and int(da_version[1]) <= 30:
+        raise ValueError(
+            "To use this vector store, docarray version >=0.31.0 is expected, "
+            f"received: {docarray.__version__}. "
+            "To upgrade, please run: `pip install -U docarray`."
+        )
+
+
+class VecStoreFromDocIndex(VectorStore):
+    """Base vector store backed by a DocArray document index."""
+
+    doc_index: BaseDocIndex
+    doc_cls: Type[BaseDoc]
+    embedding: Embeddings
+
+    def __init__(self, doc_index: BaseDocIndex, embedding: Embeddings) -> None:
+        """Initialize a vector store from DocArray's DocIndex."""
+        self.doc_index = doc_index
+        # Reuse the index's own document schema when creating new documents.
+        self.doc_cls = doc_index._schema
+        self.embedding = embedding
+
+    @staticmethod
+    def _get_doc_cls(embeddings_params: Dict[str, Any]) -> Type[BaseDoc]:
+        """Get docarray Document class describing the schema of DocIndex."""
+        from docarray import BaseDoc
+        from docarray.typing import NdArray
+        from pydantic import Field
+
+        class DocArrayDoc(BaseDoc):
+            text: Optional[str]
+            embedding: Optional[NdArray] = Field(**embeddings_params)
+            metadata: Optional[dict]
+
+        return DocArrayDoc
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        # Materialize once: `texts` may be a one-shot iterator, and it is needed
+        # for counting, embedding and building the documents.
+        texts = list(texts)
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        ids: List[str] = []
+        embeddings = self.embedding.embed_documents(texts)
+        for t, m, e in zip(texts, metadatas, embeddings):
+            doc = self.doc_cls(text=t, embedding=e, metadata=m)
+            self.doc_index.index([doc])
+            ids.append(str(doc.id))
+        return ids
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query and score for each.
+        """
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k)
+
+        return [
+            (Document(page_content=doc.text, metadata=doc.metadata), score)
+            for doc, score in zip(docs, scores)
+        ]
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        results = self.similarity_search_with_score(query, k=k, **kwargs)
+        return list(map(itemgetter(0), results))
+
+    def _similarity_search_with_relevance_scores(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        raise NotImplementedError
+
+    def similarity_search_by_vector(
+        self, embedding: List[float], k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+        query_doc = self.doc_cls(embedding=embedding)
+        docs = self.doc_index.find(
+            query_doc, search_field="embedding", limit=k
+        ).documents
+
+        return [
+            Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
+        ]
+
+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+
+        docs = self.doc_index.find(
+            query_doc, search_field="embedding", limit=fetch_k
+        ).documents
+
+        # Pick k of the fetch_k nearest candidates; `mmr_selected` holds their indices.
+        mmr_selected = maximal_marginal_relevance(query_embedding, docs.embedding, k=k)
+        results = [
+            Document(page_content=docs[idx].text, metadata=docs[idx].metadata)
+            for idx in mmr_selected
+        ]
+        return results
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -69,7 +69,9 @@ pytesseract = {version = "^0.3.10", optional=true}
 html2text = {version="^2020.1.16", optional=true}
 numexpr = "^2.8.4"
 duckduckgo-search = {version="^2.8.6", optional=true}
-
+docarray = {version="^0.31.0.dev35", optional=true}
+protobuf = {version="3.19", optional=true}
+hnswlib = {version="^0.7.0", optional=true}
 
 [tool.poetry.group.docs.dependencies]
 autodoc_pydantic = "^1.8.0"
@@ -145,8 +147,10 @@ llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifes
 qdrant = ["qdrant-client"]
 openai = ["openai"]
 cohere = ["cohere"]
+in_memory_store = ["docarray"]
+hnswlib = ["docarray", "protobuf", "hnswlib"]
 embeddings = ["sentence-transformers"]
-all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf", "hnswlib"]
 
 [tool.ruff]
 select = [

diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py
@@ -0,0 +1,149 @@
+import numpy as np
+import pytest
+
+from langchain.schema import Document
+from langchain.vectorstores.hnsw_lib import HnswLib
+from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+
+
+def test_hnswlib_vec_store_from_texts(tmp_path) -> None:
+    """Building a store via ``from_texts`` indexes every given text."""
+    sample_texts = ["foo", "bar", "baz"]
+    store = HnswLib.from_texts(
+        sample_texts,
+        FakeEmbeddings(),
+        n_dim=10,
+        dist_metric="cosine",
+        work_dir=str(tmp_path),
+    )
+    assert isinstance(store, HnswLib)
+    assert store.doc_index.num_docs() == len(sample_texts)
+
+
+def test_hnswlib_vec_store_add_texts(tmp_path) -> None:
+    """An empty store grows by one document per text passed to ``add_texts``."""
+    store = HnswLib(
+        embedding=FakeEmbeddings(),
+        work_dir=str(tmp_path),
+        n_dim=10,
+        dist_metric="cosine",
+    )
+    assert isinstance(store, HnswLib)
+    assert store.doc_index.num_docs() == 0
+
+    new_texts = ["foo", "bar", "baz"]
+    store.add_texts(texts=new_texts)
+    assert store.doc_index.num_docs() == len(new_texts)
+
+
+@pytest.mark.parametrize("metric", ["cosine", "l2"])
+def test_sim_search(metric, tmp_path) -> None:
+    """The nearest neighbour of the query "foo" is the stored text "foo"."""
+    store = HnswLib.from_texts(
+        ["foo", "bar", "baz"],
+        FakeEmbeddings(),
+        n_dim=10,
+        dist_metric=metric,
+        work_dir=str(tmp_path),
+    )
+    hits = store.similarity_search("foo", k=1)
+
+    assert hits == [Document(page_content="foo")]
+
+
+@pytest.mark.parametrize("metric", ["cosine", "l2"])
+def test_sim_search_all_configurations(metric, tmp_path) -> None:
+    """Similarity search works when every optional index setting is given."""
+    store = HnswLib.from_texts(
+        ["foo", "bar", "baz"],
+        FakeEmbeddings(),
+        n_dim=10,
+        dist_metric=metric,
+        work_dir=str(tmp_path),
+        max_elements=8,
+        index=False,
+        ef_construction=300,
+        ef=20,
+        M=8,
+        allow_replace_deleted=False,
+        num_threads=2,
+    )
+    hits = store.similarity_search("foo", k=1)
+
+    assert hits == [Document(page_content="foo")]
+
+
+@pytest.mark.parametrize("metric", ["cosine", "l2"])
+def test_sim_search_by_vector(metric, tmp_path) -> None:
+    """Searching with a raw query vector returns the closest stored text."""
+    store = HnswLib.from_texts(
+        ["foo", "bar", "baz"],
+        FakeEmbeddings(),
+        n_dim=10,
+        dist_metric=metric,
+        work_dir=str(tmp_path),
+    )
+    query_vector = [1.0] * 10
+
+    hits = store.similarity_search_by_vector(query_vector, k=1)
+
+    assert hits == [Document(page_content="bar")]
+
+
+@pytest.mark.parametrize("metric", ["cosine", "l2"])
+def test_sim_search_with_score(metric, tmp_path) -> None:
+    """The best hit for "foo" is "foo" itself, with a score of zero."""
+    store = HnswLib.from_texts(
+        ["foo", "bar", "baz"],
+        FakeEmbeddings(),
+        n_dim=10,
+        dist_metric=metric,
+        work_dir=str(tmp_path),
+    )
+    hits = store.similarity_search_with_score("foo", k=1)
+    assert len(hits) == 1
+
+    best_doc, best_score = hits[0]
+    assert best_doc == Document(page_content="foo")
+    # The expected score is 0.0 under both parametrized metrics.
+    assert np.isclose(best_score, 0.0, atol=1e-6)
+
+
+def test_sim_search_with_score_for_ip_metric(tmp_path) -> None:
+    """
+    Test end to end construction and similarity search with score for ip
+    (inner-product) metric.
+    """
+    texts = ["foo", "bar", "baz"]
+    hnswlib_vec_store = HnswLib.from_texts(
+        texts,
+        FakeEmbeddings(),
+        work_dir=str(tmp_path),
+        n_dim=10,
+        dist_metric="ip",
+    )
+    output = hnswlib_vec_store.similarity_search_with_score("foo", k=3)
+    assert len(output) == 3
+    # All three texts are expected at the same score; compare with a tolerance.
+    for _, score in output:
+        assert np.isclose(score, -8.0, atol=1e-6)
+
+
+@pytest.mark.parametrize("metric", ["cosine", "l2"])
+def test_max_marginal_relevance_search(metric, tmp_path) -> None:
+    """MMR search returns the selected texts together with their metadata."""
+    sample_texts = ["foo", "bar", "baz"]
+    page_metadatas = [{"page": page} for page, _ in enumerate(sample_texts)]
+    store = HnswLib.from_texts(
+        sample_texts,
+        FakeEmbeddings(),
+        metadatas=page_metadatas,
+        n_dim=10,
+        dist_metric=metric,
+        work_dir=str(tmp_path),
+    )
+    hits = store.max_marginal_relevance_search("foo", k=2, fetch_k=3)
+    assert hits == [
+        Document(page_content="foo", metadata={"page": 0}),
+        Document(page_content="bar", metadata={"page": 1}),
+    ]
diff --git a/tests/integration_tests/vectorstores/test_in_memory_exact_search.py b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py
@@ -0,0 +1,97 @@
+import numpy as np
+import pytest
+
+from langchain.schema import Document
+from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch
+from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+
+
+def test_in_memory_vec_store_from_texts() -> None:
+    """Building a store via ``from_texts`` indexes every given text."""
+    sample_texts = ["foo", "bar", "baz"]
+    store = InMemoryExactSearch.from_texts(
+        sample_texts,
+        FakeEmbeddings(),
+    )
+    assert isinstance(store, InMemoryExactSearch)
+    assert store.doc_index.num_docs() == len(sample_texts)
+
+
+def test_in_memory_vec_store_add_texts() -> None:
+    """Test that ``add_texts`` adds every given text to an initially empty store."""
+    docsearch = InMemoryExactSearch(
+        embedding=FakeEmbeddings(),
+    )
+    assert isinstance(docsearch, InMemoryExactSearch)
+    assert docsearch.doc_index.num_docs() == 0
+
+    texts = ["foo", "bar", "baz"]
+    docsearch.add_texts(texts=texts)
+    assert docsearch.doc_index.num_docs() == 3
+
+
+@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
+def test_sim_search(metric) -> None:
+    """The nearest neighbour of the query "foo" is the stored text "foo"."""
+    store = InMemoryExactSearch.from_texts(
+        embedding=FakeEmbeddings(),
+        texts=["foo", "bar", "baz"],
+        metric=metric,
+    )
+
+    hits = store.similarity_search("foo", k=1)
+
+    assert hits == [Document(page_content="foo")]
+
+
+@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
+def test_sim_search_with_score(metric) -> None:
+    """The best hit for "foo" is "foo" itself, with the metric's best score."""
+    store = InMemoryExactSearch.from_texts(
+        embedding=FakeEmbeddings(),
+        texts=["foo", "bar", "baz"],
+        metric=metric,
+    )
+
+    hits = store.similarity_search_with_score("foo", k=1)
+    best_doc, best_score = hits[0]
+    assert best_doc == Document(page_content="foo")
+    # Metrics with "dist" in their name expect 0.0 here; the others expect 1.0.
+    if "dist" in metric:
+        assert np.isclose(best_score, 0.0, atol=1e-6)
+    else:
+        assert np.isclose(best_score, 1.0, atol=1e-6)
+
+
+@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
+def test_sim_search_by_vector(metric) -> None:
+    """Searching with a raw query vector returns the closest stored text."""
+    store = InMemoryExactSearch.from_texts(
+        embedding=FakeEmbeddings(),
+        texts=["foo", "bar", "baz"],
+        metric=metric,
+    )
+
+    query_vector = [1.0] * 10
+
+    hits = store.similarity_search_by_vector(query_vector, k=1)
+
+    assert hits == [Document(page_content="bar")]
+
+
+@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"])
+def test_max_marginal_relevance_search(metric) -> None:
+    """MMR search returns the selected texts together with their metadata."""
+    sample_texts = ["foo", "bar", "baz"]
+    page_metadatas = [{"page": page} for page, _ in enumerate(sample_texts)]
+    store = InMemoryExactSearch.from_texts(
+        sample_texts,
+        FakeEmbeddings(),
+        metric=metric,
+        metadatas=page_metadatas,
+    )
+    hits = store.max_marginal_relevance_search("foo", k=2, fetch_k=3)
+    assert hits == [
+        Document(page_content="foo", metadata={"page": 0}),
+        Document(page_content="bar", metadata={"page": 1}),
+    ]