From 35f8cf75461ddb64414af0372ba03009c9770aa7 Mon Sep 17 00:00:00 2001 From: Dax Pryce Date: Thu, 9 Nov 2023 15:05:59 -0800 Subject: [PATCH] Fixing index_prefix_path bug in python for StaticMemoryIndex (#491) * Fixing the same bug I had in static disk index inside of static memory index as well. * Unit tests and a better understanding of why the unit tests were successful despite this bug --- pyproject.toml | 2 +- python/src/_static_disk_index.py | 6 ++--- python/src/_static_memory_index.py | 10 ++++---- python/tests/test_static_disk_index.py | 32 +++++++++++++++++++++++- python/tests/test_static_memory_index.py | 28 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 39c79b81e..7a250e0d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta" [project] name = "diskannpy" -version = "0.7.0rc1" +version = "0.7.0rc2" description = "DiskANN Python extension module" readme = "python/README.md" diff --git a/python/src/_static_disk_index.py b/python/src/_static_disk_index.py index 27300e8b3..bd532577e 100644 --- a/python/src/_static_disk_index.py +++ b/python/src/_static_disk_index.py @@ -79,9 +79,9 @@ def __init__( does not exist, you are required to provide it. - **index_prefix**: The prefix of the index files. Defaults to "ann". """ - index_prefix = _valid_index_prefix(index_directory, index_prefix) + index_prefix_path = _valid_index_prefix(index_directory, index_prefix) vector_dtype, metric, _, _ = _ensure_index_metadata( - index_prefix, + index_prefix_path, vector_dtype, distance_metric, 1, # it doesn't matter because we don't need it in this context anyway @@ -101,7 +101,7 @@ def __init__( _index = _native_dap.StaticDiskFloatIndex self._index = _index( distance_metric=dap_metric, - index_path_prefix=index_prefix, + index_path_prefix=index_prefix_path, num_threads=num_threads, num_nodes_to_cache=num_nodes_to_cache, cache_mechanism=cache_mechanism, diff --git a/python/src/_static_memory_index.py b/python/src/_static_memory_index.py index f9bd7e8cc..e481403cf 100644 --- a/python/src/_static_memory_index.py +++ b/python/src/_static_memory_index.py @@ -77,22 +77,22 @@ def __init__( does not exist, you are required to provide it. - **enable_filters**: Indexes built with filters can also be used for filtered search. """ - index_prefix = _valid_index_prefix(index_directory, index_prefix) + index_prefix_path = _valid_index_prefix(index_directory, index_prefix) self._labels_map = {} self._labels_metadata = {} if enable_filters: try: - with open(index_prefix + "_labels_map.txt", "r") as labels_map_if: + with open(f"{index_prefix_path}_labels_map.txt", "r") as labels_map_if: for line in labels_map_if: (key, val) = line.split("\t") self._labels_map[key] = int(val) - with open(f"{index_prefix}_label_metadata.json", "r") as labels_metadata_if: + with open(f"{index_prefix_path}_label_metadata.json", "r") as labels_metadata_if: self._labels_metadata = json.load(labels_metadata_if) except: # noqa: E722 # exceptions are basically presumed to be either file not found or file not formatted correctly raise RuntimeException("Filter labels file was unable to be processed.") vector_dtype, metric, num_points, dims = _ensure_index_metadata( - index_prefix, + index_prefix_path, vector_dtype, distance_metric, 1, # it doesn't matter because we don't need it in this context anyway @@ -119,7 +119,7 @@ def __init__( distance_metric=dap_metric, num_points=num_points, dimensions=dims, - index_path=os.path.join(index_directory, index_prefix), + index_path=index_prefix_path, num_threads=num_threads, initial_search_complexity=initial_search_complexity, ) diff --git a/python/tests/test_static_disk_index.py b/python/tests/test_static_disk_index.py index 35015276e..0397c321d 100644 --- a/python/tests/test_static_disk_index.py +++ b/python/tests/test_static_disk_index.py @@ -3,6 +3,7 @@ import shutil import unittest +from pathlib import Path from tempfile import mkdtemp import diskannpy as dap @@ -165,4 +166,33 @@ def test_zero_threads(self): k = 5 ids, dists = index.batch_search( query_vectors, k_neighbors=k, complexity=5, beam_width=2, num_threads=0 - ) \ No newline at end of file + ) + + def test_relative_paths(self): + # Issue 483 and 491 both fixed errors that were somehow slipping past our unit tests + # os.path.join() acts as a semi-merge if you give it two paths that look absolute. + # since our unit tests are using absolute paths via tempfile.mkdtemp(), the double os.path.join() was never + # caught by our tests, but was very easy to trip when using relative paths + rel_dir = "tmp" + Path(rel_dir).mkdir(exist_ok=True) + try: + tiny_index_vecs = random_vectors(20, 10, dtype=np.float32, seed=12345) + dap.build_disk_index( + data=tiny_index_vecs, + distance_metric="l2", + index_directory=rel_dir, + graph_degree=16, + complexity=32, + search_memory_maximum=0.00003, + build_memory_maximum=1, + num_threads=0, + pq_disk_bytes=0, + ) + index = dap.StaticDiskIndex( + index_directory=rel_dir, + num_threads=16, + num_nodes_to_cache=10, + ) + + finally: + shutil.rmtree(rel_dir, ignore_errors=True) diff --git a/python/tests/test_static_memory_index.py b/python/tests/test_static_memory_index.py index 3078f15a5..fe571be5d 100644 --- a/python/tests/test_static_memory_index.py +++ b/python/tests/test_static_memory_index.py @@ -5,6 +5,7 @@ import shutil import unittest +from pathlib import Path from tempfile import mkdtemp import diskannpy as dap @@ -191,6 +192,33 @@ def test_zero_threads(self): k = 5 ids, dists = index.batch_search(query_vectors, k_neighbors=k, complexity=5, num_threads=0) + def test_relative_paths(self): + # Issue 483 and 491 both fixed errors that were somehow slipping past our unit tests + # os.path.join() acts as a semi-merge if you give it two paths that look absolute. + # since our unit tests are using absolute paths via tempfile.mkdtemp(), the double os.path.join() was never + # caught by our tests, but was very easy to trip when using relative paths + rel_dir = "tmp" + Path(rel_dir).mkdir(exist_ok=True) + try: + tiny_index_vecs = random_vectors(20, 10, dtype=np.float32, seed=12345) + dap.build_memory_index( + data=tiny_index_vecs, + distance_metric="l2", + index_directory=rel_dir, + graph_degree=16, + complexity=32, + num_threads=0, + ) + index = dap.StaticMemoryIndex( + index_directory=rel_dir, + num_threads=0, + initial_search_complexity=32, + ) + + finally: + shutil.rmtree(rel_dir, ignore_errors=True) + + class TestFilteredStaticMemoryIndex(unittest.TestCase): def test_simple_scenario(self):