diff --git a/include/parameters.h b/include/parameters.h index 209b9128c..4fec9ae08 100644 --- a/include/parameters.h +++ b/include/parameters.h @@ -83,7 +83,7 @@ class IndexWriteParametersBuilder IndexWriteParametersBuilder &with_num_threads(const uint32_t num_threads) { - _num_threads = num_threads == 0 ? omp_get_num_threads() : num_threads; + _num_threads = num_threads == 0 ? omp_get_num_procs() : num_threads; return *this; } diff --git a/python/src/_builder.py b/python/src/_builder.py index 18e9e9fa0..db2b200db 100644 --- a/python/src/_builder.py +++ b/python/src/_builder.py @@ -70,6 +70,15 @@ def build_disk_index( in the format DiskANN's PQ Flash Index builder requires. This temp folder is deleted upon index creation completion or error. + ## Distance Metric and Vector Datatype Restrictions + | Metric \ Datatype | np.float32 | np.uint8 | np.int8 | + |-------------------|------------|----------|---------| + | L2 | ✅ | ✅ | ✅ | + | MIPS | ✅ | ❌ | ❌ | + | Cosine [^bug-in-disk-cosine] | ❌ | ❌ | ❌ | + + [^bug-in-disk-cosine]: For StaticDiskIndex, Cosine distances are not currently supported. + ### Parameters - **data**: Either a `str` representing a path to a DiskANN vector bin file, or a numpy.ndarray, of a supported dtype, in 2 dimensions. Note that `vector_dtype` must be provided if data is a `str` @@ -119,6 +128,12 @@ def build_disk_index( vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( data, vector_dtype, index_directory, index_prefix ) + _assert(dap_metric != _native_dap.COSINE, "Cosine is currently not supported in StaticDiskIndex") + if dap_metric == _native_dap.INNER_PRODUCT: + _assert( + vector_dtype_actual == np.float32, + "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips" + ) num_points, dimensions = vectors_metadata_from_file(vector_bin_path) @@ -176,6 +191,14 @@ def build_memory_index( `diskannpy.DynamicMemoryIndex`, you **must** supply a valid value for the `tags` parameter. **Do not supply tags if the index is intended to be `diskannpy.StaticMemoryIndex`**! + ## Distance Metric and Vector Datatype Restrictions + + | Metric \ Datatype | np.float32 | np.uint8 | np.int8 | + |-------------------|------------|----------|---------| + | L2 | ✅ | ✅ | ✅ | + | MIPS | ✅ | ❌ | ❌ | + | Cosine | ✅ | ✅ | ✅ | + ### Parameters - **data**: Either a `str` representing a path to an existing DiskANN vector bin file, or a numpy.ndarray of a @@ -232,6 +255,11 @@ def build_memory_index( vector_bin_path, vector_dtype_actual = _valid_path_and_dtype( data, vector_dtype, index_directory, index_prefix ) + if dap_metric == _native_dap.INNER_PRODUCT: + _assert( + vector_dtype_actual == np.float32, + "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips" + ) num_points, dimensions = vectors_metadata_from_file(vector_bin_path) diff --git a/python/src/dynamic_memory_index.cpp b/python/src/dynamic_memory_index.cpp index f92f4157e..3add2aa5c 100644 --- a/python/src/dynamic_memory_index.cpp +++ b/python/src/dynamic_memory_index.cpp @@ -34,8 +34,7 @@ diskann::Index dynamic_index_builder(const diskann:: const uint32_t initial_search_threads, const bool concurrent_consolidation) { - const uint32_t _initial_search_threads = - initial_search_threads != 0 ? initial_search_threads : omp_get_num_threads(); + const uint32_t _initial_search_threads = initial_search_threads != 0 ? initial_search_threads : omp_get_num_procs(); auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, _initial_search_threads); return diskann::Index( diff --git a/python/src/static_disk_index.cpp b/python/src/static_disk_index.cpp index 654f8ec30..9e86b0ad5 100644 --- a/python/src/static_disk_index.cpp +++ b/python/src/static_disk_index.cpp @@ -14,7 +14,8 @@ StaticDiskIndex
::StaticDiskIndex(const diskann::Metric metric, const std::st const uint32_t cache_mechanism) : _reader(std::make_shared()), _index(_reader, metric) { - int load_success = _index.load(num_threads, index_path_prefix.c_str()); + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); + int load_success = _index.load(_num_threads, index_path_prefix.c_str()); if (load_success != 0) { throw std::runtime_error("index load failed."); @@ -22,7 +23,7 @@ StaticDiskIndex
::StaticDiskIndex(const diskann::Metric metric, const std::st if (cache_mechanism == 1) { std::string sample_file = index_path_prefix + std::string("_sample_data.bin"); - cache_sample_paths(num_nodes_to_cache, sample_file, num_threads); + cache_sample_paths(num_nodes_to_cache, sample_file, _num_threads); } else if (cache_mechanism == 2) { diff --git a/python/src/static_memory_index.cpp b/python/src/static_memory_index.cpp index 0dbb24dc3..23a349fac 100644 --- a/python/src/static_memory_index.cpp +++ b/python/src/static_memory_index.cpp @@ -17,7 +17,7 @@ diskann::Index static_index_builder(const diskann::Me { throw std::runtime_error("initial_search_complexity must be a positive uint32_t"); } - auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_threads()); + auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_procs()); return diskann::Index
(m, dimensions, num_points, nullptr, // index write params std::make_shared(index_search_params), // index search params @@ -36,7 +36,7 @@ StaticMemoryIndex
::StaticMemoryIndex(const diskann::Metric m, const std::str const uint32_t initial_search_complexity) : _index(static_index_builder
(m, num_points, dimensions, initial_search_complexity)) { - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_threads(); + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); _index.load(index_prefix.c_str(), _num_threads, initial_search_complexity); } @@ -56,7 +56,7 @@ NeighborsAndDistances StaticMemoryIndex
::batch_search( py::array_t &queries, const uint64_t num_queries, const uint64_t knn, const uint64_t complexity, const uint32_t num_threads) { - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_threads(); + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); py::array_t ids({num_queries, knn}); py::array_t dists({num_queries, knn}); std::vector
empty_vector; diff --git a/python/tests/test_dynamic_memory_index.py b/python/tests/test_dynamic_memory_index.py index 48c05443c..13d9b08db 100644 --- a/python/tests/test_dynamic_memory_index.py +++ b/python/tests/test_dynamic_memory_index.py @@ -40,6 +40,7 @@ def setUpClass(cls) -> None: build_random_vectors_and_memory_index(np.float32, "cosine", with_tags=True), build_random_vectors_and_memory_index(np.uint8, "cosine", with_tags=True), build_random_vectors_and_memory_index(np.int8, "cosine", with_tags=True), + build_random_vectors_and_memory_index(np.float32, "mips", with_tags=True), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -442,4 +443,27 @@ def _tiny_index(): warnings.simplefilter("error") # turns warnings into raised exceptions index.batch_insert(rng.random((2, 10), dtype=np.float32), np.array([15, 25], dtype=np.uint32)) + def test_zero_threads(self): + for ( + metric, + dtype, + query_vectors, + index_vectors, + ann_dir, + vector_bin_file, + generated_tags, + ) in self._test_matrix: + with self.subTest(msg=f"Testing dtype {dtype}"): + index = dap.DynamicMemoryIndex( + distance_metric="l2", + vector_dtype=dtype, + dimensions=10, + max_vectors=11_000, + complexity=64, + graph_degree=32, + num_threads=0, # explicitly asking it to use all available threads. + ) + index.batch_insert(vectors=index_vectors, vector_ids=generated_tags, num_threads=0) + k = 5 + ids, dists = index.batch_search(query_vectors, k_neighbors=k, complexity=5, num_threads=0) diff --git a/python/tests/test_static_disk_index.py b/python/tests/test_static_disk_index.py index c36c581d2..35015276e 100644 --- a/python/tests/test_static_disk_index.py +++ b/python/tests/test_static_disk_index.py @@ -25,7 +25,7 @@ def _build_random_vectors_and_index(dtype, metric): complexity=32, search_memory_maximum=0.00003, build_memory_maximum=1, - num_threads=1, + num_threads=0, pq_disk_bytes=0, ) return metric, dtype, query_vectors, index_vectors, ann_dir @@ -38,6 +38,7 @@ def setUpClass(cls) -> None: _build_random_vectors_and_index(np.float32, "l2"), _build_random_vectors_and_index(np.uint8, "l2"), _build_random_vectors_and_index(np.int8, "l2"), + _build_random_vectors_and_index(np.float32, "mips"), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -149,3 +150,19 @@ def test_value_ranges_batch_search(self): index.batch_search( queries=np.array([[]], dtype=np.single), **kwargs ) + + def test_zero_threads(self): + for metric, dtype, query_vectors, index_vectors, ann_dir in self._test_matrix: + with self.subTest(msg=f"Testing dtype {dtype}"): + index = dap.StaticDiskIndex( + distance_metric="l2", + vector_dtype=dtype, + index_directory=ann_dir, + num_threads=0, # Issue #432 + num_nodes_to_cache=10, + ) + + k = 5 + ids, dists = index.batch_search( + query_vectors, k_neighbors=k, complexity=5, beam_width=2, num_threads=0 + ) \ No newline at end of file diff --git a/python/tests/test_static_memory_index.py b/python/tests/test_static_memory_index.py index ce12ed3bf..a04f98928 100644 --- a/python/tests/test_static_memory_index.py +++ b/python/tests/test_static_memory_index.py @@ -20,6 +20,7 @@ def setUpClass(cls) -> None: build_random_vectors_and_memory_index(np.float32, "cosine"), build_random_vectors_and_memory_index(np.uint8, "cosine"), build_random_vectors_and_memory_index(np.int8, "cosine"), + build_random_vectors_and_memory_index(np.float32, "mips"), ] cls._example_ann_dir = cls._test_matrix[0][4] @@ -165,3 +166,23 @@ def test_value_ranges_batch_search(self): index.batch_search( queries=np.array([[]], dtype=np.single), **kwargs ) + + def test_zero_threads(self): + for ( + metric, + dtype, + query_vectors, + index_vectors, + ann_dir, + vector_bin_file, + _, + ) in self._test_matrix: + with self.subTest(msg=f"Testing dtype {dtype}"): + index = dap.StaticMemoryIndex( + index_directory=ann_dir, + num_threads=0, + initial_search_complexity=32, + ) + + k = 5 + ids, dists = index.batch_search(query_vectors, k_neighbors=k, complexity=5, num_threads=0) \ No newline at end of file diff --git a/src/index.cpp b/src/index.cpp index 0b10cc9a0..478b86273 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2370,7 +2370,7 @@ consolidation_report Index::consolidate_deletes(const IndexWrit const uint32_t range = params.max_degree; const uint32_t maxc = params.max_occlusion_size; const float alpha = params.alpha; - const uint32_t num_threads = params.num_threads == 0 ? omp_get_num_threads() : params.num_threads; + const uint32_t num_threads = params.num_threads == 0 ? omp_get_num_procs() : params.num_threads; uint32_t num_calls_to_process_delete = 0; diskann::Timer timer;