From 720a809e7dcf21ec1e264ad44954d52e9917d235 Mon Sep 17 00:00:00 2001 From: Dax Pryce Date: Fri, 7 Jul 2023 10:50:25 -0700 Subject: [PATCH] Python Refactor (#385) * Refactor of diskannpy module code. * 0.5.0.rc1 for python and enabling the build-python portion of the pr-test process. * clang-format changes * In theory this should speed up the python build drastically by only building the wheel for the python version and OS we're attempting to fan out to in our CICD job tree * Missed a dollar sign * Copy/pasting left a CICD step name that implied we were running a code formatting check when instead we were building a wheel. This is now fixed. * In theory, readying the release action too. We won't know if it works until it merges and we cut a release, but at least the paths have been fixed * Designated initializers just happened to work on linux but shouldn't have as they weren't added until cpp20 * Formatting --- .github/actions/python-wheel/action.yml | 22 ++ .github/workflows/build-python.yml | 42 +- .github/workflows/pr-test.yml | 3 + .github/workflows/python-release.yml | 36 +- pyproject.toml | 4 +- python/CMakeLists.txt | 10 +- python/include/builder.h | 26 ++ python/include/common.h | 24 ++ python/include/dynamic_memory_index.h | 51 +++ python/include/static_disk_index.h | 52 +++ python/include/static_memory_index.h | 34 ++ python/src/_builder.py | 6 +- python/src/_static_disk_index.py | 6 +- python/src/builder.cpp | 82 ++++ python/src/diskann_bindings.cpp | 497 ------------------------ python/src/dynamic_memory_index.cpp | 166 ++++++++ python/src/module.cpp | 133 +++++++ python/src/static_disk_index.cpp | 108 +++++ python/src/static_memory_index.cpp | 77 ++++ 19 files changed, 831 insertions(+), 548 deletions(-) create mode 100644 .github/actions/python-wheel/action.yml create mode 100644 python/include/builder.h create mode 100644 python/include/common.h create mode 100644 python/include/dynamic_memory_index.h create mode 100644 
python/include/static_disk_index.h create mode 100644 python/include/static_memory_index.h create mode 100644 python/src/builder.cpp create mode 100644 python/src/dynamic_memory_index.cpp create mode 100644 python/src/module.cpp create mode 100644 python/src/static_disk_index.cpp create mode 100644 python/src/static_memory_index.cpp diff --git a/.github/actions/python-wheel/action.yml b/.github/actions/python-wheel/action.yml new file mode 100644 index 000000000..277bd4704 --- /dev/null +++ b/.github/actions/python-wheel/action.yml @@ -0,0 +1,22 @@ +name: Build Python Wheel +description: Builds a python wheel with cibuildwheel +inputs: + cibw-identifier: + description: "CI build wheel identifier to build" + required: true +runs: + using: "composite" + steps: + - uses: actions/setup-python@v3 + - name: Install cibuildwheel + run: python -m pip install cibuildwheel==2.11.3 + shell: bash + - name: Building Python ${{inputs.cibw-identifier}} Wheel + run: python -m cibuildwheel --output-dir dist + env: + CIBW_BUILD: ${{inputs.cibw-identifier}} + shell: bash + - uses: actions/upload-artifact@v3 + with: + name: wheels + path: ./dist/*.whl diff --git a/.github/workflows/build-python.yml b/.github/workflows/build-python.yml index 9d94f3cd6..078ce20e2 100644 --- a/.github/workflows/build-python.yml +++ b/.github/workflows/build-python.yml @@ -1,14 +1,32 @@ name: DiskANN Build Python Wheel on: [workflow_call] jobs: - build: - name: Build for ${{matrix.python-version}} on ${{matrix.os}} + linux-build: + name: Python - Ubuntu - ${{matrix.cibw-identifier}} strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] - runs-on: ${{matrix.os}} + cibw-identifier: ["cp38-manylinux_x86_64", "cp39-manylinux_x86_64", "cp310-manylinux_x86_64", "cp311-manylinux_x86_64"] + runs-on: ubuntu-latest + defaults: + run: + shell: bash + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 1 + - name: Building 
python wheel ${{matrix.cibw-identifier}} + uses: ./.github/actions/python-wheel + with: + cibw-identifier: ${{matrix.cibw-identifier}} + windows-build: + name: Python - Windows - ${{matrix.cibw-identifier}} + strategy: + fail-fast: false + matrix: + cibw-identifier: ["cp38-win_amd64", "cp39-win_amd64", "cp310-win_amd64", "cp311-win_amd64"] + runs-on: windows-latest defaults: run: shell: bash @@ -17,14 +35,8 @@ jobs: uses: actions/checkout@v2 with: submodules: true - - uses: actions/setup-python@v3 - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.11.3 - - name: Building Wheel for Python ${{inputs.python-version}} - run: python -m cibuildwheel --output-dir dist - env: - CIBW_BUILD: ${{ inputs.python-version }} - - uses: actions/upload-artifact@v3 + fetch-depth: 1 + - name: Building python wheel ${{matrix.cibw-identifier}} + uses: ./.github/actions/python-wheel with: - name: wheels - path: ./dist/*.whl + cibw-identifier: ${{matrix.cibw-identifier}} diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 1d6b98c0c..38eefb3ff 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -24,3 +24,6 @@ jobs: dynamic: name: Dynamic uses: ./.github/workflows/dynamic.yml + python: + name: Python + uses: ./.github/workflows/build-python.yml diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index 62f889c2f..7c68ecfae 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -3,38 +3,20 @@ on: release: types: [published] jobs: - build_wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - uses: actions/setup-python@v3 - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.11.3 - - name: build wheels - run: python -m cibuildwheel --output-dir wheelhouse - env: - 
CIBW_ARCHS_LINUX: x86_64 - - uses: actions/upload-artifact@v3 - with: - name: wheelhouse - path: ./wheelhouse/*.whl + python-release-wheels: + name: Python + uses: ./.github/workflows/build-python.yml release: runs-on: ubuntu-latest - needs: build_wheels + needs: python-release-wheels steps: - uses: actions/download-artifact@v3 with: - name: wheelhouse - path: wheelhouse/ + name: wheels + path: dist/ - name: Generate SHA256 files for each wheel run: | - sha256sum wheelhouse/*.whl > checksums.txt + sha256sum dist/*.whl > checksums.txt cat checksums.txt - uses: actions/setup-python@v3 - name: Install twine @@ -44,11 +26,11 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - twine upload wheelhouse/*.whl + twine upload dist/*.whl - name: Update release with SHA256 and Artifacts uses: softprops/action-gh-release@v1 with: token: ${{ secrets.GITHUB_TOKEN }} files: | - wheelhouse/*.whl + dist/*.whl checksums.txt diff --git a/pyproject.toml b/pyproject.toml index ec7246c06..e684d24d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,11 +11,11 @@ build-backend = "setuptools.build_meta" [project] name = "diskannpy" -version = "0.5.0" +version = "0.5.0.rc1" description = "DiskANN Python extension module" # readme = "../README.md" -requires-python = ">=3.7" +requires-python = ">=3.8" license = {text = "MIT License"} dependencies = [ "numpy" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 89f5f8d40..d4faebf9b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,7 +26,15 @@ execute_process(COMMAND ${Python3_EXECUTABLE} -c "import numpy; print(numpy.get_ # pybind11_add_module(diskannpy MODULE src/diskann_bindings.cpp) # the following is fairly synonymous with pybind11_add_module, but we need more target_link_libraries # see https://pybind11.readthedocs.io/en/latest/compiling.html#advanced-interface-library-targets for more details -add_library(_diskannpy MODULE src/diskann_bindings.cpp) 
+add_library(_diskannpy MODULE + src/module.cpp + src/builder.cpp + src/dynamic_memory_index.cpp + src/static_memory_index.cpp + src/static_disk_index.cpp +) + +target_include_directories(_diskannpy AFTER PRIVATE include) if (MSVC) target_compile_options(_diskannpy PRIVATE /U_WINDLL) diff --git a/python/include/builder.h b/python/include/builder.h new file mode 100644 index 000000000..fc12976e7 --- /dev/null +++ b/python/include/builder.h @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + +#include "common.h" +#include "distance.h" + +namespace diskannpy +{ +template +void build_disk_index(diskann::Metric metric, const std::string &data_file_path, const std::string &index_prefix_path, + uint32_t complexity, uint32_t graph_degree, double final_index_ram_limit, + double indexing_ram_budget, uint32_t num_threads, uint32_t pq_disk_bytes); + +template +void build_memory_index(diskann::Metric metric, const std::string &vector_bin_path, + const std::string &index_output_path, uint32_t graph_degree, uint32_t complexity, + float alpha, uint32_t num_threads, bool use_pq_build, + size_t num_pq_bytes, bool use_opq, uint32_t filter_complexity, + bool use_tags = false); + +} diff --git a/python/include/common.h b/python/include/common.h new file mode 100644 index 000000000..7c63534fa --- /dev/null +++ b/python/include/common.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#pragma once + +#include +#include + +#include +#include + +namespace py = pybind11; + +namespace diskannpy +{ + +typedef uint32_t filterT; + +typedef uint32_t StaticIdType; +typedef uint32_t DynamicIdType; + +template using NeighborsAndDistances = std::pair, py::array_t>; + +}; // namespace diskannpy diff --git a/python/include/dynamic_memory_index.h b/python/include/dynamic_memory_index.h new file mode 100644 index 000000000..9bfbbec2d --- /dev/null +++ b/python/include/dynamic_memory_index.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + +#include +#include + +#include "common.h" +#include "index.h" +#include "parameters.h" + +namespace py = pybind11; + +namespace diskannpy +{ + +template +class DynamicMemoryIndex +{ + public: + DynamicMemoryIndex(diskann::Metric m, size_t dimensions, size_t max_vectors, uint32_t complexity, + uint32_t graph_degree, bool saturate_graph, uint32_t max_occlusion_size, float alpha, + uint32_t num_threads, uint32_t filter_complexity, uint32_t num_frozen_points, + uint32_t initial_search_complexity, uint32_t initial_search_threads, + bool concurrent_consolidation); + + void load(const std::string &index_path); + int insert(const py::array_t &vector, DynamicIdType id); + py::array_t batch_insert(py::array_t &vectors, + py::array_t &ids, int32_t num_inserts, + int num_threads = 0); + int mark_deleted(DynamicIdType id); + void save(const std::string &save_path, bool compact_before_save = false); + NeighborsAndDistances search(py::array_t &query, uint64_t knn, + uint64_t complexity); + NeighborsAndDistances batch_search(py::array_t &queries, + uint64_t num_queries, uint64_t knn, uint64_t complexity, + uint32_t num_threads); + void consolidate_delete(); + + private: + const uint32_t _initial_search_complexity; + const diskann::IndexWriteParameters _write_parameters; + diskann::Index _index; +}; + +}; // namespace diskannpy \ No 
newline at end of file diff --git a/python/include/static_disk_index.h b/python/include/static_disk_index.h new file mode 100644 index 000000000..71a1b5aff --- /dev/null +++ b/python/include/static_disk_index.h @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + + +#include +#include + +#ifdef _WINDOWS +#include "windows_aligned_file_reader.h" +#else +#include "linux_aligned_file_reader.h" +#endif + +#include "common.h" +#include "pq_flash_index.h" + +namespace py = pybind11; + +namespace diskannpy { + +#ifdef _WINDOWS +typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader; +#else +typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader; +#endif + +template +class StaticDiskIndex +{ + public: + StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads, + size_t num_nodes_to_cache, uint32_t cache_mechanism); + + void cache_bfs_levels(size_t num_nodes_to_cache); + + void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads); + + NeighborsAndDistances search(py::array_t &query, uint64_t knn, + uint64_t complexity, uint64_t beam_width); + + NeighborsAndDistances batch_search(py::array_t &queries, uint64_t num_queries, + uint64_t knn, uint64_t complexity, uint64_t beam_width, uint32_t num_threads); + private: + std::shared_ptr _reader; + diskann::PQFlashIndex
_index; +}; +} diff --git a/python/include/static_memory_index.h b/python/include/static_memory_index.h new file mode 100644 index 000000000..33f3187ae --- /dev/null +++ b/python/include/static_memory_index.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma once + +#include +#include + +#include +#include + +#include "common.h" +#include "index.h" + +namespace py = pybind11; + +namespace diskannpy { + +template +class StaticMemoryIndex +{ + public: + StaticMemoryIndex(diskann::Metric m, const std::string &index_prefix, size_t num_points, + size_t dimensions, uint32_t num_threads, uint32_t initial_search_complexity); + + NeighborsAndDistances search(py::array_t &query, uint64_t knn, + uint64_t complexity); + + NeighborsAndDistances batch_search(py::array_t &queries, + uint64_t num_queries, uint64_t knn, uint64_t complexity, uint32_t num_threads); + private: + diskann::Index _index; +}; +} \ No newline at end of file diff --git a/python/src/_builder.py b/python/src/_builder.py index 7a997e4b7..8c9be32d9 100644 --- a/python/src/_builder.py +++ b/python/src/_builder.py @@ -266,11 +266,11 @@ def build_memory_index( num_points, dimensions = vector_file_metadata(vector_bin_path) if vector_dtype_actual == np.single: - _builder = _native_dap.build_in_memory_float_index + _builder = _native_dap.build_memory_float_index elif vector_dtype_actual == np.ubyte: - _builder = _native_dap.build_in_memory_uint8_index + _builder = _native_dap.build_memory_uint8_index else: - _builder = _native_dap.build_in_memory_int8_index + _builder = _native_dap.build_memory_int8_index index_prefix_path = os.path.join(index_directory, index_prefix) diff --git a/python/src/_static_disk_index.py b/python/src/_static_disk_index.py index 183fada5c..9111ffcee 100644 --- a/python/src/_static_disk_index.py +++ b/python/src/_static_disk_index.py @@ -80,11 +80,11 @@ def __init__( self._vector_dtype = vector_dtype if vector_dtype == 
np.single: - _index = _native_dap.DiskFloatIndex + _index = _native_dap.StaticDiskFloatIndex elif vector_dtype == np.ubyte: - _index = _native_dap.DiskUInt8Index + _index = _native_dap.StaticDiskUInt8Index else: - _index = _native_dap.DiskInt8Index + _index = _native_dap.StaticDiskInt8Index self._index = _index( distance_metric=dap_metric, index_path_prefix=os.path.join(index_directory, index_prefix), diff --git a/python/src/builder.cpp b/python/src/builder.cpp new file mode 100644 index 000000000..4485d66e6 --- /dev/null +++ b/python/src/builder.cpp @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include "builder.h" +#include "common.h" +#include "disk_utils.h" +#include "index.h" +#include "parameters.h" + +namespace diskannpy +{ +template +void build_disk_index(const diskann::Metric metric, const std::string &data_file_path, + const std::string &index_prefix_path, const uint32_t complexity, const uint32_t graph_degree, + const double final_index_ram_limit, const double indexing_ram_budget, const uint32_t num_threads, + const uint32_t pq_disk_bytes) +{ + std::string params = std::to_string(graph_degree) + " " + std::to_string(complexity) + " " + + std::to_string(final_index_ram_limit) + " " + std::to_string(indexing_ram_budget) + " " + + std::to_string(num_threads); + if (pq_disk_bytes > 0) + params = params + " " + std::to_string(pq_disk_bytes); + diskann::build_disk_index
(data_file_path.c_str(), index_prefix_path.c_str(), params.c_str(), metric); +} + +template void build_disk_index(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, + double, double, uint32_t, uint32_t); + +template void build_disk_index(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, + double, double, uint32_t, uint32_t); +template void build_disk_index(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, + double, double, uint32_t, uint32_t); + +template +void build_memory_index(const diskann::Metric metric, const std::string &vector_bin_path, + const std::string &index_output_path, const uint32_t graph_degree, const uint32_t complexity, + const float alpha, const uint32_t num_threads, const bool use_pq_build, + const size_t num_pq_bytes, const bool use_opq, const uint32_t filter_complexity, + const bool use_tags) +{ + diskann::IndexWriteParameters index_build_params = diskann::IndexWriteParametersBuilder(complexity, graph_degree) + .with_filter_list_size(filter_complexity) + .with_alpha(alpha) + .with_saturate_graph(false) + .with_num_threads(num_threads) + .build(); + size_t data_num, data_dim; + diskann::get_bin_metadata(vector_bin_path, data_num, data_dim); + diskann::Index index(metric, data_dim, data_num, use_tags, use_tags, false, use_pq_build, + num_pq_bytes, use_opq); + + if (use_tags) + { + const std::string tags_file = index_output_path + ".tags"; + if (!file_exists(tags_file)) + { + throw std::runtime_error("tags file not found at expected path: " + tags_file); + } + TagT *tags_data; + size_t tag_dims = 1; + diskann::load_bin(tags_file, tags_data, data_num, tag_dims); + std::vector tags(tags_data, tags_data + data_num); + index.build(vector_bin_path.c_str(), data_num, index_build_params, tags); + } + else + { + index.build(vector_bin_path.c_str(), data_num, index_build_params); + } + + index.save(index_output_path.c_str()); +} + +template void 
build_memory_index(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, + float, uint32_t, bool, size_t, bool, uint32_t, bool); + +template void build_memory_index(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, + float, uint32_t, bool, size_t, bool, uint32_t, bool); + +template void build_memory_index(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t, + float, uint32_t, bool, size_t, bool, uint32_t, bool); + +} // namespace diskannpy diff --git a/python/src/diskann_bindings.cpp b/python/src/diskann_bindings.cpp index 25a5262b1..8b1378917 100644 --- a/python/src/diskann_bindings.cpp +++ b/python/src/diskann_bindings.cpp @@ -1,498 +1 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef _WINDOWS -#include "windows_aligned_file_reader.h" -#else -#include "linux_aligned_file_reader.h" -#endif - -#include "disk_utils.h" -#include "index.h" -#include "pq_flash_index.h" -#include "utils.h" - -PYBIND11_MAKE_OPAQUE(std::vector); -PYBIND11_MAKE_OPAQUE(std::vector); -PYBIND11_MAKE_OPAQUE(std::vector); -PYBIND11_MAKE_OPAQUE(std::vector); - -namespace py = pybind11; -using namespace diskann; - -#ifdef _WINDOWS -typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader; -#else -typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader; -#endif - -template struct DiskIndex -{ - PQFlashIndex *_pq_flash_index; - std::shared_ptr reader; - - DiskIndex(const diskann::Metric metric, const std::string &index_path_prefix, const uint32_t num_threads, - const size_t num_nodes_to_cache, const uint32_t cache_mechanism) - { - reader = std::make_shared(); - _pq_flash_index = new PQFlashIndex(reader, metric); - int load_success = _pq_flash_index->load(num_threads, index_path_prefix.c_str()); - if (load_success != 0) - { - throw 
std::runtime_error("index load failed."); - } - if (cache_mechanism == 1) - { - std::string sample_file = index_path_prefix + std::string("_sample_data.bin"); - cache_sample_paths(num_nodes_to_cache, sample_file, num_threads); - } - else if (cache_mechanism == 2) - { - cache_bfs_levels(num_nodes_to_cache); - } - } - - ~DiskIndex() - { - delete _pq_flash_index; - } - - void cache_bfs_levels(const size_t num_nodes_to_cache) - { - std::vector node_list; - _pq_flash_index->cache_bfs_levels(num_nodes_to_cache, node_list); - _pq_flash_index->load_cache_list(node_list); - } - - void cache_sample_paths(const size_t num_nodes_to_cache, const std::string &warmup_query_file, - const uint32_t num_threads) - { - if (!file_exists(warmup_query_file)) - { - return; - } - - std::vector node_list; - _pq_flash_index->generate_cache_list_from_sample_queries(warmup_query_file, 15, 4, num_nodes_to_cache, - num_threads, node_list); - _pq_flash_index->load_cache_list(node_list); - } - - auto search(py::array_t &query, const uint64_t knn, - const uint64_t complexity, const uint64_t beam_width) - { - py::array_t ids(knn); - py::array_t dists(knn); - - std::vector u32_ids(knn); - std::vector u64_ids(knn); - QueryStats stats; - - _pq_flash_index->cached_beam_search(query.data(), knn, complexity, u64_ids.data(), dists.mutable_data(), - beam_width, false, &stats); - - auto r = ids.mutable_unchecked<1>(); - for (uint64_t i = 0; i < knn; ++i) - r(i) = (unsigned)u64_ids[i]; - - return std::make_pair(ids, dists); - } - - auto batch_search(py::array_t &queries, const uint64_t num_queries, - const uint64_t knn, const uint64_t complexity, const uint64_t beam_width, const int num_threads) - { - py::array_t ids({num_queries, knn}); - py::array_t dists({num_queries, knn}); - - omp_set_num_threads(num_threads); - - std::vector u64_ids(knn * num_queries); - -#pragma omp parallel for schedule(dynamic, 1) - for (int64_t i = 0; i < (int64_t)num_queries; i++) - { - 
_pq_flash_index->cached_beam_search(queries.data(i), knn, complexity, u64_ids.data() + i * knn, - dists.mutable_data(i), beam_width); - } - - auto r = ids.mutable_unchecked(); - for (uint64_t i = 0; i < num_queries; ++i) - for (uint64_t j = 0; j < knn; ++j) - r(i, j) = (uint32_t)u64_ids[i * knn + j]; - - return std::make_pair(ids, dists); - } -}; - -typedef uint32_t IdT; -typedef uint32_t filterT; - -template struct DynamicInMemIndex -{ - Index *_index; - IndexWriteParameters _write_params; - const uint32_t _initial_search_complexity; - - DynamicInMemIndex(const Metric m, const size_t dimensions, const size_t max_vectors, const uint32_t complexity, - const uint32_t graph_degree, const bool saturate_graph, const uint32_t max_occlusion_size, - const float alpha, const uint32_t num_threads, const uint32_t filter_complexity, - const uint32_t num_frozen_points, const uint32_t initial_search_complexity, - const uint32_t initial_search_threads, const bool concurrent_consolidation) - : _write_params(IndexWriteParametersBuilder(complexity, graph_degree) - .with_saturate_graph(saturate_graph) - .with_max_occlusion_size(max_occlusion_size) - .with_alpha(alpha) - .with_num_threads(num_threads) - .with_filter_list_size(filter_complexity) - .with_num_frozen_points(num_frozen_points) - .build()), - _initial_search_complexity(initial_search_complexity != 0 ? initial_search_complexity : complexity) - { - const uint32_t _initial_search_threads = - initial_search_threads != 0 ? initial_search_threads : omp_get_num_threads(); - - _index = new Index(m, dimensions, max_vectors, - true, // dynamic_index - _write_params, // used for insert - _initial_search_complexity, // used to prepare the scratch space for searching. can / may - // be expanded if the search asks for a larger L. 
- _initial_search_threads, // also used for the scratch space - true, // enable_tags - concurrent_consolidation, - false, // pq_dist_build - 0, // num_pq_chunks - false); // use_opq = false - _index->enable_delete(); - } - - ~DynamicInMemIndex() - { - delete _index; - } - - void load(const std::string &index_path) - { - const std::string tags_file = index_path + ".tags"; - if (!file_exists(tags_file)) - { - throw std::runtime_error("tags file not found at expected path: " + tags_file); - } - _index->load(index_path.c_str(), _write_params.num_threads, _initial_search_complexity); - } - - int insert(py::array_t &vector, const IdT id) - { - return _index->insert_point(vector.data(), id); - } - - auto batch_insert(py::array_t &vectors, - py::array_t &ids, const int32_t num_inserts, - const int num_threads = 0) - { - if (num_threads == 0) - omp_set_num_threads(omp_get_num_procs()); - else - omp_set_num_threads(num_threads); - py::array_t insert_retvals(num_inserts); - -#pragma omp parallel for schedule(dynamic, 1) - for (int32_t i = 0; i < num_inserts; i++) - { - insert_retvals.mutable_data()[i] = _index->insert_point(vectors.data(i), *(ids.data(i))); - } - - return insert_retvals; - } - - int mark_deleted(const IdT id) - { - return _index->lazy_delete(id); - } - - void save(const std::string &save_path = "", const bool compact_before_save = false) - { - if (save_path.empty()) - { - throw std::runtime_error("A save_path must be provided"); - } - _index->save(save_path.c_str(), compact_before_save); - } - - auto search(py::array_t &query, const uint64_t knn, - const uint64_t complexity) - { - py::array_t ids(knn); - py::array_t dists(knn); - std::vector empty_vector; - _index->search_with_tags(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data(), empty_vector); - return std::make_pair(ids, dists); - } - - auto batch_search(py::array_t &queries, const uint64_t num_queries, - const uint64_t knn, const uint64_t complexity, const int num_threads) - { - 
py::array_t ids({num_queries, knn}); - py::array_t dists({num_queries, knn}); - std::vector empty_vector; - - if (num_threads == 0) - omp_set_num_threads(omp_get_num_procs()); - else - omp_set_num_threads(num_threads); - -#pragma omp parallel for schedule(dynamic, 1) - for (int64_t i = 0; i < (int64_t)num_queries; i++) - { - _index->search_with_tags(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i), - empty_vector); - } - - return std::make_pair(ids, dists); - } - - auto consolidate_delete() - { - _index->consolidate_deletes(_write_params); - } -}; - -template struct StaticInMemIndex -{ - Index *_index; - - StaticInMemIndex(const Metric m, const std::string &index_prefix, const size_t num_points, const size_t dimensions, - const uint32_t num_threads, const uint32_t initial_search_complexity) - { - const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_threads(); - if (initial_search_complexity == 0) - { - throw std::runtime_error("initial_search_complexity must be a positive uint32_t"); - } - - _index = new Index(m, dimensions, num_points, - false, // not a dynamic_index - false, // no enable_tags/ids - false, // no concurrent_consolidate, - false, // pq_dist_build - 0, // num_pq_chunks - false, // use_opq = false - 0); // num_frozen_points - _index->load(index_prefix.c_str(), _num_threads, initial_search_complexity); - } - - ~StaticInMemIndex() - { - delete _index; - } - - auto search(py::array_t &query, const uint64_t knn, - const uint64_t complexity) - { - py::array_t ids(knn); - py::array_t dists(knn); - std::vector empty_vector; - _index->search(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data()); - return std::make_pair(ids, dists); - } - - auto batch_search(py::array_t &queries, const uint64_t num_queries, - const uint64_t knn, const uint64_t complexity, const int num_threads) - { - const uint32_t _num_threads = num_threads != 0 ? 
num_threads : omp_get_num_threads(); - py::array_t ids({num_queries, knn}); - py::array_t dists({num_queries, knn}); - std::vector empty_vector; - - omp_set_num_threads(_num_threads); - -#pragma omp parallel for schedule(dynamic, 1) - for (int64_t i = 0; i < (int64_t)num_queries; i++) - { - _index->search(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i)); - } - - return std::make_pair(ids, dists); - } -}; - -template -void build_disk_index2(const diskann::Metric metric, const std::string &data_file_path, - const std::string &index_prefix_path, const uint32_t complexity, const uint32_t graph_degree, - const double final_index_ram_limit, const double indexing_ram_budget, const uint32_t num_threads, - const uint32_t pq_disk_bytes) -{ - std::string params = std::to_string(graph_degree) + " " + std::to_string(complexity) + " " + - std::to_string(final_index_ram_limit) + " " + std::to_string(indexing_ram_budget) + " " + - std::to_string(num_threads); - if (pq_disk_bytes > 0) - params = params + " " + std::to_string(pq_disk_bytes); - diskann::build_disk_index(data_file_path.c_str(), index_prefix_path.c_str(), params.c_str(), metric); -} - -template -void build_in_memory_index(const diskann::Metric &metric, const std::string &vector_bin_path, - const std::string &index_output_path, const uint32_t graph_degree, const uint32_t complexity, - const float alpha, const uint32_t num_threads, const bool use_pq_build, - const size_t num_pq_bytes, const bool use_opq, const uint32_t filter_complexity, - const bool use_tags = false) -{ - diskann::IndexWriteParameters index_build_params = diskann::IndexWriteParametersBuilder(complexity, graph_degree) - .with_filter_list_size(filter_complexity) - .with_alpha(alpha) - .with_saturate_graph(false) - .with_num_threads(num_threads) - .build(); - size_t data_num, data_dim; - diskann::get_bin_metadata(vector_bin_path, data_num, data_dim); - diskann::Index index(metric, data_dim, data_num, use_tags, use_tags, false, 
use_pq_build, - num_pq_bytes, use_opq); - - if (use_tags) - { - const std::string tags_file = index_output_path + ".tags"; - if (!file_exists(tags_file)) - { - throw std::runtime_error("tags file not found at expected path: " + tags_file); - } - TagT *tags_data; - size_t tag_dims = 1; - diskann::load_bin(tags_file, tags_data, data_num, tag_dims); - std::vector tags(tags_data, tags_data + data_num); - index.build(vector_bin_path.c_str(), data_num, index_build_params, tags); - } - else - { - index.build(vector_bin_path.c_str(), data_num, index_build_params); - } - - index.save(index_output_path.c_str()); -} - -template -inline void add_variant(py::module_ &m, const std::string &build_name, const std::string &class_name) -{ - const std::string build_disk_name = "build_disk_" + build_name + "_index"; - m.def(build_disk_name.c_str(), &build_disk_index2, py::arg("distance_metric"), py::arg("data_file_path"), - py::arg("index_prefix_path"), py::arg("complexity"), py::arg("graph_degree"), - py::arg("final_index_ram_limit"), py::arg("indexing_ram_budget"), py::arg("num_threads"), - py::arg("pq_disk_bytes")); - - const std::string build_in_memory_name = "build_in_memory_" + build_name + "_index"; - m.def(build_in_memory_name.c_str(), &build_in_memory_index, py::arg("distance_metric"), - py::arg("data_file_path"), py::arg("index_output_path"), py::arg("graph_degree"), py::arg("complexity"), - py::arg("alpha"), py::arg("num_threads"), py::arg("use_pq_build"), py::arg("num_pq_bytes"), - py::arg("use_opq"), py::arg("filter_complexity") = 0, py::arg("use_tags") = false); - - const std::string static_index = "StaticMemory" + class_name + "Index"; - py::class_>(m, static_index.c_str()) - .def( - py::init([](const diskann::Metric distance_metric, const std::string &index_path, const size_t num_points, - const size_t dimensions, const uint32_t num_threads, const uint32_t initial_search_complexity) { - return std::unique_ptr>(new StaticInMemIndex( - distance_metric, index_path, 
num_points, dimensions, num_threads, initial_search_complexity)); - }), - py::arg("distance_metric"), py::arg("index_path"), py::arg("num_points"), py::arg("dimensions"), - py::arg("num_threads"), py::arg("initial_search_complexity")) - .def("search", &StaticInMemIndex::search, py::arg("query"), py::arg("knn"), py::arg("complexity")) - .def("batch_search", &StaticInMemIndex::batch_search, py::arg("queries"), py::arg("num_queries"), - py::arg("knn"), py::arg("complexity"), py::arg("num_threads")); - - const std::string dynamic_index = "DynamicMemory" + class_name + "Index"; - py::class_>(m, dynamic_index.c_str()) - .def(py::init([](const diskann::Metric distance_metric, const size_t dimensions, const size_t max_vectors, - const uint32_t complexity, const uint32_t graph_degree, const bool saturate_graph, - const uint32_t max_occlusion_size, const float alpha, const uint32_t num_threads, - const uint32_t filter_complexity, const uint32_t num_frozen_points, - const uint32_t initial_search_complexity, const uint32_t search_threads, - const bool concurrent_consolidation) { - return std::unique_ptr>(new DynamicInMemIndex( - distance_metric, dimensions, max_vectors, complexity, graph_degree, saturate_graph, - max_occlusion_size, alpha, num_threads, filter_complexity, num_frozen_points, - initial_search_complexity, search_threads, concurrent_consolidation)); - }), - py::arg("distance_metric"), py::arg("dimensions"), py::arg("max_vectors"), py::arg("complexity"), - py::arg("graph_degree"), py::arg("saturate_graph") = diskann::defaults::SATURATE_GRAPH, - py::arg("max_occlusion_size") = diskann::defaults::MAX_OCCLUSION_SIZE, - py::arg("alpha") = diskann::defaults::ALPHA, py::arg("num_threads") = diskann::defaults::NUM_THREADS, - py::arg("filter_complexity") = diskann::defaults::FILTER_LIST_SIZE, - py::arg("num_frozen_points") = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC, - py::arg("initial_search_complexity") = 0, py::arg("search_threads") = 0, - 
py::arg("concurrent_consolidation") = true) - .def("search", &DynamicInMemIndex::search, py::arg("query"), py::arg("knn"), py::arg("complexity")) - .def("load", &DynamicInMemIndex::load, py::arg("index_path")) - .def("batch_search", &DynamicInMemIndex::batch_search, py::arg("queries"), py::arg("num_queries"), - py::arg("knn"), py::arg("complexity"), py::arg("num_threads")) - .def("batch_insert", &DynamicInMemIndex::batch_insert, py::arg("vectors"), py::arg("ids"), - py::arg("num_inserts"), py::arg("num_threads")) - .def("save", &DynamicInMemIndex::save, py::arg("save_path") = "", py::arg("compact_before_save") = false) - .def("insert", &DynamicInMemIndex::insert, py::arg("vector"), py::arg("id")) - .def("mark_deleted", &DynamicInMemIndex::mark_deleted, py::arg("id")) - .def("consolidate_delete", &DynamicInMemIndex::consolidate_delete); - - const std::string disk_name = "Disk" + class_name + "Index"; - py::class_>(m, disk_name.c_str()) - .def(py::init([](const diskann::Metric distance_metric, const std::string &index_path_prefix, - const uint32_t num_threads, const size_t num_nodes_to_cache, const uint32_t cache_mechanism) { - return std::unique_ptr>(new DiskIndex(distance_metric, index_path_prefix, num_threads, - num_nodes_to_cache, cache_mechanism)); - }), - py::arg("distance_metric"), py::arg("index_path_prefix"), py::arg("num_threads"), - py::arg("num_nodes_to_cache"), py::arg("cache_mechanism") = 1) - .def("cache_bfs_levels", &DiskIndex::cache_bfs_levels, py::arg("num_nodes_to_cache")) - .def("search", &DiskIndex::search, py::arg("query"), py::arg("knn"), py::arg("complexity"), - py::arg("beam_width")) - .def("batch_search", &DiskIndex::batch_search, py::arg("queries"), py::arg("num_queries"), py::arg("knn"), - py::arg("complexity"), py::arg("beam_width"), py::arg("num_threads")); -} - -PYBIND11_MODULE(_diskannpy, m) -{ - m.doc() = "DiskANN Python Bindings"; -#ifdef VERSION_INFO - m.attr("__version__") = VERSION_INFO; -#else - m.attr("__version__") = "dev"; 
-#endif - - // let's re-export our defaults - py::module_ default_values = m.def_submodule( - "defaults", - "A collection of the default values used for common diskann operations. `GRAPH_DEGREE` and `COMPLEXITY` are not" - " set as defaults, but some semi-reasonable default values are selected for your convenience. We urge you to " - "investigate their meaning and adjust them for your use cases."); - - default_values.attr("ALPHA") = diskann::defaults::ALPHA; - default_values.attr("NUM_THREADS") = diskann::defaults::NUM_THREADS; - default_values.attr("MAX_OCCLUSION_SIZE") = diskann::defaults::MAX_OCCLUSION_SIZE; - default_values.attr("FILTER_COMPLEXITY") = diskann::defaults::FILTER_LIST_SIZE; - default_values.attr("NUM_FROZEN_POINTS_STATIC") = diskann::defaults::NUM_FROZEN_POINTS_STATIC; - default_values.attr("NUM_FROZEN_POINTS_DYNAMIC") = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC; - default_values.attr("SATURATE_GRAPH") = diskann::defaults::SATURATE_GRAPH; - default_values.attr("GRAPH_DEGREE") = diskann::defaults::MAX_DEGREE; - default_values.attr("COMPLEXITY") = diskann::defaults::BUILD_LIST_SIZE; - default_values.attr("PQ_DISK_BYTES") = (uint32_t)0; - default_values.attr("USE_PQ_BUILD") = false; - default_values.attr("NUM_PQ_BYTES") = (uint32_t)0; - default_values.attr("USE_OPQ") = false; - - add_variant(m, "float", "Float"); - add_variant(m, "uint8", "UInt8"); - add_variant(m, "int8", "Int8"); - - py::enum_(m, "Metric") - .value("L2", Metric::L2) - .value("INNER_PRODUCT", Metric::INNER_PRODUCT) - .value("COSINE", Metric::COSINE) - .export_values(); -} diff --git a/python/src/dynamic_memory_index.cpp b/python/src/dynamic_memory_index.cpp new file mode 100644 index 000000000..98cf5b628 --- /dev/null +++ b/python/src/dynamic_memory_index.cpp @@ -0,0 +1,166 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +#include "parameters.h" +#include "dynamic_memory_index.h" + +#include "pybind11/numpy.h" + +namespace diskannpy +{ + +/* Builds the diskann::IndexWriteParameters for a dynamic index by forwarding each argument to the matching IndexWriteParametersBuilder setter. */ diskann::IndexWriteParameters dynamic_index_write_parameters(const uint32_t complexity, const uint32_t graph_degree, + const bool saturate_graph, + const uint32_t max_occlusion_size, const float alpha, + const uint32_t num_threads, + const uint32_t filter_complexity, + const uint32_t num_frozen_points) +{ + return diskann::IndexWriteParametersBuilder(complexity, graph_degree) + .with_saturate_graph(saturate_graph) + .with_max_occlusion_size(max_occlusion_size) + .with_alpha(alpha) + .with_num_threads(num_threads) + .with_filter_list_size(filter_complexity) + .with_num_frozen_points(num_frozen_points) + .build(); +} + +/* Constructs the diskann::Index used by DynamicMemoryIndex: dynamic_index and enable_tags are passed as true, PQ build and OPQ as off. initial_search_threads == 0 means auto. */ template +diskann::Index dynamic_index_builder(const diskann::Metric m, + const diskann::IndexWriteParameters &write_params, + const size_t dimensions, const size_t max_vectors, + const uint32_t initial_search_complexity, + const uint32_t initial_search_threads, + const bool concurrent_consolidation) +{ + const uint32_t _initial_search_threads = /* auto falls back to the processor count, matching batch_insert and batch_search; omp_get_num_threads() reports 1 whenever it is called outside a parallel region */ + initial_search_threads != 0 ? initial_search_threads : omp_get_num_procs(); + return diskann::Index( + m, dimensions, max_vectors, + true, // dynamic_index + write_params, // used for insert + initial_search_complexity, // used to prepare the scratch space for searching. can / may + // be expanded if the search asks for a larger L. + _initial_search_threads, // also used for the scratch space + true, // enable_tags + concurrent_consolidation, + false, // pq_dist_build + 0, // num_pq_chunks + false); // use_opq = false +} + +template +DynamicMemoryIndex
::DynamicMemoryIndex(const diskann::Metric m, const size_t dimensions, const size_t max_vectors, + const uint32_t complexity, const uint32_t graph_degree, + const bool saturate_graph, const uint32_t max_occlusion_size, + const float alpha, const uint32_t num_threads, + const uint32_t filter_complexity, const uint32_t num_frozen_points, + const uint32_t initial_search_complexity, + const uint32_t initial_search_threads, const bool concurrent_consolidation) /* Stores the effective initial search complexity (complexity is used when 0 is passed), builds the write parameters, then constructs the underlying index from both. NOTE(review): the _index initializer reads _write_parameters and _initial_search_complexity, so those members must be declared before _index in the class; confirm in the header. */ + : _initial_search_complexity(initial_search_complexity != 0 ? initial_search_complexity : complexity), + _write_parameters(dynamic_index_write_parameters(complexity, graph_degree, saturate_graph, max_occlusion_size, + alpha, num_threads, filter_complexity, num_frozen_points)), + _index(dynamic_index_builder
(m, _write_parameters, dimensions, max_vectors, _initial_search_complexity, + initial_search_threads, concurrent_consolidation)) +{ +} + +template void DynamicMemoryIndex
::load(const std::string &index_path) +{ /* Loads a saved index from index_path. The companion file named index_path followed by .tags must exist, otherwise std::runtime_error is thrown. The load call receives the thread count from the stored write parameters and the stored initial search complexity. */ + const std::string tags_file = index_path + ".tags"; + if (!file_exists(tags_file)) + { + throw std::runtime_error("tags file not found at expected path: " + tags_file); + } + _index.load(index_path.c_str(), _write_parameters.num_threads, _initial_search_complexity); +} + +template +int DynamicMemoryIndex
::insert(const py::array_t &vector, + const DynamicIdType id) +{ /* Inserts a single vector under the given id and returns the status value of insert_point unchanged. */ + return _index.insert_point(vector.data(), id); +} + +template +py::array_t DynamicMemoryIndex
::batch_insert( + py::array_t &vectors, + py::array_t &ids, const int32_t num_inserts, + const int num_threads) +{ /* Inserts num_inserts vectors inside an OpenMP parallel loop, pairing vectors.data(i) with element i of ids, and returns an array holding one insert_point result per input. num_threads == 0 selects omp_get_num_procs() threads. */ + if (num_threads == 0) + omp_set_num_threads(omp_get_num_procs()); + else + omp_set_num_threads(num_threads); + py::array_t insert_retvals(num_inserts); + +#pragma omp parallel for schedule(dynamic, 1) default(none) shared(num_inserts, insert_retvals, vectors, ids) + for (int32_t i = 0; i < num_inserts; i++) + { + insert_retvals.mutable_data()[i] = _index.insert_point(vectors.data(i), *(ids.data(i))); + } + + return insert_retvals; +} + +template int DynamicMemoryIndex
::mark_deleted(const DynamicIdType id) +{ /* Forwards id to lazy_delete on the underlying index and returns its status value unchanged. */ + return this->_index.lazy_delete(id); +} + +template void DynamicMemoryIndex
::save(const std::string &save_path, const bool compact_before_save) +{ /* Saves the index to save_path, forwarding compact_before_save to the underlying save call. Throws std::runtime_error when save_path is empty. */ + if (save_path.empty()) + { + throw std::runtime_error("A save_path must be provided"); + } + _index.save(save_path.c_str(), compact_before_save); +} + +template +NeighborsAndDistances DynamicMemoryIndex
::search( + py::array_t &query, const uint64_t knn, const uint64_t complexity) +{ /* Runs one search_with_tags query and returns the pair (ids, dists), each allocated with knn elements. empty_vector is only handed to search_with_tags as its last argument and is not read afterwards. */ + py::array_t ids(knn); + py::array_t dists(knn); + std::vector
empty_vector; + _index.search_with_tags(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data(), empty_vector); + return std::make_pair(ids, dists); +} + +template +NeighborsAndDistances DynamicMemoryIndex
::batch_search( + py::array_t &queries, const uint64_t num_queries, const uint64_t knn, + const uint64_t complexity, const uint32_t num_threads) +{ /* Runs num_queries search_with_tags calls inside an OpenMP parallel loop and returns (ids, dists), both allocated as num_queries by knn; query i writes into row i of each. num_threads == 0 selects omp_get_num_procs() threads. NOTE(review): empty_vector is listed as shared and passed by every thread; confirm search_with_tags never writes to it. */ + py::array_t ids({num_queries, knn}); + py::array_t dists({num_queries, knn}); + std::vector
empty_vector; + + if (num_threads == 0) + omp_set_num_threads(omp_get_num_procs()); + else + omp_set_num_threads(static_cast(num_threads)); + +#pragma omp parallel for schedule(dynamic, 1) default(none) \ + shared(num_queries, queries, knn, complexity, ids, dists, empty_vector) + for (int64_t i = 0; i < (int64_t)num_queries; i++) + { + _index.search_with_tags(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i), + empty_vector); + } + + return std::make_pair(ids, dists); +} + +template void DynamicMemoryIndex
::consolidate_delete() +{ /* Calls consolidate_deletes on the underlying index with the stored write parameters. */ + _index.consolidate_deletes(_write_parameters); +} + +template class DynamicMemoryIndex; +template class DynamicMemoryIndex; +template class DynamicMemoryIndex; + +}; // namespace diskannpy diff --git a/python/src/module.cpp b/python/src/module.cpp new file mode 100644 index 000000000..9f7337002 --- /dev/null +++ b/python/src/module.cpp @@ -0,0 +1,133 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include + +#include +#include + +#include "defaults.h" +#include "distance.h" + +#include "builder.h" +#include "dynamic_memory_index.h" +#include "static_disk_index.h" +#include "static_memory_index.h" + +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); + +namespace py = pybind11; +using namespace pybind11::literals; + +struct Variant +{ /* Python-visible names registered for one element type: two builder functions and three index classes. */ + std::string disk_builder_name; + std::string memory_builder_name; + std::string dynamic_memory_index_name; + std::string static_memory_index_name; + std::string static_disk_index_name; +}; + +const Variant FloatVariant{"build_disk_float_index", "build_memory_float_index", "DynamicMemoryFloatIndex", + "StaticMemoryFloatIndex", "StaticDiskFloatIndex"}; + +const Variant UInt8Variant{"build_disk_uint8_index", "build_memory_uint8_index", "DynamicMemoryUInt8Index", + "StaticMemoryUInt8Index", "StaticDiskUInt8Index"}; + +const Variant Int8Variant{"build_disk_int8_index", "build_memory_int8_index", "DynamicMemoryInt8Index", + "StaticMemoryInt8Index", "StaticDiskInt8Index"}; + +template inline void add_variant(py::module_ &m, const Variant &variant) +{ /* Registers the two builder functions and the three index classes on module m under the names carried by variant. */ + m.def(variant.disk_builder_name.c_str(), &diskannpy::build_disk_index, "distance_metric"_a, "data_file_path"_a, + "index_prefix_path"_a, "complexity"_a, "graph_degree"_a, "final_index_ram_limit"_a, "indexing_ram_budget"_a, + "num_threads"_a, "pq_disk_bytes"_a); + + m.def(variant.memory_builder_name.c_str(), 
&diskannpy::build_memory_index, "distance_metric"_a, + "data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "complexity"_a, "alpha"_a, "num_threads"_a, + "use_pq_build"_a, "num_pq_bytes"_a, "use_opq"_a, "filter_complexity"_a = 0, "use_tags"_a = false); + /* Static in-memory index class: constructor, search and batch_search; every argument is required. */ + py::class_>(m, variant.static_memory_index_name.c_str()) + .def(py::init(), + "distance_metric"_a, "index_path"_a, "num_points"_a, "dimensions"_a, "num_threads"_a, + "initial_search_complexity"_a) + .def("search", &diskannpy::StaticMemoryIndex::search, "query"_a, "knn"_a, "complexity"_a) + .def("batch_search", &diskannpy::StaticMemoryIndex::batch_search, "queries"_a, "num_queries"_a, "knn"_a, + "complexity"_a, "num_threads"_a); + /* Dynamic in-memory index class: constructor arguments after graph_degree carry defaults, mostly taken from diskann::defaults. */ + py::class_>(m, variant.dynamic_memory_index_name.c_str()) + .def(py::init(), + "distance_metric"_a, "dimensions"_a, "max_vectors"_a, "complexity"_a, "graph_degree"_a, + "saturate_graph"_a = diskann::defaults::SATURATE_GRAPH, + "max_occlusion_size"_a = diskann::defaults::MAX_OCCLUSION_SIZE, "alpha"_a = diskann::defaults::ALPHA, + "num_threads"_a = diskann::defaults::NUM_THREADS, + "filter_complexity"_a = diskann::defaults::FILTER_LIST_SIZE, + "num_frozen_points"_a = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC, "initial_search_complexity"_a = 0, + "search_threads"_a = 0, "concurrent_consolidation"_a = true) + .def("search", &diskannpy::DynamicMemoryIndex::search, "query"_a, "knn"_a, "complexity"_a) + .def("load", &diskannpy::DynamicMemoryIndex::load, "index_path"_a) + .def("batch_search", &diskannpy::DynamicMemoryIndex::batch_search, "queries"_a, "num_queries"_a, "knn"_a, + "complexity"_a, "num_threads"_a) + .def("batch_insert", &diskannpy::DynamicMemoryIndex::batch_insert, "vectors"_a, "ids"_a, "num_inserts"_a, + "num_threads"_a) + .def("save", &diskannpy::DynamicMemoryIndex::save, "save_path"_a = "", "compact_before_save"_a = false) + .def("insert", &diskannpy::DynamicMemoryIndex::insert, "vector"_a, "id"_a) + .def("mark_deleted", &diskannpy::DynamicMemoryIndex::mark_deleted, 
"id"_a) + .def("consolidate_delete", &diskannpy::DynamicMemoryIndex::consolidate_delete); + /* Static disk index class: cache_mechanism defaults to 1. */ + py::class_>(m, variant.static_disk_index_name.c_str()) + .def(py::init(), + "distance_metric"_a, "index_path_prefix"_a, "num_threads"_a, "num_nodes_to_cache"_a, + "cache_mechanism"_a = 1) + .def("cache_bfs_levels", &diskannpy::StaticDiskIndex::cache_bfs_levels, "num_nodes_to_cache"_a) + .def("search", &diskannpy::StaticDiskIndex::search, "query"_a, "knn"_a, "complexity"_a, "beam_width"_a) + .def("batch_search", &diskannpy::StaticDiskIndex::batch_search, "queries"_a, "num_queries"_a, "knn"_a, + "complexity"_a, "beam_width"_a, "num_threads"_a); +} + +PYBIND11_MODULE(_diskannpy, m) +{ /* Module entry point: sets the module docstring and __version__, fills the defaults submodule, registers the Float, UInt8 and Int8 variants, and exports the Metric enum. */ + m.doc() = "DiskANN Python Bindings"; +#ifdef VERSION_INFO + m.attr("__version__") = VERSION_INFO; +#else + m.attr("__version__") = "dev"; +#endif + + // let's re-export our defaults + py::module_ default_values = m.def_submodule( + "defaults", + "A collection of the default values used for common diskann operations. `GRAPH_DEGREE` and `COMPLEXITY` are not" + " set as defaults, but some semi-reasonable default values are selected for your convenience. 
We urge you to " + "investigate their meaning and adjust them for your use cases."); + + default_values.attr("ALPHA") = diskann::defaults::ALPHA; + default_values.attr("NUM_THREADS") = diskann::defaults::NUM_THREADS; + default_values.attr("MAX_OCCLUSION_SIZE") = diskann::defaults::MAX_OCCLUSION_SIZE; + default_values.attr("FILTER_COMPLEXITY") = diskann::defaults::FILTER_LIST_SIZE; + default_values.attr("NUM_FROZEN_POINTS_STATIC") = diskann::defaults::NUM_FROZEN_POINTS_STATIC; + default_values.attr("NUM_FROZEN_POINTS_DYNAMIC") = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC; + default_values.attr("SATURATE_GRAPH") = diskann::defaults::SATURATE_GRAPH; + default_values.attr("GRAPH_DEGREE") = diskann::defaults::MAX_DEGREE; + default_values.attr("COMPLEXITY") = diskann::defaults::BUILD_LIST_SIZE; + default_values.attr("PQ_DISK_BYTES") = (uint32_t)0; + default_values.attr("USE_PQ_BUILD") = false; + default_values.attr("NUM_PQ_BYTES") = (uint32_t)0; + default_values.attr("USE_OPQ") = false; + + add_variant(m, FloatVariant); + add_variant(m, UInt8Variant); + add_variant(m, Int8Variant); + + py::enum_(m, "Metric") + .value("L2", diskann::Metric::L2) + .value("INNER_PRODUCT", diskann::Metric::INNER_PRODUCT) + .value("COSINE", diskann::Metric::COSINE) + .export_values(); +} diff --git a/python/src/static_disk_index.cpp b/python/src/static_disk_index.cpp new file mode 100644 index 000000000..654f8ec30 --- /dev/null +++ b/python/src/static_disk_index.cpp @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include "static_disk_index.h" + +#include "pybind11/numpy.h" + +namespace diskannpy +{ + +template +StaticDiskIndex
::StaticDiskIndex(const diskann::Metric metric, const std::string &index_path_prefix, + const uint32_t num_threads, const size_t num_nodes_to_cache, + const uint32_t cache_mechanism) /* Loads the index from index_path_prefix and throws std::runtime_error when load returns nonzero. Cache warm-up then depends on cache_mechanism: 1 uses sample queries from the file named index_path_prefix followed by _sample_data.bin, 2 uses cache_bfs_levels, any other value does no caching. */ + : _reader(std::make_shared()), _index(_reader, metric) +{ + int load_success = _index.load(num_threads, index_path_prefix.c_str()); + if (load_success != 0) + { + throw std::runtime_error("index load failed."); + } + if (cache_mechanism == 1) + { + std::string sample_file = index_path_prefix + std::string("_sample_data.bin"); + cache_sample_paths(num_nodes_to_cache, sample_file, num_threads); + } + else if (cache_mechanism == 2) + { + cache_bfs_levels(num_nodes_to_cache); + } +} + +template void StaticDiskIndex
::cache_bfs_levels(const size_t num_nodes_to_cache) +{ /* Asks the index for a node list via cache_bfs_levels(num_nodes_to_cache, node_list) and then loads that list with load_cache_list. */ + std::vector node_list; + _index.cache_bfs_levels(num_nodes_to_cache, node_list); + _index.load_cache_list(node_list); +} + +template +void StaticDiskIndex
::cache_sample_paths(const size_t num_nodes_to_cache, const std::string &warmup_query_file, + const uint32_t num_threads) +{ /* Builds a cache node list from the sample queries in warmup_query_file and loads it. Returns silently, without caching, when the file does not exist. NOTE(review): the literals 15 and 4 are passed straight to generate_cache_list_from_sample_queries; their meaning is not visible here. */ + if (!file_exists(warmup_query_file)) + { + return; + } + + std::vector node_list; + _index.generate_cache_list_from_sample_queries(warmup_query_file, 15, 4, num_nodes_to_cache, num_threads, + node_list); + _index.load_cache_list(node_list); +} + +template +NeighborsAndDistances StaticDiskIndex
::search( + py::array_t &query, const uint64_t knn, const uint64_t complexity, + const uint64_t beam_width) +{ /* Runs one cached_beam_search and returns (ids, dists), each allocated with knn elements. The search writes its ids into u64_ids; they are then cast to unsigned one by one into ids. NOTE(review): u32_ids is allocated but never read in this method. */ + py::array_t ids(knn); + py::array_t dists(knn); + + std::vector u32_ids(knn); + std::vector u64_ids(knn); + diskann::QueryStats stats; + + _index.cached_beam_search(query.data(), knn, complexity, u64_ids.data(), dists.mutable_data(), beam_width, false, + &stats); + + auto r = ids.mutable_unchecked<1>(); + for (uint64_t i = 0; i < knn; ++i) + r(i) = (unsigned)u64_ids[i]; + + return std::make_pair(ids, dists); +} + +template +NeighborsAndDistances StaticDiskIndex
::batch_search( + py::array_t &queries, const uint64_t num_queries, const uint64_t knn, + const uint64_t complexity, const uint64_t beam_width, const uint32_t num_threads) +{ /* Runs num_queries cached_beam_search calls inside an OpenMP parallel loop and returns (ids, dists), both allocated as num_queries by knn. Ids are gathered in u64_ids (knn entries per query) and cast to uint32_t after the loop. num_threads == 0 means auto. */ + py::array_t ids({num_queries, knn}); + py::array_t dists({num_queries, knn}); + + omp_set_num_threads(num_threads != 0 ? static_cast<int>(num_threads) : omp_get_num_procs()); /* omp_set_num_threads needs a positive value, so 0 falls back to the processor count as in the in-memory index classes */ + + std::vector u64_ids(knn * num_queries); + +#pragma omp parallel for schedule(dynamic, 1) default(none) \ + shared(num_queries, queries, knn, complexity, u64_ids, dists, beam_width) + for (int64_t i = 0; i < (int64_t)num_queries; i++) + { + _index.cached_beam_search(queries.data(i), knn, complexity, u64_ids.data() + i * knn, dists.mutable_data(i), + beam_width); + } + + auto r = ids.mutable_unchecked(); + for (uint64_t i = 0; i < num_queries; ++i) + for (uint64_t j = 0; j < knn; ++j) + r(i, j) = (uint32_t)u64_ids[i * knn + j]; + + return std::make_pair(ids, dists); +} + +template class StaticDiskIndex; +template class StaticDiskIndex; +template class StaticDiskIndex; +} // namespace diskannpy \ No newline at end of file diff --git a/python/src/static_memory_index.cpp b/python/src/static_memory_index.cpp new file mode 100644 index 000000000..3bd927174 --- /dev/null +++ b/python/src/static_memory_index.cpp @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include "static_memory_index.h" + +#include "pybind11/numpy.h" + +namespace diskannpy +{ + +template +diskann::Index static_index_builder(const diskann::Metric m, const size_t num_points, + const size_t dimensions, + const uint32_t initial_search_complexity) +{ + if (initial_search_complexity == 0) + { + throw std::runtime_error("initial_search_complexity must be a positive uint32_t"); + } + + return diskann::Index
(m, dimensions, num_points, /* forwarded as (metric, dimensions, num_points); note this order differs from the helper's own (num_points, dimensions) parameter order */ + false, // not a dynamic_index + false, // no enable_tags/ids + false, // no concurrent_consolidate, + false, // pq_dist_build + 0, // num_pq_chunks + false, // use_opq = false + 0); // num_frozen_points +} + +template +StaticMemoryIndex
::StaticMemoryIndex(const diskann::Metric m, const std::string &index_prefix, + const size_t num_points, const size_t dimensions, const uint32_t num_threads, + const uint32_t initial_search_complexity) /* Builds the static index through static_index_builder and then loads it from index_prefix with the chosen thread count and initial_search_complexity. num_threads == 0 means auto. */ + : _index(static_index_builder
(m, num_points, dimensions, initial_search_complexity)) +{ /* auto falls back to the processor count; omp_get_num_threads() reports 1 whenever it is called outside a parallel region */ + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); + _index.load(index_prefix.c_str(), _num_threads, initial_search_complexity); +} + +template +NeighborsAndDistances StaticMemoryIndex
::search( + py::array_t &query, const uint64_t knn, const uint64_t complexity) +{ /* Runs one search on the underlying index and returns (ids, dists), each allocated with knn elements. NOTE(review): empty_vector is declared but never used in this method. */ + py::array_t ids(knn); + py::array_t dists(knn); + std::vector
empty_vector; + _index.search(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data()); + return std::make_pair(ids, dists); +} + +template +NeighborsAndDistances StaticMemoryIndex
::batch_search( + py::array_t &queries, const uint64_t num_queries, const uint64_t knn, + const uint64_t complexity, const uint32_t num_threads) +{ /* Runs num_queries searches inside an OpenMP parallel loop and returns (ids, dists), both allocated as num_queries by knn; query i writes into row i of each. num_threads == 0 means auto and falls back to the processor count, because omp_get_num_threads() reports 1 whenever it is called outside a parallel region. NOTE(review): empty_vector is declared but never used in this method. */ + const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs(); + py::array_t ids({num_queries, knn}); + py::array_t dists({num_queries, knn}); + std::vector
empty_vector; + + omp_set_num_threads(static_cast(_num_threads)); + +#pragma omp parallel for schedule(dynamic, 1) default(none) shared(num_queries, queries, knn, complexity, ids, dists) + for (int64_t i = 0; i < (int64_t)num_queries; i++) + { + _index.search(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i)); + } + + return std::make_pair(ids, dists); +} + +template class StaticMemoryIndex; +template class StaticMemoryIndex; +template class StaticMemoryIndex; + +} // namespace diskannpy \ No newline at end of file