Skip to content

Commit

Permalink
Support for using PQ distance comparisons for index build. (#177)
Browse files Browse the repository at this point in the history
* Support for using PQ distance comparisons for index build.

label pq pivots file with chunk size

Use full precision distance comparisons for pruning

added PQ based build option to build memory and disk index

fixing sampling probability calculation in PQ training data generation

documented the PQ build option in workflows

* fix help message in build_memory_index for num_PQ_bytes

* Add CI tests

* Added timers to build_disk_index; simplified include file list

* set maxc in index class to default value to avoid scratch space allocations of undefined sizes
  • Loading branch information
harsha-simhadri authored Jan 5, 2023
1 parent eed88e3 commit 0e95756
Show file tree
Hide file tree
Showing 17 changed files with 484 additions and 262 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/pr-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,18 @@ jobs:
# The search must load the index written by the build command, i.e. the cosine prefix (not the L2 one).
run: |
${{ env.diskann_built_tests }}/build_memory_index --data_type float --dist_fn cosine --data_path ./rand_float_10D_10K_norm1.0.bin --index_path_prefix ./index_cosine_rand_float_10D_10K_norm1.0
${{ env.diskann_built_tests }}/search_memory_index --data_type float --dist_fn cosine --index_path_prefix ./index_cosine_rand_float_10D_10K_norm1.0 --query_file ./rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file ./cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32
- name: build and search in-memory index with L2 metric with PQ based distance comparisons
run: |
${{ env.diskann_built_tests }}/build_memory_index --data_type float --dist_fn l2 --data_path ./rand_float_10D_10K_norm1.0.bin --index_path_prefix ./index_l2_rand_float_10D_10K_norm1.0_buildpq5 --build_PQ_bytes 5
${{ env.diskann_built_tests }}/search_memory_index --data_type float --dist_fn l2 --index_path_prefix ./index_l2_rand_float_10D_10K_norm1.0_buildpq5 --query_file ./rand_float_10D_1K_norm1.0.bin --recall_at 10 --result_path temp --gt_file ./l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 -L 16 32
- name: build and search disk index (one shot graph build, L2, no diskPQ)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type float --dist_fn l2 --data_path ./rand_float_10D_10K_norm1.0.bin --index_path_prefix ./disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
${{ env.diskann_built_tests }}/search_disk_index --data_type float --dist_fn l2 --index_path_prefix ./disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file ./rand_float_10D_1K_norm1.0.bin --gt_file ./l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 10 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, L2, no diskPQ, build with PQ distance comparisons)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type float --dist_fn l2 --data_path ./rand_float_10D_10K_norm1.0.bin --index_path_prefix ./disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot_buildpq5 -R 16 -L 32 -B 0.00003 -M 1 --build_PQ_bytes 5
${{ env.diskann_built_tests }}/search_disk_index --data_type float --dist_fn l2 --index_path_prefix ./disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot_buildpq5 --result_path /tmp/res --query_file ./rand_float_10D_1K_norm1.0.bin --gt_file ./l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 10 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, L2, no diskPQ)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type float --dist_fn l2 --data_path ./rand_float_10D_10K_norm1.0.bin --index_path_prefix ./disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
Expand Down Expand Up @@ -129,10 +137,18 @@ jobs:
# The search must load the index written by the build command, i.e. the cosine prefix (not the L2 one).
run: |
${{ env.diskann_built_tests }}/build_memory_index --data_type int8 --dist_fn cosine --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix ./index_cosine_rand_int8_10D_10K_norm50.0
${{ env.diskann_built_tests }}/search_memory_index --data_type int8 --dist_fn cosine --index_path_prefix ./index_cosine_rand_int8_10D_10K_norm50.0 --query_file ./rand_int8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file ./cosine_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32
- name: build and search in-memory index with L2 metric with PQ based distance comparisons
run: |
${{ env.diskann_built_tests }}/build_memory_index --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix ./index_l2_rand_int8_10D_10K_norm50.0_buildpq5 --build_PQ_bytes 5
${{ env.diskann_built_tests }}/search_memory_index --data_type int8 --dist_fn l2 --index_path_prefix ./index_l2_rand_int8_10D_10K_norm50.0_buildpq5 --query_file ./rand_int8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file ./l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32
- name: build and search disk index (one shot graph build, L2, no diskPQ)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix ./disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
${{ env.diskann_built_tests }}/search_disk_index --data_type int8 --dist_fn l2 --index_path_prefix ./disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot --result_path /tmp/res --query_file ./rand_int8_10D_1K_norm50.0.bin --gt_file ./l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 10 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, L2, no diskPQ, build with PQ distance comparisons)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix ./disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 -R 16 -L 32 -B 0.00003 -M 1 --build_PQ_bytes 5
${{ env.diskann_built_tests }}/search_disk_index --data_type int8 --dist_fn l2 --index_path_prefix ./disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 --result_path /tmp/res --query_file ./rand_int8_10D_1K_norm50.0.bin --gt_file ./l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 10 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, L2, no diskPQ)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type int8 --dist_fn l2 --data_path ./rand_int8_10D_10K_norm50.0.bin --index_path_prefix ./disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
Expand Down Expand Up @@ -168,10 +184,18 @@ jobs:
# The search must load the index written by the build command, i.e. the cosine prefix (not the L2 one).
run: |
${{ env.diskann_built_tests }}/build_memory_index --data_type uint8 --dist_fn cosine --data_path ./rand_uint8_10D_10K_norm50.0.bin --index_path_prefix ./index_cosine_rand_uint8_10D_10K_norm50.0
${{ env.diskann_built_tests }}/search_memory_index --data_type uint8 --dist_fn cosine --index_path_prefix ./index_cosine_rand_uint8_10D_10K_norm50.0 --query_file ./rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file ./cosine_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32
- name: build and search in-memory index with L2 metric with PQ based distance comparisons
run: |
${{ env.diskann_built_tests }}/build_memory_index --data_type uint8 --dist_fn l2 --data_path ./rand_uint8_10D_10K_norm50.0.bin --index_path_prefix ./index_l2_rand_uint8_10D_10K_norm50.0_buildpq5 --build_PQ_bytes 5
${{ env.diskann_built_tests }}/search_memory_index --data_type uint8 --dist_fn l2 --index_path_prefix ./index_l2_rand_uint8_10D_10K_norm50.0_buildpq5 --query_file ./rand_uint8_10D_1K_norm50.0.bin --recall_at 10 --result_path temp --gt_file ./l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 -L 16 32
- name: build and search disk index (one shot graph build, L2, no diskPQ)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type uint8 --dist_fn l2 --data_path ./rand_uint8_10D_10K_norm50.0.bin --index_path_prefix ./disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
${{ env.diskann_built_tests }}/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix ./disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot --result_path /tmp/res --query_file ./rand_uint8_10D_1K_norm50.0.bin --gt_file ./l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 10 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, L2, no diskPQ, build with PQ distance comparisons)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type uint8 --dist_fn l2 --data_path ./rand_uint8_10D_10K_norm50.0.bin --index_path_prefix ./disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 -R 16 -L 32 -B 0.00003 -M 1 --build_PQ_bytes 5
${{ env.diskann_built_tests }}/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix ./disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_oneshot_buildpq5 --result_path /tmp/res --query_file ./rand_uint8_10D_1K_norm50.0.bin --gt_file ./l2_rand_uint8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --recall_at 5 -L 5 10 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, L2, no diskPQ)
run: |
${{ env.diskann_built_tests }}/build_disk_index --data_type uint8 --dist_fn l2 --data_path ./rand_uint8_10D_10K_norm50.0.bin --index_path_prefix ./disk_index_l2_rand_uint8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
Expand Down
8 changes: 6 additions & 2 deletions include/common_includes.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,20 @@
#include <atomic>
#include <cassert>
#include <chrono>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fcntl.h>
#include <fstream>
#include <iostream>
#include <iomanip>
#include <omp.h>
#include <queue>
#include <random>
#include <string.h>
#include <set>
#include <shared_mutex>
#include <sys/stat.h>
#include <sstream>
#include <time.h>
#include <vector>
2 changes: 1 addition & 1 deletion include/disk_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ namespace diskann {
std::string base_file, diskann::Metric _compareMetric, unsigned L,
unsigned R, double sampling_rate, double ram_budget,
std::string mem_index_path, std::string medoids_file,
std::string centroids_file);
std::string centroids_file, size_t build_pq_bytes, bool use_opq);

template<typename T>
DISKANN_DLLEXPORT uint32_t optimize_beamwidth(
Expand Down
44 changes: 30 additions & 14 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@

#pragma once

#include <atomic>
#include <cassert>
#include <shared_mutex>
#include <sstream>
#include <string>
#include "common_includes.h"

#ifdef EXEC_ENV_OLS
#include "aligned_file_reader.h"
Expand All @@ -26,6 +22,7 @@
#define GRAPH_SLACK_FACTOR 1.3
#define OVERHEAD_FACTOR 1.1
#define EXPAND_IF_FULL 0
#define DEFAULT_MAXC 750

namespace diskann {
inline double estimate_ram_usage(_u64 size, _u32 dim, _u32 datasize,
Expand Down Expand Up @@ -72,15 +69,21 @@ namespace diskann {
const size_t max_points = 1,
const bool dynamic_index = false,
const bool enable_tags = false,
const bool concurrent_consolidate = false);
const bool concurrent_consolidate = false,
const bool pq_dist_build = false,
const size_t num_pq_chunks = 0,
const bool use_opq = false);

// Constructor for incremental index
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
const bool dynamic_index,
const Parameters &indexParameters,
const Parameters &searchParameters,
const bool enable_tags = false,
const bool concurrent_consolidate = false);
const bool concurrent_consolidate = false,
const bool pq_dist_build = false,
const size_t num_pq_chunks = 0,
const bool use_opq = false);

DISKANN_DLLEXPORT ~Index();

Expand Down Expand Up @@ -226,21 +229,25 @@ namespace diskann {
InMemQueryScratch<T> *scratch);

void prune_neighbors(const unsigned location, std::vector<Neighbor> &pool,
std::vector<unsigned> &pruned_list);
std::vector<unsigned> &pruned_list,
InMemQueryScratch<T> *scratch);

void prune_neighbors(const unsigned location, std::vector<Neighbor> &pool,
const _u32 range, const _u32 max_candidate_size,
const float alpha, std::vector<unsigned> &pruned_list);
const float alpha, std::vector<unsigned> &pruned_list,
InMemQueryScratch<T> *scratch);

void occlude_list(std::vector<Neighbor> &pool, const float alpha,
const unsigned degree, const unsigned maxc,
std::vector<Neighbor> &result);
std::vector<Neighbor> &result,
InMemQueryScratch<T> *scratch);

// add reverse links from all the visited nodes to node n.
void inter_insert(unsigned n, std::vector<unsigned> &pruned_list,
const _u32 range);
const _u32 range, InMemQueryScratch<T> *scratch);

void inter_insert(unsigned n, std::vector<unsigned> &pruned_list);
void inter_insert(unsigned n, std::vector<unsigned> &pruned_list,
InMemQueryScratch<T> *scratch);

void link(Parameters &parameters);

Expand All @@ -265,10 +272,11 @@ namespace diskann {
// deleted neighbors Acquire _locks[i] prior to calling for thread-safety
void process_delete(const tsl::robin_set<unsigned> &old_delete_set,
size_t i, const unsigned &range, const unsigned &maxc,
const float &alpha);
const float &alpha, InMemQueryScratch<T> *scratch);

void initialize_query_scratch(uint32_t num_threads, uint32_t search_l,
uint32_t indexing_l, uint32_t r, size_t dim);
uint32_t indexing_l, uint32_t r,
uint32_t maxc, size_t dim);

// Do not call without acquiring appropriate locks
// call public member functions save and load to invoke these.
Expand Down Expand Up @@ -340,6 +348,14 @@ namespace diskann {
tsl::robin_set<unsigned> _delete_set;
natural_number_set<unsigned> _empty_slots;

// Flags for PQ based distance calculation
bool _pq_dist = false;
bool _use_opq = false;
size_t _num_pq_chunks = 0;
_u8 *_pq_data = nullptr;
bool _pq_generated = false;
FixedChunkPQTable _pq_table;

bool _lazy_done = false; // true if lazy deletions have been made
bool _data_compacted = true; // true if data has been compacted
bool _is_saved = false; // Gopal. Checking if the index is already saved.
Expand Down
10 changes: 4 additions & 6 deletions include/pq_flash_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,7 @@
// Licensed under the MIT license.

#pragma once
#include <cassert>
#include <sstream>
#include <stack>
#include <string>
#include "tsl/robin_map.h"
#include "tsl/robin_set.h"
#include "common_includes.h"

#include "aligned_file_reader.h"
#include "concurrent_queue.h"
Expand All @@ -18,6 +13,9 @@
#include "utils.h"
#include "windows_customizations.h"
#include "scratch.h"
#include "tsl/robin_map.h"
#include "tsl/robin_set.h"


#define FULL_PRECISION_REORDER_MULTIPLIER 3

Expand Down
46 changes: 31 additions & 15 deletions include/scratch.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,50 +34,66 @@ namespace diskann {

~InMemQueryScratch();
InMemQueryScratch(uint32_t search_l, uint32_t indexing_l, uint32_t r,
size_t dim);
uint32_t maxc, size_t dim, bool init_pq_scratch = false);
void resize_for_query(uint32_t new_search_l);
void clear();

std::vector<Neighbor> &pool() {
inline std::vector<Neighbor> &pool() {
return _pool;
}
std::vector<unsigned> &des() {
return _des;
}
tsl::robin_set<unsigned> &visited() {
inline tsl::robin_set<unsigned> &visited() {
return _visited;
}
std::vector<Neighbor> &best_l_nodes() {
return _best_l_nodes;
}
tsl::robin_set<unsigned> &inserted_into_pool_rs() {
inline tsl::robin_set<unsigned> &inserted_into_pool_rs() {
return _inserted_into_pool_rs;
}
boost::dynamic_bitset<> &inserted_into_pool_bs() {
inline boost::dynamic_bitset<> &inserted_into_pool_bs() {
return *_inserted_into_pool_bs;
}
inline std::vector<unsigned> &id_scratch() {
return _id_scratch;
}
inline float *dist_scratch() {
return _dist_scratch;
}

T *aligned_query() {
return this->_aligned_query;
inline T *aligned_query() {
return _aligned_query;
}
uint32_t *indices() {
return this->_indices;
inline uint32_t *indices() {
return _indices;
}
float *interim_dists() {
return this->_interim_dists;
inline float *interim_dists() {
return _interim_dists;
}

inline std::vector<float> &occlude_factor() {
return _occlude_factor;
}

inline PQScratch<T> *pq_scratch() {
return _pq_scratch;
}

private:
std::vector<Neighbor> _pool;
tsl::robin_set<unsigned> _visited;
std::vector<unsigned> _des;
std::vector<Neighbor> _best_l_nodes;
tsl::robin_set<unsigned> _inserted_into_pool_rs;
boost::dynamic_bitset<> *_inserted_into_pool_bs;
std::vector<unsigned> _id_scratch;
float *_dist_scratch = nullptr;

T *_aligned_query = nullptr;
uint32_t *_indices = nullptr;
float *_interim_dists = nullptr;

std::vector<float> _occlude_factor;

PQScratch<T> *_pq_scratch = nullptr;
};

//
Expand Down
9 changes: 9 additions & 0 deletions include/timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,14 @@ namespace diskann {
_clock::now() - check_point)
.count();
}

// Returns the value reported by elapsed(), scaled down by 1e6 and narrowed
// to float.
// NOTE(review): the 1e6 divisor implies elapsed() reports microseconds —
// confirm against its definition.
// The division is carried out in double and the result is narrowed exactly
// once, with named casts, so the raw count is not rounded to float's 24-bit
// mantissa before scaling.
float elapsed_seconds() const {
  return static_cast<float>(static_cast<double>(elapsed()) / 1000000.0);
}

// Formats a one-line timing report for the given step label:
//   "Time for <step>: <elapsed_seconds()> seconds"
// The seconds value is rendered with std::to_string.
std::string elapsed_seconds_for_step(const std::string& step) const {
  std::string report("Time for ");
  report += step;
  report += ": ";
  report += std::to_string(elapsed_seconds());
  report += " seconds";
  return report;
}
};
} // namespace diskann
18 changes: 2 additions & 16 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,10 @@
// Licensed under the MIT license.

#pragma once
#include <fcntl.h>
#include <algorithm>
#include <errno.h>

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits.h>

#include <string>
#include <memory>
#include <random>
#include <set>
#include <sstream>
#include <string.h>
#include "common_includes.h"

#ifdef __APPLE__
#else
#include <malloc.h>
Expand All @@ -36,7 +23,6 @@ typedef int FileHandle;
#include "logger.h"
#include "cached_io.h"
#include "ann_exception.h"
#include "common_includes.h"
#include "windows_customizations.h"
#include "tsl/robin_set.h"

Expand Down
Loading

0 comments on commit 0e95756

Please sign in to comment.