Skip to content

Commit

Permalink
Harshasi/reorg scratch simplify build rebase with main (microsoft#152)
Browse files Browse the repository at this point in the history
* move random data slice to generate (disk) quantized data

* moved in-memory scratch allocation to new files, build now uses pre-allocated scratch space, simplified call hierarhcy for build

* remove eager delete support

* fixing bug in search_for_... function

* removing unused functions from
index class: prune_all_nodes and batch_inter_insert

* eliminate some variable from iterate_to_fixed_point and rename the iteration varaibles to be more readable

* change scratch space manager for index class to pass pointers instead of copies of scratch space

* moved scratch space related code for PQ flash index to scratch.h/cpp

* change scratch store manger signnature to make it agnostic to the scratch type

* Rewrote in-memory scratch space class

* use scratch store manager for pq flash index

* change to unit tester, and minor edit in pq_flash

* fixed mips normalization issue

* rearranged 2 lines

Co-authored-by: ravishankar <[email protected]>
  • Loading branch information
harsha-simhadri and ravishankar authored Nov 22, 2022
1 parent 2e1e328 commit 193e746
Show file tree
Hide file tree
Showing 22 changed files with 720 additions and 1,369 deletions.
6 changes: 1 addition & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,7 @@
# Some variables like MSVC are defined only after project(), so put that first.
project(diskann)

if(MSVC)
cmake_minimum_required(VERSION 3.15)
else()
cmake_minimum_required(VERSION 2.8)
endif()
cmake_minimum_required(VERSION 3.15)

set(CMAKE_STANDARD 14)

Expand Down
4 changes: 4 additions & 0 deletions include/common_includes.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#pragma once

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cmath>
Expand All @@ -14,5 +17,6 @@
#include <random>
#include <string.h>
#include <sys/stat.h>
#include <sstream>
#include <time.h>
#include <vector>
1 change: 0 additions & 1 deletion include/disk_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ typedef int FileHandle;
#include "windows_customizations.h"

namespace diskann {
const size_t MAX_PQ_TRAINING_SET_SIZE = 256000;
const size_t MAX_SAMPLE_POINTS_FOR_WARMUP = 100000;
const double PQ_TRAINING_SET_FRACTION = 0.1;
const double SPACE_FOR_CACHED_NODES_IN_GB = 0.25;
Expand Down
156 changes: 25 additions & 131 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,23 @@

#include <atomic>
#include <cassert>
#include <map>
#include <shared_mutex>
#include <sstream>
#include <stack>
#include <string>
#include "tsl/robin_set.h"
#include "tsl/robin_map.h"
#include "tsl/sparse_map.h"

#ifdef EXEC_ENV_OLS
#include "aligned_file_reader.h"
#endif

#include "boost_dynamic_bitset_fwd.h"
#include "distance.h"
#include "locking.h"
#include "natural_number_map.h"
#include "natural_number_set.h"
#include "neighbor.h"
#include "parameters.h"
#include "utils.h"
#include "concurrent_queue.h"
#include "windows_customizations.h"
#include "scratch.h"

#define GRAPH_SLACK_FACTOR 1.3
#define OVERHEAD_FACTOR 1.1
Expand All @@ -46,49 +40,6 @@ namespace diskann {
size_of_outer_vector);
}

template<typename T>
struct InMemQueryScratch {
std::vector<Neighbor> * _pool = nullptr;
tsl::robin_set<unsigned> *_visited = nullptr;
std::vector<unsigned> * _des = nullptr;
std::vector<Neighbor> * _best_l_nodes = nullptr;
tsl::robin_set<unsigned> *_inserted_into_pool_rs = nullptr;
boost::dynamic_bitset<> * _inserted_into_pool_bs = nullptr;

T * aligned_query = nullptr;
uint32_t *indices = nullptr;
float * interim_dists = nullptr;

uint32_t search_l;
uint32_t indexing_l;
uint32_t r;

InMemQueryScratch();
void setup(uint32_t search_l, uint32_t indexing_l, uint32_t r, size_t dim);
void clear();
void resize_for_query(uint32_t new_search_l);
void destroy();

std::vector<Neighbor> &pool() {
return *_pool;
}
std::vector<unsigned> &des() {
return *_des;
}
tsl::robin_set<unsigned> &visited() {
return *_visited;
}
std::vector<Neighbor> &best_l_nodes() {
return *_best_l_nodes;
}
tsl::robin_set<unsigned> &inserted_into_pool_rs() {
return *_inserted_into_pool_rs;
}
boost::dynamic_bitset<> &inserted_into_pool_bs() {
return *_inserted_into_pool_bs;
}
};

struct consolidation_report {
enum status_code {
SUCCESS = 0,
Expand Down Expand Up @@ -121,7 +72,6 @@ namespace diskann {
const size_t max_points = 1,
const bool dynamic_index = false,
const bool enable_tags = false,
const bool support_eager_delete = false,
const bool concurrent_consolidate = false);

// Constructor for incremental index
Expand All @@ -130,7 +80,6 @@ namespace diskann {
const Parameters &indexParameters,
const Parameters &searchParameters,
const bool enable_tags = false,
const bool support_eager_delete = false,
const bool concurrent_consolidate = false);

DISKANN_DLLEXPORT ~Index();
Expand All @@ -144,8 +93,8 @@ namespace diskann {
DISKANN_DLLEXPORT void load(AlignedFileReader &reader, uint32_t num_threads,
uint32_t search_l);
#else
DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads,
uint32_t search_l);
DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads,
uint32_t search_l);
#endif

// get some private variables
Expand All @@ -155,18 +104,18 @@ namespace diskann {
// Batch build from a file. Optionally pass tags vector.
DISKANN_DLLEXPORT void build(
const char *filename, const size_t num_points_to_load,
Parameters & parameters,
Parameters &parameters,
const std::vector<TagT> &tags = std::vector<TagT>());

// Batch build from a file. Optionally pass tags file.
DISKANN_DLLEXPORT void build(const char * filename,
DISKANN_DLLEXPORT void build(const char *filename,
const size_t num_points_to_load,
Parameters & parameters,
const char * tag_filename);
Parameters &parameters,
const char *tag_filename);

// Batch build from a data array, which must pad vectors to aligned_dim
DISKANN_DLLEXPORT void build(const T *data, const size_t num_points_to_load,
Parameters & parameters,
Parameters &parameters,
const std::vector<TagT> &tags);

// Set starting point of an index before inserting any points incrementally
Expand All @@ -192,7 +141,7 @@ namespace diskann {
// Initialize space for res_vectors before calling.
DISKANN_DLLEXPORT size_t search_with_tags(const T *query, const uint64_t K,
const unsigned L, TagT *tags,
float * distances,
float *distances,
std::vector<T *> &res_vectors);

// Will fail if tag already in the index or if tag=0.
Expand All @@ -202,30 +151,20 @@ namespace diskann {
DISKANN_DLLEXPORT int enable_delete();

// Record deleted point now and restructure graph later. Return -1 if tag
// not found, 0 if OK. Do not call if _eager_delete was called earlier and
// data was not consolidated
// not found, 0 if OK.
DISKANN_DLLEXPORT int lazy_delete(const TagT &tag);

// Record deleted points now and restructure graph later. Add to failed_tags
// if tag not found. Do not call if _eager_delete was called earlier and
// data was not consolidated.
// if tag not found.
DISKANN_DLLEXPORT void lazy_delete(const std::vector<TagT> &tags,
std::vector<TagT> & failed_tags);
std::vector<TagT> &failed_tags);

// Call after a series of lazy deletions
// Returns number of live points left after consolidation
// If _conc_consolidates is set in the ctor, then this call can be invoked
// alongside inserts and lazy deletes, else it acquires _update_lock
DISKANN_DLLEXPORT consolidation_report
consolidate_deletes(const Parameters &parameters);

// Delete point from graph and restructure it immediately. Do not call if
// _lazy_delete was called earlier and data was not consolidated
DISKANN_DLLEXPORT int eager_delete(const TagT tag,
const Parameters &parameters,
int delete_mode = 1);

DISKANN_DLLEXPORT void prune_all_nbrs(const Parameters &parameters);
consolidate_deletes(const Parameters &parameters);

DISKANN_DLLEXPORT bool is_index_saved();

Expand Down Expand Up @@ -262,7 +201,7 @@ namespace diskann {
Index<T, TagT> &operator=(const Index<T, TagT> &) = delete;

// Use after _data and _nd have been populated
void build_with_data_populated(Parameters & parameters,
void build_with_data_populated(Parameters &parameters,
const std::vector<TagT> &tags);

// generates 1 frozen point that will never be deleted from the graph
Expand All @@ -272,47 +211,19 @@ namespace diskann {
// determines navigating node of the graph by calculating medoid of data
unsigned calculate_entry_point();

// called only when _eager_delete is to be supported
void update_in_graph();

template<typename IDType>
std::pair<uint32_t, uint32_t> search_impl(const T *query, const size_t K,
const unsigned L, IDType *indices,
float * distances,
InMemQueryScratch<T> &scratch);
float *distances,
InMemQueryScratch<T> *scratch);

std::pair<uint32_t, uint32_t> iterate_to_fixed_point(
const T *node_coords, const unsigned Lindex,
const std::vector<unsigned> &init_ids,
std::vector<Neighbor> & expanded_nodes_info,
tsl::robin_set<unsigned> & expanded_nodes_ids,
std::vector<Neighbor> &best_L_nodes, std::vector<unsigned> &des,
tsl::robin_set<unsigned> &inserted_into_pool_rs,
boost::dynamic_bitset<> &inserted_into_pool_bs, bool ret_frozen = true,
bool search_invocation = false);

void get_expanded_nodes(const size_t node, const unsigned Lindex,
std::vector<unsigned> init_ids,
std::vector<Neighbor> & expanded_nodes_info,
tsl::robin_set<unsigned> &expanded_nodes_ids,
std::vector<unsigned> & des,
std::vector<Neighbor> & best_L_nodes,
tsl::robin_set<unsigned> &inserted_into_pool_rs,
boost::dynamic_bitset<> & inserted_into_pool_bs);

// get_expanded_nodes for insertion. Must investigate to see if perf can
// be improved here as well using the same technique as above.
void get_expanded_nodes(const size_t node_id, const unsigned Lindex,
std::vector<unsigned> init_ids,
std::vector<Neighbor> & expanded_nodes_info,
tsl::robin_set<unsigned> &expanded_nodes_ids);

void search_for_point_and_add_links(
int location, _u32 Lindex, std::vector<Neighbor> &pool,
tsl::robin_set<unsigned> &visited, std::vector<unsigned> &des,
std::vector<Neighbor> & best_L_nodes,
tsl::robin_set<unsigned> &inserted_into_pool_rs,
boost::dynamic_bitset<> & inserted_into_pool_bs);
const std::vector<unsigned> &init_ids, InMemQueryScratch<T> *scratch,
bool ret_frozen = true, bool search_invocation = false);

void search_for_point_and_add_links(int location, _u32 Lindex,
InMemQueryScratch<T> *scratch);

void prune_neighbors(const unsigned location, std::vector<Neighbor> &pool,
std::vector<unsigned> &pruned_list);
Expand All @@ -325,22 +236,11 @@ namespace diskann {
const unsigned degree, const unsigned maxc,
std::vector<Neighbor> &result);

// add reverse links from all the visited nodes to node n.
void batch_inter_insert(unsigned n,
const std::vector<unsigned> &pruned_list,
const _u32 range,
std::vector<unsigned> & need_to_sync);

void batch_inter_insert(unsigned n,
const std::vector<unsigned> &pruned_list,
std::vector<unsigned> & need_to_sync);

// add reverse links from all the visited nodes to node n.
void inter_insert(unsigned n, std::vector<unsigned> &pruned_list,
const _u32 range, bool update_in_graph);
const _u32 range);

void inter_insert(unsigned n, std::vector<unsigned> &pruned_list,
bool update_in_graph);
void inter_insert(unsigned n, std::vector<unsigned> &pruned_list);

void link(Parameters &parameters);

Expand Down Expand Up @@ -396,12 +296,11 @@ namespace diskann {
Distance<T> *_distance = nullptr;

// Data
T * _data = nullptr;
T *_data = nullptr;
char *_opt_graph;

// Graph related data structures
std::vector<std::vector<unsigned>> _final_graph;
std::vector<std::vector<unsigned>> _in_graph;

// Dimensions
size_t _dim = 0;
Expand Down Expand Up @@ -432,7 +331,7 @@ namespace diskann {
uint32_t _search_queue_size;

// Query scratch data structures
ConcurrentQueue<InMemQueryScratch<T>> _query_scratch;
ConcurrentQueue<InMemQueryScratch<T> *> _query_scratch;

// data structures, flags and locks for dynamic indexing
tsl::sparse_map<TagT, unsigned> _tag_to_location;
Expand All @@ -441,18 +340,13 @@ namespace diskann {
tsl::robin_set<unsigned> _delete_set;
natural_number_set<unsigned> _empty_slots;

bool _support_eager_delete =
false; // Enables in-graph, requires more space

bool _eager_done = false; // true if eager deletions have been made
bool _lazy_done = false; // true if lazy deletions have been made
bool _data_compacted = true; // true if data has been compacted
bool _is_saved = false; // Gopal. Checking if the index is already saved.
bool _conc_consolidate = false; // use _lock while searching

// Per node lock, cardinality=max_points_
std::vector<non_recursive_mutex> _locks;
std::vector<non_recursive_mutex> _locks_in;

// If acquiring multiple locks below, acquire locks in the order below
std::shared_timed_mutex // RW mutex between save/load (exclusive lock) and
Expand Down
1 change: 1 addition & 0 deletions include/natural_number_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <memory>
#include <type_traits>
#include <vector>

#include "boost_dynamic_bitset_fwd.h"

Expand Down
Loading

0 comments on commit 193e746

Please sign in to comment.