Skip to content

Commit

Permalink
Ported pr microsoft#163 (Optimize neighbor insertion) into the latest…
Browse files Browse the repository at this point in the history
… version of the main branch (microsoft#190)

* Ported pr163 into the latest version of the main branch.
Included the fixes mentioned in the comments of pr163.
NeighborSet uses the Neighbor::operator< for comparisons.
The NeighborSet::insert method prevents duplicate ids from being inserted into the set.

* Updated from pr review. Mostly name changes.
  • Loading branch information
shanewil authored Jan 10, 2023
1 parent 1ba8993 commit bc01112
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 242 deletions.
129 changes: 94 additions & 35 deletions include/neighbor.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ namespace diskann {
struct Neighbor {
unsigned id;
float distance;
bool flag;
bool expanded;

Neighbor() = default;
Neighbor(unsigned id, float distance, bool f)
: id{id}, distance{distance}, flag(f) {

Neighbor(unsigned id, float distance)
: id{id}, distance{distance}, expanded(false) {
}

inline bool operator<(const Neighbor &other) const {
Expand All @@ -30,40 +31,98 @@ namespace diskann {
}
};

static inline unsigned InsertIntoPool(Neighbor *addr, unsigned K,
Neighbor nn) {
// find the location to insert
unsigned left = 0, right = K - 1;
if (nn < addr[left]) {
memmove((char *) &addr[left + 1], &addr[left], K * sizeof(Neighbor));
addr[left] = nn;
return left;
// Invariant: after every `insert` and `closest_unexpanded()`, `_cur` points to
// the first Neighbor which is unexpanded.
class NeighborPriorityQueue {
public:

// Builds an empty queue: size, capacity and cursor are all zero and no
// backing storage is allocated here.
NeighborPriorityQueue() : _size{0}, _capacity{0}, _cur{0} {
}

// Builds an empty queue that keeps at most `capacity` neighbors.
// The backing vector gets `capacity + 1` slots; the extra slot is what
// lets `insert` shift the tail of a full queue one position to the right.
explicit NeighborPriorityQueue(size_t capacity)
    : _size{0}, _capacity{capacity}, _cur{0}, _data(capacity + 1) {
}

// Inserts `nbr` at its sorted position (ordering given by
// Neighbor::operator<), keeping at most `_capacity` entries.
//
// The candidate is dropped when:
//   - the queue has zero capacity,
//   - the queue is full and `nbr` orders after the current last entry, or
//   - the binary search lands on an existing entry with the same id.
// Note that the id check only looks at entries probed by the binary search.
//
// The stored entry is rebuilt from `nbr.id` and `nbr.distance`, so it always
// starts out unexpanded. If it lands in front of the cursor, the cursor is
// moved back to it so that `closest_unexpanded()` returns it next.
void insert(const Neighbor &nbr) {
  // Without this guard, `_size == _capacity == 0` would make the check below
  // index `_data` with a wrapped-around `_size - 1`.
  if (_capacity == 0) {
    return;
  }

  // Full queue and the candidate is worse than the worst kept entry.
  if (_size == _capacity && _data[_size - 1] < nbr) {
    return;
  }

  // Binary search for the first position whose entry orders after `nbr`.
  size_t lo = 0, hi = _size;
  while (lo < hi) {
    const size_t mid = lo + (hi - lo) / 2;
    if (nbr < _data[mid]) {
      hi = mid;
      // Make sure the same id isn't inserted into the set
    } else if (_data[mid].id == nbr.id) {
      return;
    } else {
      lo = mid + 1;
    }
  }

  // Shift the tail one slot to the right. When the queue is full, the last
  // entry moves into the spare slot at index `_capacity` and is thereby
  // dropped, because `_size` is not increased below.
  if (lo < _capacity) {
    std::memmove(&_data[lo + 1], &_data[lo],
                 (_size - lo) * sizeof(Neighbor));
  }
  _data[lo] = {nbr.id, nbr.distance};
  if (_size < _capacity) {
    _size++;
  }
  if (lo < _cur) {
    _cur = lo;
  }
}

// Marks the entry under the cursor as expanded, advances the cursor past
// every consecutive expanded entry, and returns a copy of the entry that
// was just marked (the copy has `expanded == true`).
// No bounds check is done here: the entry at `_cur` is accessed even if
// `_cur` is not less than `_size`.
Neighbor closest_unexpanded() {
  const size_t picked = _cur;
  _data[picked].expanded = true;
  for (; _cur < _size && _data[_cur].expanded; ++_cur) {
  }
  return _data[picked];
}

// True while the cursor still sits on a stored entry, i.e. it has not yet
// reached the end of the populated range.
bool has_unexpanded_node() const {
  return _size > _cur;
}

// Returns the number of neighbors currently stored in the queue.
size_t size() const {
return _size;
}

// Returns the maximum number of neighbors the queue keeps (`_capacity`).
size_t capacity() const {
return _capacity;
}
if (addr[right] < nn) {
addr[K] = nn;
return K;

// Sets the queue capacity to `capacity`, growing the backing storage to
// `capacity + 1` slots when it is currently smaller. Storage is never
// shrunk, and `_size` / `_cur` are left untouched.
void reserve(size_t capacity) {
  const size_t slots_needed = capacity + 1;
  if (_data.size() < slots_needed) {
    _data.resize(slots_needed);
  }
  _capacity = capacity;
}
while (right > 1 && left < right - 1) {
unsigned mid = (left + right) / 2;
if (nn < addr[mid])
right = mid;
else
left = mid;

// Returns a mutable reference to the entry at index `i` of the backing
// storage. The index is not checked against `_size`.
Neighbor &operator[](size_t i) {
return _data[i];
}
// check equal ID

while (left > 0) {
if (addr[left] < nn)
break;
if (addr[left].id == nn.id)
return K + 1;
left--;

// Const overload: returns a copy of the entry at index `i` of the backing
// storage. The index is not checked against `_size`.
Neighbor operator[](size_t i) const {
return _data[i];
}
if (addr[left].id == nn.id || addr[right].id == nn.id)
return K + 1;
memmove((char *) &addr[right + 1], &addr[right],
(K - right) * sizeof(Neighbor));
addr[right] = nn;
return right;
}

// Empties the queue by resetting the cursor and the element count.
// The backing storage and `_capacity` are left as they are.
void clear() {
  _cur = 0;
  _size = 0;
}

private:
size_t _size, _capacity, _cur;
std::vector<Neighbor> _data;
};

} // namespace diskann
6 changes: 3 additions & 3 deletions include/scratch.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace diskann {
// Returns a mutable reference to the `_visited` id set.
inline tsl::robin_set<unsigned> &visited() {
return _visited;
}
std::vector<Neighbor> &best_l_nodes() {
inline NeighborPriorityQueue &best_l_nodes() {
return _best_l_nodes;
}
inline tsl::robin_set<unsigned> &inserted_into_pool_rs() {
Expand Down Expand Up @@ -81,7 +81,7 @@ namespace diskann {
private:
std::vector<Neighbor> _pool;
tsl::robin_set<unsigned> _visited;
std::vector<Neighbor> _best_l_nodes;
NeighborPriorityQueue _best_l_nodes;
tsl::robin_set<unsigned> _inserted_into_pool_rs;
boost::dynamic_bitset<> *_inserted_into_pool_bs;
std::vector<unsigned> _id_scratch;
Expand Down Expand Up @@ -115,7 +115,7 @@ namespace diskann {
PQScratch<T> *_pq_scratch;

tsl::robin_set<_u64> visited;
std::vector<Neighbor> retset;
NeighborPriorityQueue retset;
std::vector<Neighbor> full_retset;

SSDQueryScratch(size_t aligned_dim, size_t visited_reserve);
Expand Down
2 changes: 1 addition & 1 deletion python/src/diskann_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ PYBIND11_MODULE(diskannpy, m) {

py::class_<Neighbor>(m, "Neighbor")
.def(py::init<>())
.def(py::init<unsigned, float, bool>())
.def(py::init<unsigned, float>())
.def(py::self < py::self)
.def(py::self == py::self);

Expand Down
Loading

0 comments on commit bc01112

Please sign in to comment.