Skip to content

Commit dd1bdb7

Browse files
authored
Merge pull request #434 from nmslib/v07release
adding release notes, bumping the version
2 parents 2c6f244 + 488ab52 commit dd1bdb7

20 files changed

+805
-61
lines changed

.github/workflows/build.yml

+7-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
- name: Test
2222
timeout-minutes: 15
2323
run: |
24-
python -m unittest discover -v --start-directory examples --pattern "example*.py"
24+
python -m unittest discover -v --start-directory examples/python --pattern "example*.py"
2525
python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"
2626
2727
test_cpp:
@@ -61,6 +61,12 @@ jobs:
6161
if [ "$RUNNER_OS" == "Windows" ]; then
6262
cp ./Release/* ./
6363
fi
64+
./example_search
65+
./example_filter
66+
./example_replace_deleted
67+
./example_mt_search
68+
./example_mt_filter
69+
./example_mt_replace_deleted
6470
./searchKnnCloserFirst_test
6571
./searchKnnWithFilter_test
6672
./multiThreadLoad_test

CMakeLists.txt

+20
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,26 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
1616
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
1717
endif()
1818

19+
# examples
20+
add_executable(example_search examples/cpp/example_search.cpp)
21+
target_link_libraries(example_search hnswlib)
22+
23+
add_executable(example_filter examples/cpp/example_filter.cpp)
24+
target_link_libraries(example_filter hnswlib)
25+
26+
add_executable(example_replace_deleted examples/cpp/example_replace_deleted.cpp)
27+
target_link_libraries(example_replace_deleted hnswlib)
28+
29+
add_executable(example_mt_search examples/cpp/example_mt_search.cpp)
30+
target_link_libraries(example_mt_search hnswlib)
31+
32+
add_executable(example_mt_filter examples/cpp/example_mt_filter.cpp)
33+
target_link_libraries(example_mt_filter hnswlib)
34+
35+
add_executable(example_mt_replace_deleted examples/cpp/example_mt_replace_deleted.cpp)
36+
target_link_libraries(example_mt_replace_deleted hnswlib)
37+
38+
# tests
1939
add_executable(test_updates tests/cpp/updates_test.cpp)
2040
target_link_libraries(test_updates hnswlib)
2141

README.md

+29-25
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,22 @@
11
# Hnswlib - fast approximate nearest neighbor search
2-
Header-only C++ HNSW implementation with python bindings.
2+
Header-only C++ HNSW implementation with python bindings, insertions and updates.
33

44
**NEWS:**
55

6+
**version 0.7.0**
67

7-
**version 0.6.2**
8-
9-
* Fixed a bug in saving of large pickles. The pickles with > 4GB could have been corrupted. Thanks Kai Wohlfahrt for reporting.
10-
* Thanks to ([@GuyAv46](https://github.com/GuyAv46)) hnswlib inner product now is more consitent accross architectures (SSE, AVX, etc).
11-
*
12-
13-
**version 0.6.1**
14-
15-
* Thanks to ([@tony-kuo](https://github.com/tony-kuo)) hnswlib AVX512 and AVX builds are not backwards-compatible with older SSE and non-AVX512 architectures.
16-
* Thanks to ([@psobot](https://github.com/psobot)) there is now a sencible message instead of segfault when passing a scalar to get_items.
17-
* Thanks to ([@urigoren](https://github.com/urigoren)) hnswlib has a lazy index creation python wrapper.
18-
19-
**version 0.6.0**
20-
* Thanks to ([@dyashuni](https://github.com/dyashuni)) hnswlib now uses github actions for CI, there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also a part of the python interface (note now it throws an exception for double deletions).
21-
* Thanks to ([@slice4e](https://github.com/slice4e)) we now support AVX512; thanks to ([@LTLA](https://github.com/LTLA)) the cmake interface for the lib is now updated.
22-
* Thanks to ([@alonre24](https://github.com/alonre24)) we now have a python bindings for brute-force (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md).
23-
* Thanks to ([@dorosy-yeong](https://github.com/dorosy-yeong)) there is a bug fixed in the handling large quantities of deleted elements and large K.
24-
25-
8+
* Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
9+
* Added python interface for filtering (though note its performance is limited by GIL) (#417) by [@gtsoukas](https://github.com/gtsoukas)
10+
* Added support for replacing the elements that were marked as deleted with newly inserted elements (to control the size of the index, #418) by [@dyashuni](https://github.com/dyashuni)
11+
* Fixed data races/deadlocks in updates/insertion, added stress test for multithreaded operation (#418) by [@dyashuni](https://github.com/dyashuni)
12+
* Documentation, tests, exception handling, refactoring (#375, #379, #380, #395, #396, #401, #406, #404, #409, #410, #416, #415, #431, #432, #433) by [@jlmelville](https://github.com/jlmelville), [@dyashuni](https://github.com/dyashuni), [@kishorenc](https://github.com/kishorenc), [@korzhenevski](https://github.com/korzhenevski), [@yoshoku](https://github.com/yoshoku), [@jianshu93](https://github.com/jianshu93), [@PLNech](https://github.com/PLNech)
13+
* global linkages (#383) by [@MasterAler](https://github.com/MasterAler), USE_SSE usage in MSVC (#408) by [@alxvth](https://github.com/alxvth)
2614

2715

2816
### Highlights:
2917
1) Lightweight, header-only, no dependencies other than C++ 11
30-
2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw).
31-
3) Has full support for incremental index construction. Has support for element deletions
18+
2) Interfaces for C++, Python, external support for Java and R (https://github.com/jlmelville/rcpphnsw).
19+
3) Has full support for incremental index construction and updating the elements. Has support for element deletions
3220
(by marking them in index). Index is picklable.
3321
4) Can work with custom user defined distances (C++).
3422
5) Significantly less memory footprint and faster build time compared to current nmslib's implementation.
@@ -50,7 +38,7 @@ Note that inner product is not an actual metric. An element can be closer to som
5038

5139
For other spaces use the nmslib library https://github.com/nmslib/nmslib.
5240

53-
#### Short API description
41+
#### API description
5442
* `hnswlib.Index(space, dim)` creates a non-initialized HNSW index in space `space` with integer dimension `dim`.
5543

5644
`hnswlib.Index` methods:
@@ -123,7 +111,12 @@ Properties of `hnswlib.Index` that support reading and writing:
123111
124112

125113
#### Python bindings examples
126-
[See more examples here](examples/EXAMPLES.md)
114+
[See more examples here](examples/python/EXAMPLES.md):
115+
* Creating index, inserting elements, searching, serialization/deserialization
116+
* Filtering during the search with a boolean function
117+
* Deleting the elements and reusing the memory of the deleted elements for newly added elements
118+
119+
An example of creating index, inserting elements, searching and pickle serialization:
127120
```python
128121
import hnswlib
129122
import numpy as np
@@ -230,6 +223,14 @@ labels, distances = p.knn_query(data, k=1)
230223
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
231224
```
232225

226+
#### C++ examples
227+
[See examples here](examples/cpp/EXAMPLES.md):
228+
* creating index, inserting elements, searching, serialization/deserialization
229+
* filtering during the search with a boolean function
230+
* deleting the elements and reusing the memory of the deleted elements for newly added elements
231+
* multithreaded usage
232+
233+
233234
### Bindings installation
234235

235236
You can install from sources:
@@ -263,14 +264,17 @@ https://github.com/facebookresearch/faiss
263264
["Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors"](https://arxiv.org/abs/1802.02422)
264265
(current state-of-the-art in compressed indexes, C++):
265266
https://github.com/dbaranchuk/ivf-hnsw
267+
* Amazon PECOS https://github.com/amzn/pecos
266268
* TOROS N2 (python, C++): https://github.com/kakao/n2
267269
* Online HNSW (C++): https://github.com/andrusha97/online-hnsw
268270
* Go implementation: https://github.com/Bithack/go-hnsw
269271
* Python implementation (as a part of the clustering code by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering
272+
* Julia implementation https://github.com/JuliaNeighbors/HNSW.jl
270273
* Java implementation: https://github.com/jelmerk/hnswlib
271274
* Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna
272-
* .Net implementation: https://github.com/microsoft/HNSW.Net
275+
* .Net implementation: https://github.com/curiosity-ai/hnsw-sharp
273276
* CUDA implementation: https://github.com/js1010/cuhnsw
277+
* Rust implementation https://github.com/rust-cv/hnsw
274278
* Rust implementation for memory and thread safety purposes, with a trait that enables users to implement their own distances. It takes as data slices of types T satisfying T:Serialize+Clone+Send+Sync: https://github.com/jean-pierreBoth/hnswlib-rs
275279

276280
### 200M SIFT test reproduction

examples/cpp/EXAMPLES.md

+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# C++ examples
2+
3+
Creating index, inserting elements, searching and serialization
4+
```cpp
5+
#include "../../hnswlib/hnswlib.h"
6+
7+
8+
int main() {
9+
int dim = 16; // Dimension of the elements
10+
int max_elements = 10000; // Maximum number of elements, should be known beforehand
11+
int M = 16; // Tightly connected with internal dimensionality of the data
12+
// strongly affects the memory consumption
13+
int ef_construction = 200; // Controls index search speed/build speed tradeoff
14+
15+
// Initing index
16+
hnswlib::L2Space space(dim);
17+
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction);
18+
19+
// Generate random data
20+
std::mt19937 rng;
21+
rng.seed(47);
22+
std::uniform_real_distribution<> distrib_real;
23+
float* data = new float[dim * max_elements];
24+
for (int i = 0; i < dim * max_elements; i++) {
25+
data[i] = distrib_real(rng);
26+
}
27+
28+
// Add data to index
29+
for (int i = 0; i < max_elements; i++) {
30+
alg_hnsw->addPoint(data + i * dim, i);
31+
}
32+
33+
// Query the elements for themselves and measure recall
34+
float correct = 0;
35+
for (int i = 0; i < max_elements; i++) {
36+
std::priority_queue<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnn(data + i * dim, 1);
37+
hnswlib::labeltype label = result.top().second;
38+
if (label == i) correct++;
39+
}
40+
float recall = correct / max_elements;
41+
std::cout << "Recall: " << recall << "\n";
42+
43+
// Serialize index
44+
std::string hnsw_path = "hnsw.bin";
45+
alg_hnsw->saveIndex(hnsw_path);
46+
delete alg_hnsw;
47+
48+
// Deserialize index and check recall
49+
alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, hnsw_path);
50+
correct = 0;
51+
for (int i = 0; i < max_elements; i++) {
52+
std::priority_queue<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnn(data + i * dim, 1);
53+
hnswlib::labeltype label = result.top().second;
54+
if (label == i) correct++;
55+
}
56+
recall = (float)correct / max_elements;
57+
std::cout << "Recall of deserialized index: " << recall << "\n";
58+
59+
delete[] data;
60+
delete alg_hnsw;
61+
return 0;
62+
}
63+
```
64+
65+
An example of filtering with a boolean function during the search:
66+
```cpp
67+
#include "../../hnswlib/hnswlib.h"
68+
69+
70+
// Filter that allows labels divisible by divisor
71+
class PickDivisibleIds: public hnswlib::BaseFilterFunctor {
72+
unsigned int divisor = 1;
73+
public:
74+
PickDivisibleIds(unsigned int divisor): divisor(divisor) {
75+
assert(divisor != 0);
76+
}
77+
bool operator()(hnswlib::labeltype label_id) {
78+
return label_id % divisor == 0;
79+
}
80+
};
81+
82+
83+
int main() {
84+
int dim = 16; // Dimension of the elements
85+
int max_elements = 10000; // Maximum number of elements, should be known beforehand
86+
int M = 16; // Tightly connected with internal dimensionality of the data
87+
// strongly affects the memory consumption
88+
int ef_construction = 200; // Controls index search speed/build speed tradeoff
89+
90+
// Initing index
91+
hnswlib::L2Space space(dim);
92+
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction);
93+
94+
// Generate random data
95+
std::mt19937 rng;
96+
rng.seed(47);
97+
std::uniform_real_distribution<> distrib_real;
98+
float* data = new float[dim * max_elements];
99+
for (int i = 0; i < dim * max_elements; i++) {
100+
data[i] = distrib_real(rng);
101+
}
102+
103+
// Add data to index
104+
for (int i = 0; i < max_elements; i++) {
105+
alg_hnsw->addPoint(data + i * dim, i);
106+
}
107+
108+
// Create filter that allows only even labels
109+
PickDivisibleIds pickIdsDivisibleByTwo(2);
110+
111+
// Query the elements for themselves with filter and check returned labels
112+
int k = 10;
113+
for (int i = 0; i < max_elements; i++) {
114+
std::vector<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnnCloserFirst(data + i * dim, k, &pickIdsDivisibleByTwo);
115+
for (auto item: result) {
116+
if (item.second % 2 == 1) std::cout << "Error: found odd label\n";
117+
}
118+
}
119+
120+
delete[] data;
121+
delete alg_hnsw;
122+
return 0;
123+
}
124+
```
125+
126+
An example with reusing the memory of the deleted elements when new elements are being added (via `allow_replace_deleted` flag):
127+
```cpp
128+
#include "../../hnswlib/hnswlib.h"
129+
130+
131+
int main() {
132+
int dim = 16; // Dimension of the elements
133+
int max_elements = 10000; // Maximum number of elements, should be known beforehand
134+
int M = 16; // Tightly connected with internal dimensionality of the data
135+
// strongly affects the memory consumption
136+
int ef_construction = 200; // Controls index search speed/build speed tradeoff
137+
138+
// Initing index
139+
hnswlib::L2Space space(dim);
140+
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction, 100, true);
141+
142+
// Generate random data
143+
std::mt19937 rng;
144+
rng.seed(47);
145+
std::uniform_real_distribution<> distrib_real;
146+
float* data = new float[dim * max_elements];
147+
for (int i = 0; i < dim * max_elements; i++) {
148+
data[i] = distrib_real(rng);
149+
}
150+
151+
// Add data to index
152+
for (int i = 0; i < max_elements; i++) {
153+
alg_hnsw->addPoint(data + i * dim, i);
154+
}
155+
156+
// Mark first half of elements as deleted
157+
int num_deleted = max_elements / 2;
158+
for (int i = 0; i < num_deleted; i++) {
159+
alg_hnsw->markDelete(i);
160+
}
161+
162+
float* add_data = new float[dim * num_deleted];
163+
for (int i = 0; i < dim * num_deleted; i++) {
164+
add_data[i] = distrib_real(rng);
165+
}
166+
167+
// Replace deleted data with new elements
168+
// Maximum number of elements is reached therefore we cannot add new items,
169+
// but we can replace the deleted ones by using replace_deleted=true
170+
for (int i = 0; i < num_deleted; i++) {
171+
int label = max_elements + i;
172+
alg_hnsw->addPoint(add_data + i * dim, label, true);
173+
}
174+
175+
delete[] data;
176+
delete[] add_data;
177+
delete alg_hnsw;
178+
return 0;
179+
}
180+
```
181+
182+
Multithreaded examples:
183+
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
184+
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
185+
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)

examples/cpp/example_filter.cpp

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#include "../../hnswlib/hnswlib.h"
2+
3+
4+
// Filter that allows labels divisible by divisor
5+
class PickDivisibleIds: public hnswlib::BaseFilterFunctor {
6+
unsigned int divisor = 1;
7+
public:
8+
PickDivisibleIds(unsigned int divisor): divisor(divisor) {
9+
assert(divisor != 0);
10+
}
11+
bool operator()(hnswlib::labeltype label_id) {
12+
return label_id % divisor == 0;
13+
}
14+
};
15+
16+
17+
int main() {
18+
int dim = 16; // Dimension of the elements
19+
int max_elements = 10000; // Maximum number of elements, should be known beforehand
20+
int M = 16; // Tightly connected with internal dimensionality of the data
21+
// strongly affects the memory consumption
22+
int ef_construction = 200; // Controls index search speed/build speed tradeoff
23+
24+
// Initing index
25+
hnswlib::L2Space space(dim);
26+
hnswlib::HierarchicalNSW<float>* alg_hnsw = new hnswlib::HierarchicalNSW<float>(&space, max_elements, M, ef_construction);
27+
28+
// Generate random data
29+
std::mt19937 rng;
30+
rng.seed(47);
31+
std::uniform_real_distribution<> distrib_real;
32+
float* data = new float[dim * max_elements];
33+
for (int i = 0; i < dim * max_elements; i++) {
34+
data[i] = distrib_real(rng);
35+
}
36+
37+
// Add data to index
38+
for (int i = 0; i < max_elements; i++) {
39+
alg_hnsw->addPoint(data + i * dim, i);
40+
}
41+
42+
// Create filter that allows only even labels
43+
PickDivisibleIds pickIdsDivisibleByTwo(2);
44+
45+
// Query the elements for themselves with filter and check returned labels
46+
int k = 10;
47+
for (int i = 0; i < max_elements; i++) {
48+
std::vector<std::pair<float, hnswlib::labeltype>> result = alg_hnsw->searchKnnCloserFirst(data + i * dim, k, &pickIdsDivisibleByTwo);
49+
for (auto item: result) {
50+
if (item.second % 2 == 1) std::cout << "Error: found odd label\n";
51+
}
52+
}
53+
54+
delete[] data;
55+
delete alg_hnsw;
56+
return 0;
57+
}

0 commit comments

Comments
 (0)