Skip to content

Commit 359b2ba

Browse files
authored
Merge pull request #436 from nmslib/develop
Merge 0.7.0 into master
2 parents 443d667 + dd1bdb7 commit 359b2ba

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+4811
-2664
lines changed

.github/workflows/build.yml

+59-5
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,74 @@ name: HNSW CI
33
on: [push, pull_request]
44

55
jobs:
6-
test:
6+
test_python:
77
runs-on: ${{matrix.os}}
88
strategy:
99
matrix:
1010
os: [ubuntu-latest, windows-latest]
11-
python-version: ['3.6', '3.7', '3.8', '3.9']
11+
python-version: ["3.7", "3.8", "3.9", "3.10"]
1212
steps:
13-
- uses: actions/checkout@v2
14-
- uses: actions/setup-python@v2
13+
- uses: actions/checkout@v3
14+
- uses: actions/setup-python@v4
1515
with:
1616
python-version: ${{ matrix.python-version }}
1717

1818
- name: Build and install
1919
run: python -m pip install .
2020

2121
- name: Test
22-
run: python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
22+
timeout-minutes: 15
23+
run: |
24+
python -m unittest discover -v --start-directory examples/python --pattern "example*.py"
25+
python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"
26+
27+
test_cpp:
28+
runs-on: ${{matrix.os}}
29+
strategy:
30+
matrix:
31+
os: [ubuntu-latest, windows-latest]
32+
steps:
33+
- uses: actions/checkout@v3
34+
- uses: actions/setup-python@v4
35+
with:
36+
python-version: "3.10"
37+
38+
- name: Build
39+
run: |
40+
mkdir build
41+
cd build
42+
cmake ..
43+
if [ "$RUNNER_OS" == "Linux" ]; then
44+
make
45+
elif [ "$RUNNER_OS" == "Windows" ]; then
46+
cmake --build ./ --config Release
47+
fi
48+
shell: bash
49+
50+
- name: Prepare test data
51+
run: |
52+
pip install numpy
53+
cd tests/cpp/
54+
python update_gen_data.py
55+
shell: bash
56+
57+
- name: Test
58+
timeout-minutes: 15
59+
run: |
60+
cd build
61+
if [ "$RUNNER_OS" == "Windows" ]; then
62+
cp ./Release/* ./
63+
fi
64+
./example_search
65+
./example_filter
66+
./example_replace_deleted
67+
./example_mt_search
68+
./example_mt_filter
69+
./example_mt_replace_deleted
70+
./searchKnnCloserFirst_test
71+
./searchKnnWithFilter_test
72+
./multiThreadLoad_test
73+
./multiThread_replace_test
74+
./test_updates
75+
./test_updates update
76+
shell: bash

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ hnswlib.cpython*.so
88
var/
99
.idea/
1010
.vscode/
11-
11+
.vs/
12+
**.DS_Store

ALGO_PARAMS.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,5 @@ ef_construction leads to longer construction, but better index quality. At some
2727
not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall
2828
for M nearest neighbor search when ```ef``` =```ef_construction```: if the recall is lower than 0.9, then there is room
2929
for improvement.
30-
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extened by saving/loading(load_index
30+
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extended by saving/loading (load_index
3131
function has a parameter which defines the new maximum number of elements).

CMakeLists.txt

+32-3
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,41 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
1616
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
1717
endif()
1818

19-
add_executable(test_updates examples/updates_test.cpp)
19+
# examples
20+
add_executable(example_search examples/cpp/example_search.cpp)
21+
target_link_libraries(example_search hnswlib)
22+
23+
add_executable(example_filter examples/cpp/example_filter.cpp)
24+
target_link_libraries(example_filter hnswlib)
25+
26+
add_executable(example_replace_deleted examples/cpp/example_replace_deleted.cpp)
27+
target_link_libraries(example_replace_deleted hnswlib)
28+
29+
add_executable(example_mt_search examples/cpp/example_mt_search.cpp)
30+
target_link_libraries(example_mt_search hnswlib)
31+
32+
add_executable(example_mt_filter examples/cpp/example_mt_filter.cpp)
33+
target_link_libraries(example_mt_filter hnswlib)
34+
35+
add_executable(example_mt_replace_deleted examples/cpp/example_mt_replace_deleted.cpp)
36+
target_link_libraries(example_mt_replace_deleted hnswlib)
37+
38+
# tests
39+
add_executable(test_updates tests/cpp/updates_test.cpp)
2040
target_link_libraries(test_updates hnswlib)
2141

22-
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
42+
add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
2343
target_link_libraries(searchKnnCloserFirst_test hnswlib)
2444

25-
add_executable(main main.cpp sift_1b.cpp)
45+
add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
46+
target_link_libraries(searchKnnWithFilter_test hnswlib)
47+
48+
add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
49+
target_link_libraries(multiThreadLoad_test hnswlib)
50+
51+
add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
52+
target_link_libraries(multiThread_replace_test hnswlib)
53+
54+
add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
2655
target_link_libraries(main hnswlib)
2756
endif()

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ dist:
77
python3 -m build --sdist
88

99
test:
10-
python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
10+
python3 -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
1111

1212
clean:
1313
rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so

README.md

+45-37
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,22 @@
11
# Hnswlib - fast approximate nearest neighbor search
2-
Header-only C++ HNSW implementation with python bindings.
2+
Header-only C++ HNSW implementation with python bindings, insertions and updates.
33

44
**NEWS:**
55

6+
**version 0.7.0**
67

7-
**version 0.6.2**
8-
9-
* Fixed a bug in saving of large pickles. The pickles with > 4GB could have been corrupted. Thanks Kai Wohlfahrt for reporting.
10-
* Thanks to ([@GuyAv46](https://github.com/GuyAv46)) hnswlib inner product now is more consitent accross architectures (SSE, AVX, etc).
11-
*
12-
13-
**version 0.6.1**
14-
15-
* Thanks to ([@tony-kuo](https://github.com/tony-kuo)) hnswlib AVX512 and AVX builds are not backwards-compatible with older SSE and non-AVX512 architectures.
16-
* Thanks to ([@psobot](https://github.com/psobot)) there is now a sencible message instead of segfault when passing a scalar to get_items.
17-
* Thanks to ([@urigoren](https://github.com/urigoren)) hnswlib has a lazy index creation python wrapper.
18-
19-
**version 0.6.0**
20-
* Thanks to ([@dyashuni](https://github.com/dyashuni)) hnswlib now uses github actions for CI, there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also a part of the python interface (note now it throws an exception for double deletions).
21-
* Thanks to ([@slice4e](https://github.com/slice4e)) we now support AVX512; thanks to ([@LTLA](https://github.com/LTLA)) the cmake interface for the lib is now updated.
22-
* Thanks to ([@alonre24](https://github.com/alonre24)) we now have a python bindings for brute-force (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md).
23-
* Thanks to ([@dorosy-yeong](https://github.com/dorosy-yeong)) there is a bug fixed in the handling large quantities of deleted elements and large K.
24-
25-
8+
* Added support for filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
9+
* Added python interface for filtering (though note its performance is limited by GIL) (#417) by [@gtsoukas](https://github.com/gtsoukas)
10+
* Added support for replacing the elements that were marked as deleted with newly inserted elements (to control the size of the index, #418) by [@dyashuni](https://github.com/dyashuni)
11+
* Fixed data races/deadlocks in updates/insertion, added stress test for multithreaded operation (#418) by [@dyashuni](https://github.com/dyashuni)
12+
* Documentation, tests, exception handling, refactoring (#375, #379, #380, #395, #396, #401, #406, #404, #409, #410, #416, #415, #431, #432, #433) by [@jlmelville](https://github.com/jlmelville), [@dyashuni](https://github.com/dyashuni), [@kishorenc](https://github.com/kishorenc), [@korzhenevski](https://github.com/korzhenevski), [@yoshoku](https://github.com/yoshoku), [@jianshu93](https://github.com/jianshu93), [@PLNech](https://github.com/PLNech)
13+
* global linkages (#383) by [@MasterAler](https://github.com/MasterAler), USE_SSE usage in MSVC (#408) by [@alxvth](https://github.com/alxvth)
2614

2715

2816
### Highlights:
2917
1) Lightweight, header-only, no dependencies other than C++ 11
30-
2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw).
31-
3) Has full support for incremental index construction. Has support for element deletions
18+
2) Interfaces for C++, Python, external support for Java and R (https://github.com/jlmelville/rcpphnsw).
19+
3) Has full support for incremental index construction and updating the elements. Has support for element deletions
3220
(by marking them in index). Index is picklable.
3321
4) Can work with custom user defined distances (C++).
3422
5) Significantly less memory footprint and faster build time compared to current nmslib's implementation.
@@ -50,37 +38,42 @@ Note that inner product is not an actual metric. An element can be closer to som
5038

5139
For other spaces use the nmslib library https://github.com/nmslib/nmslib.
5240

53-
#### Short API description
41+
#### API description
5442
* `hnswlib.Index(space, dim)` creates a non-initialized HNSW index in space `space` with integer dimension `dim`.
5543

5644
`hnswlib.Index` methods:
57-
* `init_index(max_elements, M = 16, ef_construction = 200, random_seed = 100)` initializes the index from with no elements.
45+
* `init_index(max_elements, M = 16, ef_construction = 200, random_seed = 100, allow_replace_deleted = False)` initializes the index with no elements.
5846
* `max_elements` defines the maximum number of elements that can be stored in the structure(can be increased/shrunk).
5947
* `ef_construction` defines a construction time/accuracy trade-off (see [ALGO_PARAMS.md](ALGO_PARAMS.md)).
6048
* `M` defines the maximum number of outgoing connections in the graph ([ALGO_PARAMS.md](ALGO_PARAMS.md)).
49+
* `allow_replace_deleted` enables replacing of deleted elements with newly added ones.
6150

62-
* `add_items(data, ids, num_threads = -1)` - inserts the `data`(numpy array of vectors, shape:`N*dim`) into the structure.
51+
* `add_items(data, ids, num_threads = -1, replace_deleted = False)` - inserts the `data`(numpy array of vectors, shape:`N*dim`) into the structure.
6352
* `num_threads` sets the number of cpu threads to use (-1 means use default).
6453
* `ids` are optional N-size numpy array of integer labels for all elements in `data`.
6554
- If index already has the elements with the same labels, their features will be updated. Note that update procedure is slower than insertion of a new element, but more memory- and query-efficient.
55+
* `replace_deleted` replaces deleted elements with the newly added ones. Note that this saves memory.
56+
- to use it `init_index` should be called with `allow_replace_deleted=True`
6657
* Thread-safe with other `add_items` calls, but not with `knn_query`.
6758

6859
* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results. Throws an exception if it is already deleted.
69-
*
60+
7061
* `unmark_deleted(label)` - unmarks the element as deleted, so it will not be omitted from search results.
7162

7263
* `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`.
7364

7465
* `set_ef(ef)` - sets the query time accuracy/speed trade-off, defined by the `ef` parameter (
7566
[ALGO_PARAMS.md](ALGO_PARAMS.md)). Note that the parameter is currently not saved along with the index, so you need to set it manually after loading.
7667

77-
* `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closest elements for each element of the
68+
* `knn_query(data, k = 1, num_threads = -1, filter = None)` make a batch query for `k` closest elements for each element of the
7869
* `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`).
7970
* `num_threads` sets the number of cpu threads to use (-1 means use default).
71+
* `filter` filters elements by their labels and returns only elements with allowed ids. Note that search with a filter is slow in Python in multithreaded mode, so it is recommended to set `num_threads=1`.
8072
* Thread-safe with other `knn_query` calls, but not with `add_items`.
8173

82-
* `load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index.
74+
* `load_index(path_to_index, max_elements = 0, allow_replace_deleted = False)` loads the index from persistence to the uninitialized index.
8375
* `max_elements`(optional) resets the maximum number of elements in the structure.
76+
* `allow_replace_deleted` specifies whether the index being loaded has enabled replacing of deleted elements.
8477

8578
* `save_index(path_to_index)` saves the index to persistence.
8679

@@ -118,6 +111,12 @@ Properties of `hnswlib.Index` that support reading and writing:
118111
119112

120113
#### Python bindings examples
114+
[See more examples here](examples/python/EXAMPLES.md):
115+
* Creating index, inserting elements, searching, serialization/deserialization
116+
* Filtering during the search with a boolean function
117+
* Deleting the elements and reusing the memory of the deleted elements for newly added elements
118+
119+
An example of creating index, inserting elements, searching and pickle serialization:
121120
```python
122121
import hnswlib
123122
import numpy as np
@@ -142,7 +141,7 @@ p.add_items(data, ids)
142141
# Controlling the recall by setting ef:
143142
p.set_ef(50) # ef should always be > k
144143

145-
# Query dataset, k - number of closest elements (returns 2 numpy arrays)
144+
# Query dataset, k - number of the closest elements (returns 2 numpy arrays)
146145
labels, distances = p.knn_query(data, k = 1)
147146

148147
# Index objects support pickling
@@ -155,7 +154,6 @@ print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim
155154
print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}")
156155
print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}")
157156
print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}")
158-
159157
```
160158

161159
An example with updates after serialization/deserialization:
@@ -196,7 +194,6 @@ p.set_ef(10)
196194
# By default using all available cores
197195
p.set_num_threads(4)
198196

199-
200197
print("Adding first batch of %d elements" % (len(data1)))
201198
p.add_items(data1)
202199

@@ -226,6 +223,14 @@ labels, distances = p.knn_query(data, k=1)
226223
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
227224
```
228225

226+
#### C++ examples
227+
[See examples here](examples/cpp/EXAMPLES.md):
228+
* creating index, inserting elements, searching, serialization/deserialization
229+
* filtering during the search with a boolean function
230+
* deleting the elements and reusing the memory of the deleted elements for newly added elements
231+
* multithreaded usage
232+
233+
229234
### Bindings installation
230235

231236
You can install from sources:
@@ -245,9 +250,9 @@ Contributions are highly welcome!
245250

246251
Please make pull requests against the `develop` branch.
247252

248-
When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
253+
When making changes please run tests (and please add a test to `tests/python` in case there is new functionality):
249254
```bash
250-
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py
255+
python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
251256
```
252257

253258

@@ -259,20 +264,23 @@ https://github.com/facebookresearch/faiss
259264
["Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors"](https://arxiv.org/abs/1802.02422)
260265
(current state-of-the-art in compressed indexes, C++):
261266
https://github.com/dbaranchuk/ivf-hnsw
267+
* Amazon PECOS https://github.com/amzn/pecos
262268
* TOROS N2 (python, C++): https://github.com/kakao/n2
263269
* Online HNSW (C++): https://github.com/andrusha97/online-hnsw
264270
* Go implementation: https://github.com/Bithack/go-hnsw
265271
* Python implementation (as a part of the clustering code by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering
272+
* Julia implementation: https://github.com/JuliaNeighbors/HNSW.jl
266273
* Java implementation: https://github.com/jelmerk/hnswlib
267274
* Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna
268-
* .Net implementation: https://github.com/microsoft/HNSW.Net
275+
* .Net implementation: https://github.com/curiosity-ai/hnsw-sharp
269276
* CUDA implementation: https://github.com/js1010/cuhnsw
270-
277+
* Rust implementation https://github.com/rust-cv/hnsw
278+
* Rust implementation focused on memory and thread safety, with a trait that lets users implement their own distances. It takes as data slices of types T satisfying T:Serialize+Clone+Send+Sync: https://github.com/jean-pierreBoth/hnswlib-rs
271279

272280
### 200M SIFT test reproduction
273281
To download and extract the bigann dataset (from root directory):
274282
```bash
275-
python3 download_bigann.py
283+
python tests/cpp/download_bigann.py
276284
```
277285
To compile:
278286
```bash
@@ -292,7 +300,7 @@ The size of the BigANN subset (in millions) is controlled by the variable **subs
292300
### Updates test
293301
To generate testing data (from root directory):
294302
```bash
295-
cd examples
303+
cd tests/cpp
296304
python update_gen_data.py
297305
```
298306
To compile (from root directory):

TESTING_RECALL.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ max_elements defines the maximum number of elements that can be stored in the st
2727

2828
### measuring recall example
2929

30-
```
30+
```python
3131
import hnswlib
3232
import numpy as np
3333

0 commit comments

Comments
 (0)