Skip to content

Commit 3f34296

Browse files
authored
Merge pull request #523 from nmslib/develop
Release v0.8.0
2 parents 359b2ba + 5a8fd34 commit 3f34296

23 files changed

+1190
-90
lines changed

.github/workflows/build.yml

+9-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ jobs:
77
runs-on: ${{matrix.os}}
88
strategy:
99
matrix:
10-
os: [ubuntu-latest, windows-latest]
10+
os: [ubuntu-latest, windows-latest, macos-latest]
1111
python-version: ["3.7", "3.8", "3.9", "3.10"]
1212
steps:
1313
- uses: actions/checkout@v3
@@ -28,7 +28,7 @@ jobs:
2828
runs-on: ${{matrix.os}}
2929
strategy:
3030
matrix:
31-
os: [ubuntu-latest, windows-latest]
31+
os: [ubuntu-latest, windows-latest, macos-latest]
3232
steps:
3333
- uses: actions/checkout@v3
3434
- uses: actions/setup-python@v4
@@ -40,10 +40,10 @@ jobs:
4040
mkdir build
4141
cd build
4242
cmake ..
43-
if [ "$RUNNER_OS" == "Linux" ]; then
44-
make
45-
elif [ "$RUNNER_OS" == "Windows" ]; then
43+
if [ "$RUNNER_OS" == "Windows" ]; then
4644
cmake --build ./ --config Release
45+
else
46+
make
4747
fi
4848
shell: bash
4949

@@ -67,10 +67,14 @@ jobs:
6767
./example_mt_search
6868
./example_mt_filter
6969
./example_mt_replace_deleted
70+
./example_multivector_search
71+
./example_epsilon_search
7072
./searchKnnCloserFirst_test
7173
./searchKnnWithFilter_test
7274
./multiThreadLoad_test
7375
./multiThread_replace_test
7476
./test_updates
7577
./test_updates update
78+
./multivector_search_test
79+
./epsilon_search_test
7680
shell: bash

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ var/
1010
.vscode/
1111
.vs/
1212
**.DS_Store
13+
*.pyc

CMakeLists.txt

+56-7
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,68 @@
1-
cmake_minimum_required (VERSION 2.6)
2-
project(hnsw_lib
1+
cmake_minimum_required(VERSION 3.0...3.26)
2+
3+
project(hnswlib
34
LANGUAGES CXX)
45

6+
include(GNUInstallDirs)
7+
include(CheckCXXCompilerFlag)
8+
59
add_library(hnswlib INTERFACE)
6-
target_include_directories(hnswlib INTERFACE .)
10+
add_library(hnswlib::hnswlib ALIAS hnswlib)
11+
12+
# Include path published to consumers of the header-only `hnswlib` target:
# the source directory when used from the build tree, and the install-time
# include directory when used from an installed package.
target_include_directories(hnswlib INTERFACE
13+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
14+
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
15+
16+
# Install
# Copy the `hnswlib/` header directory into the install include directory
# (CMAKE_INSTALL_INCLUDEDIR comes from GNUInstallDirs).
17+
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
18+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
19+
20+
# Register the target in the `hnswlibTargets` export set so it can be
# written out by the install(EXPORT ...) rule below.
install(TARGETS hnswlib
21+
EXPORT hnswlibTargets)
22+
23+
# Write the export set as hnswlibConfig.cmake; exported targets are prefixed
# with `hnswlib::`, matching the in-tree alias `hnswlib::hnswlib`.
install(EXPORT hnswlibTargets
24+
FILE hnswlibConfig.cmake
25+
NAMESPACE hnswlib::
26+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)
727

28+
# Examples and tests
829
# HNSWLIB_EXAMPLES: boolean option controlling whether examples and tests are
# built. It defaults to ON only when hnswlib is the top-level project
# (CMAKE_PROJECT_NAME equals PROJECT_NAME) and to OFF otherwise. Either default
# can be overridden with -DHNSWLIB_EXAMPLES=ON|OFF.
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
30+
option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
31+
else()
32+
option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
33+
endif()
34+
if(HNSWLIB_EXAMPLES)
935
set(CMAKE_CXX_STANDARD 11)
1036

11-
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
12-
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
37+
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
38+
SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
39+
check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
40+
if(COMPILER_SUPPORT_NATIVE_FLAG)
41+
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
42+
message("set -march=native flag")
43+
else()
44+
check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
45+
if(COMPILER_SUPPORT_M1_FLAG)
46+
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
47+
message("set -mcpu=apple-m1 flag")
48+
endif()
49+
endif()
1350
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
14-
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
51+
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
1552
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
16-
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
53+
SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
1754
endif()
1855

1956
# examples
2057
add_executable(example_search examples/cpp/example_search.cpp)
2158
target_link_libraries(example_search hnswlib)
2259

60+
add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
61+
target_link_libraries(example_epsilon_search hnswlib)
62+
63+
add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
64+
target_link_libraries(example_multivector_search hnswlib)
65+
2366
add_executable(example_filter examples/cpp/example_filter.cpp)
2467
target_link_libraries(example_filter hnswlib)
2568

@@ -36,6 +79,12 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
3679
target_link_libraries(example_mt_replace_deleted hnswlib)
3780

3881
# tests
82+
add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
83+
target_link_libraries(multivector_search_test hnswlib)
84+
85+
add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
86+
target_link_libraries(epsilon_search_test hnswlib)
87+
3988
add_executable(test_updates tests/cpp/updates_test.cpp)
4089
target_link_libraries(test_updates hnswlib)
4190

README.md

+12-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,15 @@ Header-only C++ HNSW implementation with python bindings, insertions and updates
33

44
**NEWS:**
55

6+
**version 0.8.0**
7+
8+
* Multi-vector document search and epsilon search (for now, only in C++)
9+
* By default, there is no statistic aggregation, which speeds up the multi-threaded search (it does not seem like people are using it anyway: [Issue #495](https://github.com/nmslib/hnswlib/issues/495)).
10+
* Various bugfixes and improvements
11+
* `get_items` now has a `return_type` parameter, which can be either 'numpy' or 'list'
12+
13+
Full list of changes: https://github.com/nmslib/hnswlib/pull/523
14+
615
**version 0.7.0**
716

817
* Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
@@ -79,7 +88,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
7988

8089
* `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying.
8190

82-
* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors.
91+
* `get_items(ids, return_type = 'numpy')` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). If `return_type` is `list`, returns a list of lists instead. Note that for cosine similarity it currently returns **normalized** vectors.
8392

8493
* `get_ids_list()` - returns a list of all elements' ids.
8594

@@ -229,6 +238,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
229238
* filtering during the search with a boolean function
230239
* deleting the elements and reusing the memory of the deleted elements for newly added elements
231240
* multithreaded usage
241+
* multivector search
242+
* epsilon search
232243

233244

234245
### Bindings installation

examples/cpp/EXAMPLES.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -182,4 +182,8 @@ int main() {
182182
Multithreaded examples:
183183
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
184184
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
185-
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
185+
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
186+
187+
More examples:
188+
* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp)
189+
* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp)
+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#include "../../hnswlib/hnswlib.h"

// Standard headers used directly by this example (previously these were only
// available through transitive includes of hnswlib.h).
#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>

typedef float dist_t;

// Epsilon-search example: builds an L2 index over random vectors and, for a
// few random queries, reports how many vectors the epsilon stop condition
// returns. Returns 0 on success.
int main() {
    const int dim = 16;                  // Dimension of the elements
    const int max_elements = 10000;      // Maximum number of elements, should be known beforehand
    const int M = 16;                    // Tightly connected with internal dimensionality of the data
                                         // strongly affects the memory consumption
    const int ef_construction = 200;     // Controls index search speed/build speed tradeoff
    const int min_num_candidates = 100;  // Minimum number of candidates to search in the epsilon region
                                         // this parameter is similar to ef

    const int num_queries = 5;
    const float epsilon2 = 2.0f;         // Squared distance to query

    // Initializing the index. unique_ptr releases it on every exit path,
    // including exceptions thrown by the library.
    hnswlib::L2Space space(dim);
    std::unique_ptr<hnswlib::HierarchicalNSW<dist_t>> alg_hnsw(
        new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction));

    // Generate random data
    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib_real;

    const size_t data_point_size = space.get_data_size();
    std::vector<char> data(data_point_size * max_elements);
    for (int i = 0; i < max_elements; i++) {
        char* point_data = data.data() + i * data_point_size;
        for (int j = 0; j < dim; j++) {
            // The distribution yields double; narrow explicitly and copy the
            // bytes instead of writing through a casted char pointer.
            const float value = static_cast<float>(distrib_real(rng));
            std::memcpy(point_data + j * sizeof(float), &value, sizeof(float));
        }
    }

    // Add data to index
    for (int i = 0; i < max_elements; i++) {
        const hnswlib::labeltype label = i;
        alg_hnsw->addPoint(data.data() + i * data_point_size, label);
    }

    // Query random vectors. The buffer is reused; every element is
    // overwritten before each search.
    std::vector<char> query_data(data_point_size);
    for (int i = 0; i < num_queries; i++) {
        for (int j = 0; j < dim; j++) {
            const float value = static_cast<float>(distrib_real(rng));
            std::memcpy(query_data.data() + j * sizeof(float), &value, sizeof(float));
        }
        std::cout << "Query #" << i << "\n";
        hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2, min_num_candidates, max_elements);
        std::vector<std::pair<float, hnswlib::labeltype>> result =
            alg_hnsw->searchStopConditionClosest(query_data.data(), stop_condition);
        const size_t num_vectors = result.size();
        std::cout << "Found " << num_vectors << " vectors\n";
    }

    return 0;
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#include "../../hnswlib/hnswlib.h"

// Standard headers used directly by this example (previously these were only
// available through transitive includes of hnswlib.h).
#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <unordered_map>
#include <utility>
#include <vector>

typedef unsigned int docidtype;
typedef float dist_t;

// Multi-vector search example: every stored vector carries a document id.
// For a few random queries, runs a search with the multi-vector stop
// condition and reports how many distinct documents and how many vectors
// were returned. Returns 0 on success.
int main() {
    const int dim = 16;               // Dimension of the elements
    const int max_elements = 10000;   // Maximum number of elements, should be known beforehand
    const int M = 16;                 // Tightly connected with internal dimensionality of the data
                                      // strongly affects the memory consumption
    const int ef_construction = 200;  // Controls index search speed/build speed tradeoff

    const int num_queries = 5;
    const int num_docs = 5;           // Number of documents to search
    const int ef_collection = 6;      // Number of candidate documents during the search
                                      // Controls the recall: higher ef leads to better accuracy, but slower search
    const docidtype min_doc_id = 0;
    const docidtype max_doc_id = 9;

    // Initializing the index. unique_ptr releases it on every exit path,
    // including exceptions thrown by the library.
    hnswlib::MultiVectorL2Space<docidtype> space(dim);
    std::unique_ptr<hnswlib::HierarchicalNSW<dist_t>> alg_hnsw(
        new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction));

    // Generate random data
    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib_real;
    std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);

    const size_t data_point_size = space.get_data_size();
    std::vector<char> data(data_point_size * max_elements);
    for (int i = 0; i < max_elements; i++) {
        // set vector value
        char* point_data = data.data() + i * data_point_size;
        for (int j = 0; j < dim; j++) {
            // The distribution yields double; narrow explicitly and copy the
            // bytes instead of writing through a casted char pointer.
            const float value = static_cast<float>(distrib_real(rng));
            std::memcpy(point_data + j * sizeof(float), &value, sizeof(float));
        }
        // set document id
        const docidtype doc_id = distrib_docid(rng);
        space.set_doc_id(point_data, doc_id);
    }

    // Add data to index, remembering which document each label belongs to
    std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
    for (int i = 0; i < max_elements; i++) {
        const hnswlib::labeltype label = i;
        char* point_data = data.data() + i * data_point_size;
        alg_hnsw->addPoint(point_data, label);
        label_docid_lookup[label] = space.get_doc_id(point_data);
    }

    // Query random vectors. A query holds only the vector components (no
    // document id). The buffer is reused; every element is overwritten
    // before each search.
    const size_t query_size = dim * sizeof(float);
    std::vector<char> query_data(query_size);
    for (int i = 0; i < num_queries; i++) {
        for (int j = 0; j < dim; j++) {
            const float value = static_cast<float>(distrib_real(rng));
            std::memcpy(query_data.data() + j * sizeof(float), &value, sizeof(float));
        }
        std::cout << "Query #" << i << "\n";
        hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
        std::vector<std::pair<float, hnswlib::labeltype>> result =
            alg_hnsw->searchStopConditionClosest(query_data.data(), stop_condition);
        const size_t num_vectors = result.size();

        // Count how many returned vectors belong to each document
        std::unordered_map<docidtype, size_t> doc_counter;
        for (const auto& pair : result) {
            const hnswlib::labeltype label = pair.second;
            const docidtype doc_id = label_docid_lookup[label];
            doc_counter[doc_id] += 1;
        }
        std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
    }

    return 0;
}

hnswlib/bruteforce.h

+10-4
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,16 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
8484

8585

8686
void removePoint(labeltype cur_external) {
87-
size_t cur_c = dict_external_to_internal[cur_external];
87+
std::unique_lock<std::mutex> lock(index_lock);
8888

89-
dict_external_to_internal.erase(cur_external);
89+
auto found = dict_external_to_internal.find(cur_external);
90+
if (found == dict_external_to_internal.end()) {
91+
return;
92+
}
93+
94+
dict_external_to_internal.erase(found);
9095

96+
size_t cur_c = found->second;
9197
labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
9298
dict_external_to_internal[label] = cur_c;
9399
memcpy(data_ + size_per_element_ * cur_c,
@@ -106,7 +112,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
106112
dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_);
107113
labeltype label = *((labeltype*) (data_ + size_per_element_ * i + data_size_));
108114
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
109-
topResults.push(std::pair<dist_t, labeltype>(dist, label));
115+
topResults.emplace(dist, label);
110116
}
111117
}
112118
dist_t lastdist = topResults.empty() ? std::numeric_limits<dist_t>::max() : topResults.top().first;
@@ -115,7 +121,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
115121
if (dist <= lastdist) {
116122
labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_));
117123
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
118-
topResults.push(std::pair<dist_t, labeltype>(dist, label));
124+
topResults.emplace(dist, label);
119125
}
120126
if (topResults.size() > k)
121127
topResults.pop();

0 commit comments

Comments
 (0)