Skip to content

Commit

Permalink
Merge pull request #6 from microsoft/tsv_to_bin
Browse files Browse the repository at this point in the history
added tsv to bin file convertor
  • Loading branch information
harsha-simhadri authored May 15, 2021
2 parents b7fdf37 + 77cd029 commit a489f59
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 24 deletions.
4 changes: 1 addition & 3 deletions include/distance.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,7 @@ namespace diskann {
}
};

// Gopal. Slow implementations of the distance functions to get diskann to
// work in v14 machines that do not have AVX2 support. Performance here is not
// a concern, so we are using the simplest possible implementation.
// Slow implementations of the distance functions for machines without AVX2
template<typename T>
class SlowDistanceL2Int : public Distance<T> {
virtual float compare(const T *a, const T *b, unsigned length) const {
Expand Down
7 changes: 7 additions & 0 deletions tests/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ if(MSVC)
target_link_libraries(ivecs_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
endif()

# Command-line utility target built from the single source file tsv_to_bin.cpp.
add_executable(tsv_to_bin tsv_to_bin.cpp)
# MSVC-only link settings: force an x64 link and link against diskann_dll.lib,
# taken from the debug or release library output directory depending on the
# build configuration.
if(MSVC)
target_link_options(tsv_to_bin PRIVATE /MACHINE:x64)
target_link_libraries(tsv_to_bin debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
target_link_libraries(tsv_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
endif()

add_executable(int8_to_float int8_to_float.cpp)
if(MSVC)
target_link_options(int8_to_float PRIVATE /MACHINE:x64)
Expand Down
27 changes: 6 additions & 21 deletions tests/utils/compute_groundtruth.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,7 @@ void exact_knn(const size_t dim, const size_t k,
const float *const queries) // queries in Col major
{
float *points_l2sq = new float[npoints];
// std::cout<<"jere"<<std::endl;
float *queries_l2sq = new float[nqueries];
// std::cout<<"jere "<<npoints<<" " <<dim << " " << nqueries <<std::endl;
compute_l2sq(points_l2sq, points, npoints, dim);
compute_l2sq(queries_l2sq, queries, nqueries, dim);

Expand Down Expand Up @@ -164,14 +162,6 @@ void exact_knn(const size_t dim, const size_t k,
assert(std::is_sorted(
dist_closest_points + (ptrdiff_t) q * (ptrdiff_t) k,
dist_closest_points + (ptrdiff_t)(q + 1) * (ptrdiff_t) k));
/*std::sort(point_dist.begin(), point_dist.end(),
[](const auto &l, const auto &r) {return l.second < r.second; });
for (int l = 0; l < k; ++l) {
closest_points[(ptrdiff_t)l + (ptrdiff_t)q * (ptrdiff_t)k] =
point_dist[l].first;
dist_closest_points[(ptrdiff_t)l + (ptrdiff_t)q * (ptrdiff_t)k] =
point_dist[l].second;
}*/
}
std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e
<< ")" << std::endl;
Expand Down Expand Up @@ -219,12 +209,10 @@ inline void load_bin_as_float(const char *filename, float *&data, size_t &npts,

reader.seekg(start_id * ndims * sizeof(T) + 2 * sizeof(uint32_t),
std::ios::beg);
// data = new T[nptsuint64_t * ndimsuint64_t];
T *data_T = new T[nptsuint64_t * ndimsuint64_t];
reader.read((char *) data_T, sizeof(T) * nptsuint64_t * ndimsuint64_t);
std::cout << "Finished reading part of the bin file." << std::endl;
reader.close();
// data = (nptsuint64_t*ndimsuint64_t, ALIGNMENT);
data = aligned_malloc<float>(nptsuint64_t * ndimsuint64_t, ALIGNMENT);
#pragma omp parallel for schedule(dynamic, 32768)
for (int64_t i = 0; i < (int64_t) nptsuint64_t; i++) {
Expand Down Expand Up @@ -278,11 +266,7 @@ inline void save_groundtruth_as_one_file(const std::string filename,

template<typename T>
int aux_main(int argv, char **argc) {
if (argv != 6) {
command_line_help();
return -1;
}


size_t npoints, nqueries, dim;
std::string base_file(argc[2]);
std::string query_file(argc[3]);
Expand Down Expand Up @@ -331,10 +315,6 @@ int aux_main(int argv, char **argc) {
}
}

// save_bin<int>(gt_file + std::string("_ids.bin"), closest_points, nqueries,
// k);
// save_bin<float>(gt_file + std::string("_dist.bin"), dist_closest_points,
// nqueries, k);
save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points,
nqueries, k);
diskann::aligned_free(query_data);
Expand All @@ -344,6 +324,11 @@ int aux_main(int argv, char **argc) {
}

int main(int argc, char **argv) {
if (argc != 6) {
command_line_help();
return -1;
}

if (std::string(argv[1]) == std::string("float"))
aux_main<float>(argc, argv);
if (std::string(argv[1]) == std::string("int8"))
Expand Down
72 changes: 72 additions & 0 deletions tests/utils/tsv_to_bin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>

#include "utils.h"

template<class T>
void block_convert(std::ifstream& reader, std::ofstream& writer, _u64 npts,
_u64 ndims) {
auto read_buf = new T[4 * npts * (ndims + 1)];

auto cursor = read_buf;
T val;

for (_u64 i = 0; i < npts; i++) {
for (_u64 d = 0; d < ndims; ++d) {
reader >> val;
*cursor = val;
cursor++;
}
}
writer.write((char*) read_buf, npts * ndims * sizeof(T));
delete[] read_buf;
}

int main(int argc, char** argv) {
if (argc != 6) {
std::cout << argv[0]
<< "<float/int8/uint8> input_filename.tsv output_filename.bin dim num_pts>"
<< std::endl;
exit(-1);
}

if (std::string(argv[1]) != std::string("float") &&
std::string(argv[1]) != std::string("int8") &&
std::string(argv[1]) != std::string("uint8")) {
std::cout << "Unsupported type. float, int8 and uint8 types are supported."
<< std::endl;
}

_u64 ndims = atoi(argv[4]);
_u64 npts = atoi(argv[5]);

std::ifstream reader(argv[2], std::ios::binary | std::ios::ate);
// _u64 fsize = reader.tellg();
reader.seekg(0, std::ios::beg);
reader.seekg(0, std::ios::beg);

_u64 blk_size = 131072;
_u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
std::cout << "# blks: " << nblks << std::endl;
std::ofstream writer(argv[3], std::ios::binary);
auto npts_s32 = (_u32) npts;
auto ndims_s32 = (_u32) ndims;
writer.write((char*) &npts_s32, sizeof(_u32));
writer.write((char*) &ndims_s32, sizeof(_u32));

for (_u64 i = 0; i < nblks; i++) {
_u64 cblk_size = std::min(npts - i * blk_size, blk_size);
if (std::string(argv[1]) == std::string("float")) {
block_convert<float>(reader, writer, cblk_size, ndims);
} else if (std::string(argv[1]) == std::string("int8")) {
block_convert<int8_t>(reader, writer, cblk_size, ndims);
} else if (std::string(argv[1]) == std::string("uint8")) {
block_convert<uint8_t>(reader, writer, cblk_size, ndims);
}
std::cout << "Block #" << i << " written" << std::endl;
}

reader.close();
writer.close();
}

0 comments on commit a489f59

Please sign in to comment.