Merge pull request #6 from microsoft/tsv_to_bin

added tsv to bin file convertor
microsoft · May 15, 2021 · a489f59 · a489f59
2 parents b7fdf37 + 77cd029
commit a489f59
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 24 deletions.
diff --git a/include/distance.h b/include/distance.h
@@ -226,9 +226,7 @@ namespace diskann {
     }
   };
 
-  // Gopal. Slow implementations of the distance functions to get diskann to
-  // work in v14 machines that do not have AVX2 support. Performance here is not
-  // a concern, so we are using the simplest possible implementation.
+  //  Slow implementations of the distance functions for machines without AVX2
   template<typename T>
   class SlowDistanceL2Int : public Distance<T> {
     virtual float compare(const T *a, const T *b, unsigned length) const {

diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
@@ -17,6 +17,13 @@ if(MSVC)
 	target_link_libraries(ivecs_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
 endif()
 
+add_executable(tsv_to_bin tsv_to_bin.cpp)
+if(MSVC)
+	target_link_options(tsv_to_bin PRIVATE /MACHINE:x64)
+	target_link_libraries(tsv_to_bin debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
+	target_link_libraries(tsv_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
+endif()
+
 add_executable(int8_to_float int8_to_float.cpp)
 if(MSVC)
 	target_link_options(int8_to_float PRIVATE /MACHINE:x64)

diff --git a/tests/utils/compute_groundtruth.cpp b/tests/utils/compute_groundtruth.cpp
@@ -116,9 +116,7 @@ void exact_knn(const size_t dim, const size_t k,
                const float *const queries)  // queries in Col major
 {
   float *points_l2sq = new float[npoints];
-  //	std::cout<<"jere"<<std::endl;
   float *queries_l2sq = new float[nqueries];
-  //	std::cout<<"jere "<<npoints<<" " <<dim << " " << nqueries <<std::endl;
   compute_l2sq(points_l2sq, points, npoints, dim);
   compute_l2sq(queries_l2sq, queries, nqueries, dim);
 
@@ -164,14 +162,6 @@ void exact_knn(const size_t dim, const size_t k,
       assert(std::is_sorted(
           dist_closest_points + (ptrdiff_t) q * (ptrdiff_t) k,
           dist_closest_points + (ptrdiff_t)(q + 1) * (ptrdiff_t) k));
-      /*std::sort(point_dist.begin(), point_dist.end(),
-          [](const auto &l, const auto &r) {return l.second < r.second; });
-      for (int l = 0; l < k; ++l) {
-          closest_points[(ptrdiff_t)l + (ptrdiff_t)q * (ptrdiff_t)k] =
-      point_dist[l].first;
-          dist_closest_points[(ptrdiff_t)l + (ptrdiff_t)q * (ptrdiff_t)k] =
-      point_dist[l].second;
-      }*/
     }
     std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e
               << ")" << std::endl;
@@ -219,12 +209,10 @@ inline void load_bin_as_float(const char *filename, float *&data, size_t &npts,
 
   reader.seekg(start_id * ndims * sizeof(T) + 2 * sizeof(uint32_t),
                std::ios::beg);
-  //    data = new T[nptsuint64_t * ndimsuint64_t];
   T *data_T = new T[nptsuint64_t * ndimsuint64_t];
   reader.read((char *) data_T, sizeof(T) * nptsuint64_t * ndimsuint64_t);
   std::cout << "Finished reading part of the bin file." << std::endl;
   reader.close();
-  //  data =  (nptsuint64_t*ndimsuint64_t, ALIGNMENT);
   data = aligned_malloc<float>(nptsuint64_t * ndimsuint64_t, ALIGNMENT);
 #pragma omp parallel for schedule(dynamic, 32768)
   for (int64_t i = 0; i < (int64_t) nptsuint64_t; i++) {
@@ -278,11 +266,7 @@ inline void save_groundtruth_as_one_file(const std::string filename,
 
 template<typename T>
 int aux_main(int argv, char **argc) {
-  if (argv != 6) {
-    command_line_help();
-    return -1;
-  }
-
+
   size_t      npoints, nqueries, dim;
   std::string base_file(argc[2]);
   std::string query_file(argc[3]);
@@ -331,10 +315,6 @@ int aux_main(int argv, char **argc) {
     }
   }
 
-  //  save_bin<int>(gt_file + std::string("_ids.bin"), closest_points, nqueries,
-  //  k);
-  //  save_bin<float>(gt_file + std::string("_dist.bin"), dist_closest_points,
-  //                  nqueries, k);
   save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points,
                                nqueries, k);
   diskann::aligned_free(query_data);
@@ -344,6 +324,11 @@ int aux_main(int argv, char **argc) {
 }
 
 int main(int argc, char **argv) {
+  if (argc != 6) {
+    command_line_help();
+    return -1;
+  }
+
   if (std::string(argv[1]) == std::string("float"))
     aux_main<float>(argc, argv);
   if (std::string(argv[1]) == std::string("int8"))

diff --git a/tests/utils/tsv_to_bin.cpp b/tests/utils/tsv_to_bin.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <iostream>
+#include "utils.h"
+
+// Parses `npts * ndims` whitespace-separated values from `reader` and appends
+// them to `writer` as raw values of type T, in the order they were read.
+//
+// reader: input text stream positioned at the first value to convert.
+// writer: output stream that receives npts * ndims * sizeof(T) bytes.
+// npts:   number of points (rows) in this block.
+// ndims:  number of values per point.
+//
+// Prints an error and exits with -1 if a value cannot be parsed (for example
+// when the input holds fewer values than requested).
+template<class T>
+void block_convert(std::ifstream& reader, std::ofstream& writer, _u64 npts,
+                   _u64 ndims) {
+  const _u64 nvals = npts * ndims;
+  auto       read_buf = new T[nvals];
+
+  for (_u64 i = 0; i < nvals; i++) {
+    if (sizeof(T) == 1) {
+      // operator>> on int8_t/uint8_t extracts a single character rather than
+      // a number, so 8-bit values are parsed as int and then narrowed.
+      int tmp = 0;
+      reader >> tmp;
+      read_buf[i] = (T) tmp;
+    } else {
+      reader >> read_buf[i];
+    }
+    if (reader.fail()) {
+      std::cerr << "Failed to parse value " << i << " of " << nvals
+                << " in the current block." << std::endl;
+      delete[] read_buf;
+      exit(-1);
+    }
+  }
+  writer.write((char*) read_buf, nvals * sizeof(T));
+  delete[] read_buf;
+}
+
+// Converts a text file of whitespace-separated values into a binary file.
+//
+// Arguments: <float/int8/uint8> input_filename.tsv output_filename.bin dim
+// num_pts. The output starts with two 32-bit integers (num_pts, then dim)
+// followed by num_pts * dim values of the requested type. The input is
+// converted in blocks of at most 131072 points.
+int main(int argc, char** argv) {
+  if (argc != 6) {
+    std::cout << argv[0]
+              << " <float/int8/uint8> input_filename.tsv output_filename.bin "
+                 "dim num_pts"
+              << std::endl;
+    exit(-1);
+  }
+
+  const std::string type_name(argv[1]);
+  if (type_name != "float" && type_name != "int8" && type_name != "uint8") {
+    std::cout << "Unsupported type. float, int8 and uint8 types are supported."
+              << std::endl;
+    // Stop here: continuing would write a header with no data behind it.
+    exit(-1);
+  }
+
+  // atoi yields 0 for non-numeric text, and a negative value would wrap when
+  // widened to _u64, so validate while the values are still signed.
+  const int ndims_arg = atoi(argv[4]);
+  const int npts_arg = atoi(argv[5]);
+  if (ndims_arg <= 0 || npts_arg <= 0) {
+    std::cout << "dim and num_pts must be positive integers." << std::endl;
+    exit(-1);
+  }
+  _u64 ndims = (_u64) ndims_arg;
+  _u64 npts = (_u64) npts_arg;
+
+  std::ifstream reader(argv[2], std::ios::binary);
+  if (!reader.is_open()) {
+    std::cout << "Could not open input file " << argv[2] << std::endl;
+    exit(-1);
+  }
+
+  _u64 blk_size = 131072;
+  _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+  std::cout << "# blks: " << nblks << std::endl;
+
+  std::ofstream writer(argv[3], std::ios::binary);
+  if (!writer.is_open()) {
+    std::cout << "Could not open output file " << argv[3] << std::endl;
+    exit(-1);
+  }
+
+  // Header: point count followed by dimension, each as a 32-bit integer.
+  auto npts_s32 = (_u32) npts;
+  auto ndims_s32 = (_u32) ndims;
+  writer.write((char*) &npts_s32, sizeof(_u32));
+  writer.write((char*) &ndims_s32, sizeof(_u32));
+
+  for (_u64 i = 0; i < nblks; i++) {
+    // The last block may hold fewer than blk_size points.
+    _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    if (type_name == "float") {
+      block_convert<float>(reader, writer, cblk_size, ndims);
+    } else if (type_name == "int8") {
+      block_convert<int8_t>(reader, writer, cblk_size, ndims);
+    } else {
+      // Only "uint8" remains after the type check above.
+      block_convert<uint8_t>(reader, writer, cblk_size, ndims);
+    }
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+
+  reader.close();
+  writer.close();
+  return 0;
+}