@@ -615,7 +615,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr
615
615
double sampling_rate, double ram_budget, std::string mem_index_path,
616
616
std::string medoids_file, std::string centroids_file, size_t build_pq_bytes, bool use_opq,
617
617
uint32_t num_threads, bool use_filters, const std::string &label_file,
618
- const std::string &labels_to_medoids_file , const std::string &universal_label,
618
+ const std::string &disk_labels_to_medoids_file , const std::string &universal_label,
619
619
const uint32_t Lf)
620
620
{
621
621
size_t base_num, base_dim;
@@ -642,22 +642,22 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr
642
642
_index.build (base_file.c_str (), base_num);
643
643
else
644
644
{
645
- // if (universal_label != "")
646
- // { // indicates no universal label
647
- // LabelT unv_label_as_num = 0;
648
- // _index.set_universal_label(unv_label_as_num );
649
- // }
650
- _index.build_filtered_index (base_file.c_str (), label_file, universal_label, base_num);
645
+ if (universal_label != " " )
646
+ { // indicates no universal label
647
+ // LabelT unv_label_as_num = 0;
648
+ _index.set_universal_labels ({universal_label} );
649
+ }
650
+ _index.build_filtered_index (base_file.c_str (), label_file, base_num);
651
651
}
652
652
_index.save (mem_index_path.c_str ());
653
653
654
654
if (use_filters)
655
655
{
656
656
// need to copy the labels_to_medoids file to the specified input
657
657
// file
658
- std::remove (labels_to_medoids_file .c_str ());
658
+ std::remove (disk_labels_to_medoids_file .c_str ());
659
659
std::string mem_labels_to_medoid_file = mem_index_path + " _labels_to_medoids.txt" ;
660
- copy_file (mem_labels_to_medoid_file, labels_to_medoids_file );
660
+ copy_file (mem_labels_to_medoid_file, disk_labels_to_medoids_file );
661
661
std::remove (mem_labels_to_medoid_file.c_str ());
662
662
}
663
663
@@ -712,12 +712,12 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr
712
712
else
713
713
{
714
714
diskann::extract_shard_labels (label_file, shard_ids_file, shard_labels_file);
715
- // if (universal_label != "")
716
- // { // indicates no universal label
717
- // LabelT unv_label_as_num = 0;
718
- // _index.set_universal_label(unv_label_as_num );
719
- // }
720
- _index.build_filtered_index (shard_base_file.c_str (), shard_labels_file, universal_label, shard_base_pts);
715
+ if (universal_label != " " )
716
+ { // indicates no universal label
717
+ // LabelT unv_label_as_num = 0;
718
+ _index.set_universal_labels ({universal_label} );
719
+ }
720
+ _index.build_filtered_index (shard_base_file.c_str (), shard_labels_file, shard_base_pts);
721
721
}
722
722
_index.save (shard_index_file.c_str ());
723
723
// copy universal label file from first shard to the final destination
@@ -738,7 +738,7 @@ int build_merged_vamana_index(std::string base_file, diskann::Metric compareMetr
738
738
timer.reset ();
739
739
diskann::merge_shards (merged_index_prefix + " _subshard-" , " _mem.index" , merged_index_prefix + " _subshard-" ,
740
740
" _ids_uint32.bin" , num_parts, R, mem_index_path, medoids_file, use_filters,
741
- labels_to_medoids_file );
741
+ disk_labels_to_medoids_file );
742
742
diskann::cout << timer.elapsed_seconds_for_step (" merging indices" ) << std::endl;
743
743
744
744
// delete tempFiles
@@ -1159,14 +1159,16 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
1159
1159
std::string pq_compressed_vectors_path = index_prefix_path + " _pq_compressed.bin" ;
1160
1160
std::string mem_index_path = index_prefix_path + " _mem.index" ;
1161
1161
std::string disk_index_path = index_prefix_path + " _disk.index" ;
1162
+
1162
1163
std::string medoids_path = disk_index_path + " _medoids.bin" ;
1163
1164
std::string centroids_path = disk_index_path + " _centroids.bin" ;
1164
1165
1165
- std::string labels_to_medoids_path = disk_index_path + " _labels_to_medoids.txt" ;
1166
+ std::string disk_labels_to_medoids_path = disk_index_path + " _labels_to_medoids.txt" ;
1166
1167
std::string mem_labels_file = mem_index_path + " _labels.txt" ;
1167
1168
std::string disk_labels_file = disk_index_path + " _labels.txt" ;
1168
1169
std::string mem_univ_label_file = mem_index_path + " _universal_label.txt" ;
1169
1170
std::string disk_univ_label_file = disk_index_path + " _universal_label.txt" ;
1171
+ std::string mem_labels_int_map_file = mem_index_path + " _labels_map.txt" ;
1170
1172
std::string disk_labels_int_map_file = disk_index_path + " _labels_map.txt" ;
1171
1173
std::string dummy_remap_file = disk_index_path + " _dummy_remap.txt" ; // remap will be used if we break-up points of
1172
1174
// high label-density to create copies
@@ -1232,19 +1234,19 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
1232
1234
std::string augmented_data_file, augmented_labels_file;
1233
1235
if (use_filters)
1234
1236
{
1235
- convert_labels_string_to_int (labels_file_original, labels_file_to_use, disk_labels_int_map_file,
1236
- universal_label);
1237
+ /* convert_labels_string_to_int(labels_file_original, labels_file_to_use, disk_labels_int_map_file,
1238
+ universal_label);*/
1237
1239
augmented_data_file = index_prefix_path + " _augmented_data.bin" ;
1238
1240
augmented_labels_file = index_prefix_path + " _augmented_labels.txt" ;
1239
1241
if (filter_threshold != 0 )
1240
1242
{
1241
1243
dummy_remap_file = index_prefix_path + " _dummy_remap.txt" ;
1242
- breakup_dense_points<T>(data_file_to_use, labels_file_to_use , filter_threshold, augmented_data_file,
1244
+ breakup_dense_points<T>(data_file_to_use, labels_file_original , filter_threshold, augmented_data_file,
1243
1245
augmented_labels_file,
1244
1246
dummy_remap_file); // RKNOTE: This has large memory footprint,
1245
1247
// need to make this streaming
1246
1248
data_file_to_use = augmented_data_file;
1247
- labels_file_to_use = augmented_labels_file;
1249
+ labels_file_original = augmented_labels_file;
1248
1250
}
1249
1251
}
1250
1252
@@ -1287,10 +1289,10 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
1287
1289
#endif
1288
1290
1289
1291
timer.reset ();
1290
- diskann::build_merged_vamana_index<T, LabelT>(data_file_to_use. c_str (), diskann::Metric::L2, L, R, p_val,
1291
- indexing_ram_budget, mem_index_path, medoids_path, centroids_path ,
1292
- build_pq_bytes, use_opq, num_threads, use_filters, labels_file_to_use ,
1293
- labels_to_medoids_path , universal_label, Lf);
1292
+ diskann::build_merged_vamana_index<T, LabelT>(
1293
+ data_file_to_use. c_str (), diskann::Metric::L2, L, R, p_val, indexing_ram_budget, mem_index_path, medoids_path,
1294
+ centroids_path, build_pq_bytes, use_opq, num_threads, use_filters, labels_file_original ,
1295
+ disk_labels_to_medoids_path , universal_label, Lf);
1294
1296
diskann::cout << timer.elapsed_seconds_for_step (" building merged vamana index" ) << std::endl;
1295
1297
1296
1298
timer.reset ();
@@ -1315,16 +1317,21 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
1315
1317
gen_random_slice<T>(data_file_to_use.c_str (), sample_base_prefix, sample_sampling_rate);
1316
1318
if (use_filters)
1317
1319
{
1318
- copy_file (labels_file_to_use, disk_labels_file);
1320
+ // copy labels file
1321
+ copy_file (mem_labels_file, disk_labels_file);
1319
1322
std::remove (mem_labels_file.c_str ());
1323
+ // copy universal label
1320
1324
if (universal_label != " " )
1321
1325
{
1322
1326
copy_file (mem_univ_label_file, disk_univ_label_file);
1323
1327
std::remove (mem_univ_label_file.c_str ());
1324
1328
}
1329
+ // copy map file
1330
+ copy_file (mem_labels_int_map_file, disk_labels_int_map_file);
1331
+ std::remove (mem_labels_int_map_file.c_str ());
1332
+
1325
1333
std::remove (augmented_data_file.c_str ());
1326
1334
std::remove (augmented_labels_file.c_str ());
1327
- std::remove (labels_file_to_use.c_str ());
1328
1335
}
1329
1336
1330
1337
std::remove (mem_index_path.c_str ());
0 commit comments