
Commit af9c40c

Merge branch 'fix-rmm_build_error-aleliu' into 'main'
fix DISABLE_CUDF build option

See merge request dl/hugectr/hugectr!1521

2 parents 19e7154 + a620bda


9 files changed (+99 -33 lines)


CMakeLists.txt (+22 -5)

@@ -122,7 +122,8 @@ if (DISABLE_A2A_WARMUP)
 endif()
 
 option(DISABLE_CUDF "Disable cudf: disable parquet format related features" OFF)
-
+# this manual definition is a WAR and RMM team will fix it in the future
+add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
 option(USE_CUDART_STATIC "Setup clangformat target" OFF)
 if(USE_CUDART_STATIC)
   set(DISABLE_CUDF ON)
@@ -137,13 +138,29 @@ if (DISABLE_CUDF)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDISABLE_CUDF")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DDISABLE_CUDF")
 else()
+  execute_process(
+    COMMAND bash -c "pip show cudf|grep Version | sed 's/.*: //'"
+    OUTPUT_VARIABLE CUDF_VERSION
+  )
+
+  string(REPLACE "." ";" VERSION_LIST ${CUDF_VERSION})
+  list(GET VERSION_LIST 0 CUDF_VERSION_MAJOR)
+  list(GET VERSION_LIST 1 CUDF_VERSION_MINOR)
+  # list(GET VERSION_LIST 2 CUDF_VERSION_PATCH)
+  # add_compile_definitions(CUDF_VERSION_PATCH=${CUDF_VERSION_PATCH})
+
+  add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
+  add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
+  message(STATUS "CUDF_VERSION is ${CUDF_VERSION}")
+
   find_package(Parquet REQUIRED CONFIG PATHS /usr/lib/cmake/arrow/ /usr/lib/cmake/Parquet/ NO_DEFAULT_PATH)
   if(Parquet_FOUND AND NOT ENABLE_HDFS AND NOT ENABLE_S3 AND NOT ENABLE_GCS)
-    message (STATUS "Arrow Parquet is found")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_ARROW_PARQUET")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_ARROW_PARQUET")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_ARROW_PARQUET")
+    message (STATUS "Arrow Parquet is found")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_ARROW_PARQUET")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_ARROW_PARQUET")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_ARROW_PARQUET")
   endif()
+
 endif()
 
 option(SHARP_A2A "Enable SHARP All2All" OFF)
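Editor's note: the CUDF_VERSION_MAJOR and CUDF_VERSION_MINOR definitions added above are what the source files later in this commit branch on. A minimal sketch of how such macros are typically consumed on the C++ side; the convenience macro CUDF_VERSION_AT_LEAST is hypothetical and not part of this commit:

// Hypothetical illustration only: branching on the version macros that
// CMakeLists.txt injects via add_compile_definitions() after querying
// `pip show cudf`. CUDF_VERSION_AT_LEAST is an assumed helper, not HugeCTR code.
#include <cstdio>

#if defined(CUDF_VERSION_MAJOR) && defined(CUDF_VERSION_MINOR)
#define CUDF_VERSION_AT_LEAST(major, minor) \
  (CUDF_VERSION_MAJOR > (major) ||          \
   (CUDF_VERSION_MAJOR == (major) && CUDF_VERSION_MINOR >= (minor)))
#else
#define CUDF_VERSION_AT_LEAST(major, minor) 0  // DISABLE_CUDF build or macros missing
#endif

int main() {
#if CUDF_VERSION_AT_LEAST(23, 12)
  std::puts("cudf >= 23.12: stream-aware I/O overloads can be used");
#else
  std::puts("older cudf or DISABLE_CUDF build: legacy I/O overloads");
#endif
  return 0;
}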

HugeCTR/include/data_generator.hpp (+1 -2)

@@ -31,9 +31,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/bit.hpp>
-#endif
-
 #include <rmm/device_buffer.hpp>
+#endif
 
 namespace HugeCTR {

HugeCTR/include/resource_manager.hpp (+5)

@@ -21,7 +21,10 @@
 #include <device_map.hpp>
 #include <gpu_resource.hpp>
 #include <resource_manager_base.hpp>
+
+#ifndef DISABLE_CUDF
 #include <rmm/mr/device/device_memory_resource.hpp>
+#endif
 
 namespace HugeCTR {
 
@@ -45,8 +48,10 @@ class ResourceManager : public ResourceManagerBase {
 
   virtual DeviceMap::Layout get_device_layout() const = 0;
 
+#ifndef DISABLE_CUDF
   virtual const std::shared_ptr<rmm::mr::device_memory_resource>&
   get_device_rmm_device_memory_resource(int local_gpu_id) const = 0;
+#endif
 };
 
 } // namespace HugeCTR

HugeCTR/include/resource_managers/resource_manager_core.hpp (+7 -2)

@@ -33,13 +33,16 @@ class ResourceManagerCore : public ResourceManager {
   std::vector<std::shared_ptr<GPUResource>> gpu_resources_; /**< GPU resource vector */
   std::vector<std::vector<bool>> p2p_matrix_;
 
+  void all2all_warmup();
+  void enable_all_peer_accesses();
+
+#ifndef DISABLE_CUDF
   std::vector<std::shared_ptr<rmm::mr::device_memory_resource>> base_cuda_mr_;
   std::vector<std::shared_ptr<rmm::mr::device_memory_resource>> memory_resource_;
   std::vector<rmm::mr::device_memory_resource*> original_device_resource_;
 
-  void all2all_warmup();
-  void enable_all_peer_accesses();
   void initialize_rmm_resources();
+#endif
 
  public:
   ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map,
@@ -112,7 +115,9 @@ class ResourceManagerCore : public ResourceManager {
 
   DeviceMap::Layout get_device_layout() const override { return device_map_.get_device_layout(); }
 
+#ifndef DISABLE_CUDF
   const std::shared_ptr<rmm::mr::device_memory_resource>& get_device_rmm_device_memory_resource(
       int local_gpu_id) const override;
+#endif
 };
 } // namespace HugeCTR
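Editor's note: because get_device_rmm_device_memory_resource() now exists only in cudf-enabled builds, call sites need the same fence the headers above introduce. A minimal hypothetical call-site sketch (log_memory_backend is illustrative and not part of this commit):

// Hypothetical caller mirroring the DISABLE_CUDF fence introduced above.
#include <memory>

#include <resource_manager.hpp>  // header changed in this commit

#ifndef DISABLE_CUDF
#include <rmm/mr/device/device_memory_resource.hpp>
#endif

// Illustrative helper, not HugeCTR code.
inline void log_memory_backend(const HugeCTR::ResourceManager& rm, int local_gpu_id) {
#ifndef DISABLE_CUDF
  // cudf-enabled builds: the per-GPU RMM pool is available.
  const std::shared_ptr<rmm::mr::device_memory_resource>& mr =
      rm.get_device_rmm_device_memory_resource(local_gpu_id);
  (void)mr;  // e.g. hand it to cudf I/O calls
#else
  // DISABLE_CUDF builds: no RMM accessor exists; fall back to plain CUDA paths.
  (void)rm;
  (void)local_gpu_id;
#endif
}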

HugeCTR/src/data_readers/file_source_parquet.cpp (+5 -1)

@@ -154,8 +154,12 @@ cudf_io::table_with_metadata ParquetFileSource::read_group(size_t row_group_id,
   parquet_args_.set_row_groups(rgrps);
   // parquet_args_.set_num_rows(-1);
   parquet_args_.set_timestamp_type(cudf::data_type(cudf::type_id::EMPTY));
-  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, mr);
 
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR >= 24
+  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, cudf::get_default_stream(), mr);
+#else
+  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, mr);
+#endif
  if (!counter_) {
    HCTR_OWN_THROW(Error_t::UnspecificError, "Read parquet file first\n");
  }
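Editor's note: the same two-signature dispatch reappears for read_csv in the tools below. One way to keep it in a single place is a small wrapper; this is a sketch under the assumption that only the two overloads shown in this diff need covering (read_parquet_compat is not part of the commit):

// Hypothetical compatibility wrapper around the two read_parquet forms
// visible in this diff; not part of the commit itself.
#ifndef DISABLE_CUDF
#include <cudf/io/parquet.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf_io = cudf::io;

inline cudf_io::table_with_metadata read_parquet_compat(
    const cudf_io::parquet_reader_options& args, rmm::mr::device_memory_resource* mr) {
#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR >= 24
  // Newer cudf: explicit stream argument, as used above.
  return cudf_io::read_parquet(args, cudf::get_default_stream(), mr);
#else
  // Older cudf: (options, mr) overload.
  return cudf_io::read_parquet(args, mr);
#endif
}
#endif  // DISABLE_CUDF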

HugeCTR/src/resource_managers/resource_manager_core.cpp (+32 -22)

@@ -21,12 +21,15 @@
 #include <random>
 #include <resource_managers/resource_manager_core.hpp>
 #include <utils.hpp>
+
+#ifndef DISABLE_CUDF
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #pragma GCC diagnostic pop
+#endif
 
 namespace HugeCTR {
 
@@ -98,27 +101,6 @@ void ResourceManagerCore::enable_all_peer_accesses() {
   }
 }
 
-void ResourceManagerCore::initialize_rmm_resources() {
-  const size_t pool_alloc_size = 256 * 1024 * 1024;
-  using dmmr = rmm::mr::device_memory_resource;
-  static const char* allow_set_char = getenv("HCTR_RMM_SETTABLE");
-  bool allow_set = true;
-  if (allow_set_char && allow_set_char[0] == '0') {
-    allow_set = false;
-  }
-  CudaDeviceContext context;
-  auto local_gpu_device_id_list = get_local_gpu_device_id_list();
-  for (size_t i = 0; i < local_gpu_device_id_list.size(); i++) {
-    context.set_device(local_gpu_device_id_list[i]);
-    base_cuda_mr_.emplace_back(std::make_shared<rmm::mr::cuda_memory_resource>());
-    memory_resource_.emplace_back(std::make_shared<rmm::mr::pool_memory_resource<dmmr>>(
-        base_cuda_mr_.back().get(), pool_alloc_size));
-    if (allow_set) {
-      original_device_resource_.push_back(
-          rmm::mr::set_current_device_resource(memory_resource_.back().get()));
-    }
-  }
-}
 ResourceManagerCore::ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map,
                                          unsigned long long seed)
     : num_process_(num_process), process_id_(process_id), device_map_(std::move(device_map)) {
@@ -204,13 +186,17 @@ ResourceManagerCore::ResourceManagerCore(int num_process, int process_id, Device
 
   all2all_warmup();
 
+#ifndef DISABLE_CUDF
   initialize_rmm_resources();
+#endif
   // int dev_id = 0;
   // cudaGetDevice(&dev_id);
   // HCTR_LOG(INFO, WORLD, "ResourceManagerCore ctor getCurrentDeviceId after rmm_init %d\n",
   // dev_id);
 }
+
 ResourceManagerCore::~ResourceManagerCore() {
+#ifndef DISABLE_CUDF
   if (original_device_resource_.empty()) {
     return;
   }
@@ -220,6 +206,7 @@ ResourceManagerCore::~ResourceManagerCore() {
     context.set_device(local_gpu_device_id_list[i]);
     rmm::mr::set_current_device_resource(original_device_resource_[i]);
   }
+#endif
 }
 bool ResourceManagerCore::p2p_enabled(int src_device_id, int dst_device_id) const {
   return p2p_matrix_[src_device_id][dst_device_id];
@@ -240,12 +227,35 @@ bool ResourceManagerCore::all_p2p_enabled() const {
   return true;
 }
 
+#ifndef DISABLE_CUDF
+void ResourceManagerCore::initialize_rmm_resources() {
+  const size_t pool_alloc_size = 256 * 1024 * 1024;
+  using dmmr = rmm::mr::device_memory_resource;
+  static const char* allow_set_char = getenv("HCTR_RMM_SETTABLE");
+  bool allow_set = true;
+  if (allow_set_char && allow_set_char[0] == '0') {
+    allow_set = false;
+  }
+  CudaDeviceContext context;
+  auto local_gpu_device_id_list = get_local_gpu_device_id_list();
+  for (size_t i = 0; i < local_gpu_device_id_list.size(); i++) {
+    context.set_device(local_gpu_device_id_list[i]);
+    base_cuda_mr_.emplace_back(std::make_shared<rmm::mr::cuda_memory_resource>());
+    memory_resource_.emplace_back(std::make_shared<rmm::mr::pool_memory_resource<dmmr>>(
+        base_cuda_mr_.back().get(), pool_alloc_size));
+    if (allow_set) {
+      original_device_resource_.push_back(
+          rmm::mr::set_current_device_resource(memory_resource_.back().get()));
+    }
+  }
+}
+
 const std::shared_ptr<rmm::mr::device_memory_resource>&
 ResourceManagerCore::get_device_rmm_device_memory_resource(int local_gpu_id) const {
   auto dev_list = get_local_gpu_device_id_list();
   auto it = std::find(dev_list.begin(), dev_list.end(), local_gpu_id);
   auto index = std::distance(dev_list.begin(), it);
   return memory_resource_[index];
 }
-
+#endif
 } // namespace HugeCTR
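Editor's note: the relocated initialize_rmm_resources() follows the usual RMM layering, a pool_memory_resource sub-allocating from a cuda_memory_resource, installed per device and restored in the destructor. A stripped-down standalone sketch of that pattern, independent of ResourceManagerCore and assuming a single visible GPU:

// Minimal standalone sketch of the RMM setup that initialize_rmm_resources()
// performs per GPU: cuda_memory_resource -> pool_memory_resource -> install.
// Assumes a cudf-enabled build (DISABLE_CUDF not defined) and one device.
#include <memory>

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main() {
  constexpr std::size_t pool_alloc_size = 256 * 1024 * 1024;  // 256 MiB, as in the diff

  // Upstream resource: plain cudaMalloc/cudaFree.
  auto base_mr = std::make_shared<rmm::mr::cuda_memory_resource>();

  // Pool resource that sub-allocates from the upstream resource.
  auto pool_mr = std::make_shared<
      rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>>(base_mr.get(),
                                                                      pool_alloc_size);

  // Install the pool as the current device resource and keep the previous one,
  // so it can be restored on shutdown (as the destructor above does).
  rmm::mr::device_memory_resource* previous =
      rmm::mr::set_current_device_resource(pool_mr.get());

  // ... run cudf / RMM allocations here ...

  rmm::mr::set_current_device_resource(previous);
  return 0;
}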

tools/dlrm_script/CMakeLists.txt (+2 -1)

@@ -41,7 +41,8 @@ if (NOT CUDF_RESULT)
   list(GET CUDF_VERSION_LIST 0 CUDF_VERSION_MAJOR)
   list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_MINOR)
   list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_PATCH)
-
+  add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
+  add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
   if(${CUDF_VERSION_MAJOR} EQUAL 23 AND ${CUDF_VERSION_MINOR} GREATER 6)
     add_definitions(-DCUDF_GE_2306)
   endif()

tools/dlrm_script/dlrm_raw.cu (+17)

@@ -136,7 +136,15 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
   int loop_count = 0;
   while (true) {
     total_file_bytes_read += in_args.get_byte_range_size();
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
+    CUDF_VERSION_MINOR >= 12
+    cudf_io::table_with_metadata tbl_w_metadata =
+        cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
+#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+#else
     cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
+#endif
     total_row_nums += tbl_w_metadata.tbl->num_rows();
 
     dim3 block(prop.maxThreadsPerBlock, 1, 1);
@@ -488,7 +496,16 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
   int loop_count = 0;
   while (true) {
     total_file_bytes_read += in_args.get_byte_range_size();
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
+    CUDF_VERSION_MINOR >= 12
+    cudf_io::table_with_metadata tbl_w_metadata =
+        cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
+#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+#else
     cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
+
+#endif
     total_row_nums += tbl_w_metadata.tbl->num_rows();
 
     dim3 block(prop.maxThreadsPerBlock, 1, 1);
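Editor's note: the same three-way #if is repeated in dlrm_raw_utils.hpp below. Since the condition is long and easy to let drift between copies, one option is to compute a single feature-test macro once and branch on it; a hedged sketch (CUDF_CSV_READ_TAKES_STREAM is hypothetical, not part of this commit):

// Hypothetical consolidation of the repeated cudf-version check used in
// dlrm_raw.cu and dlrm_raw_utils.hpp; not part of this commit.
// A stream-taking read_csv is assumed for cudf 23.12+ and any major > 23,
// matching the #if conditions in the diff.
#if (defined(CUDF_VERSION_MAJOR) && defined(CUDF_VERSION_MINOR) && \
     CUDF_VERSION_MAJOR == 23 && CUDF_VERSION_MINOR >= 12) ||      \
    (defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23)
#define CUDF_CSV_READ_TAKES_STREAM 1
#else
#define CUDF_CSV_READ_TAKES_STREAM 0
#endif

// Call sites then collapse to a single two-way branch, e.g.:
//
//   #if CUDF_CSV_READ_TAKES_STREAM
//     auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
//   #else
//     auto tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
//   #endif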

tools/dlrm_script/dlrm_raw_utils.hpp (+8)

@@ -536,7 +536,15 @@ size_t convert_input_binaries(rmm::mr::device_memory_resource *mr, std::string i
 
   while (true) {
     process_read_bytes += in_args.get_byte_range_size();
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
+    CUDF_VERSION_MINOR >= 12
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+#else
     auto tbl_w_metadata = cudf_io::read_csv(in_args, mr);
+
+#endif
     int32_t num_rows = tbl_w_metadata.tbl->num_rows();
     read_row_nums += num_rows;
