Skip to content

Commit 3a8ea8c

Browse files
authored
Correctly align trees in experimental FIL (#6397)
Due to a bug in the import code, experimental FIL was previously not making correct use of the `align_bytes` argument. The effect was not just a failure to take advantage of cache line boundaries but a severe pessimization in which padding nodes were inserted in the forest structure at highly non-optimal places. This PR corrects this, resulting in a substantial performance improvement. It also introduces the `layered` layout type, in which nodes of the same depth are stored together; this allows for a moderate performance improvement in some models. Finally, the PR allows CPU FIL to intelligently set the number of threads rather than accepting the highly non-optimal default, which provides a significant performance improvement for small batch sizes. Authors: - William Hicks (https://github.com/wphicks) Approvers: - Philip Hyunsu Cho (https://github.com/hcho3) - Dante Gama Dessavre (https://github.com/dantegd) - https://github.com/jakirkham URL: #6397
1 parent c1391ba commit 3a8ea8c

37 files changed

+2376
-529
lines changed

conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ dependencies:
2929
- libcuvs==25.4.*,>=0.0.0a0
3030
- libraft-headers==25.4.*,>=0.0.0a0
3131
- librmm==25.4.*,>=0.0.0a0
32+
- llvm-openmp==15.0.7
3233
- ninja
3334
- nvcc_linux-64=11.8
3435
- sysroot_linux-64==2.28

cpp/CMakeLists.txt

+10-2
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,11 @@ if(BUILD_CUML_CPP_LIBRARY)
383383
src/experimental/fil/infer4.cu
384384
src/experimental/fil/infer5.cu
385385
src/experimental/fil/infer6.cu
386-
src/experimental/fil/infer7.cu)
386+
src/experimental/fil/infer7.cu
387+
src/experimental/fil/infer8.cu
388+
src/experimental/fil/infer9.cu
389+
src/experimental/fil/infer10.cu
390+
src/experimental/fil/infer11.cu)
387391
endif()
388392
target_sources(${CUML_CPP_TARGET}
389393
PRIVATE
@@ -395,7 +399,11 @@ if(BUILD_CUML_CPP_LIBRARY)
395399
src/experimental/fil/infer4.cpp
396400
src/experimental/fil/infer5.cpp
397401
src/experimental/fil/infer6.cpp
398-
src/experimental/fil/infer7.cpp)
402+
src/experimental/fil/infer7.cpp
403+
src/experimental/fil/infer8.cpp
404+
src/experimental/fil/infer9.cpp
405+
src/experimental/fil/infer10.cpp
406+
src/experimental/fil/infer11.cpp)
399407
endif()
400408

401409
# todo: organize linear models better

cpp/bench/sg/filex.cu

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -101,9 +101,9 @@ class FILEX : public RegressionFixture<float> {
101101
allowed_storage_types.push_back(ML::fil::storage_type_t::SPARSE8);
102102
}
103103
auto allowed_layouts = std::vector<ML::experimental::fil::tree_layout>{
104-
ML::experimental::fil::tree_layout::breadth_first,
105104
ML::experimental::fil::tree_layout::depth_first,
106-
};
105+
ML::experimental::fil::tree_layout::breadth_first,
106+
ML::experimental::fil::tree_layout::layered_children_together};
107107
auto min_time = std::numeric_limits<std::int64_t>::max();
108108

109109
// Iterate through storage type, algorithm type, and chunk sizes and find optimum

cpp/include/cuml/experimental/fil/decision_forest.hpp

+50-34
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -381,39 +381,55 @@ using preset_decision_forest = decision_forest<
381381
} // namespace detail
382382

383383
/** A variant containing all standard decision_forest instantiations */
384-
using decision_forest_variant =
385-
std::variant<detail::preset_decision_forest<
386-
std::variant_alternative_t<0, detail::specialization_variant>::layout,
387-
std::variant_alternative_t<0, detail::specialization_variant>::is_double_precision,
388-
std::variant_alternative_t<0, detail::specialization_variant>::has_large_trees>,
389-
detail::preset_decision_forest<
390-
std::variant_alternative_t<1, detail::specialization_variant>::layout,
391-
std::variant_alternative_t<1, detail::specialization_variant>::is_double_precision,
392-
std::variant_alternative_t<1, detail::specialization_variant>::has_large_trees>,
393-
detail::preset_decision_forest<
394-
std::variant_alternative_t<2, detail::specialization_variant>::layout,
395-
std::variant_alternative_t<2, detail::specialization_variant>::is_double_precision,
396-
std::variant_alternative_t<2, detail::specialization_variant>::has_large_trees>,
397-
detail::preset_decision_forest<
398-
std::variant_alternative_t<3, detail::specialization_variant>::layout,
399-
std::variant_alternative_t<3, detail::specialization_variant>::is_double_precision,
400-
std::variant_alternative_t<3, detail::specialization_variant>::has_large_trees>,
401-
detail::preset_decision_forest<
402-
std::variant_alternative_t<4, detail::specialization_variant>::layout,
403-
std::variant_alternative_t<4, detail::specialization_variant>::is_double_precision,
404-
std::variant_alternative_t<4, detail::specialization_variant>::has_large_trees>,
405-
detail::preset_decision_forest<
406-
std::variant_alternative_t<5, detail::specialization_variant>::layout,
407-
std::variant_alternative_t<5, detail::specialization_variant>::is_double_precision,
408-
std::variant_alternative_t<5, detail::specialization_variant>::has_large_trees>,
409-
detail::preset_decision_forest<
410-
std::variant_alternative_t<6, detail::specialization_variant>::layout,
411-
std::variant_alternative_t<6, detail::specialization_variant>::is_double_precision,
412-
std::variant_alternative_t<6, detail::specialization_variant>::has_large_trees>,
413-
detail::preset_decision_forest<
414-
std::variant_alternative_t<7, detail::specialization_variant>::layout,
415-
std::variant_alternative_t<7, detail::specialization_variant>::is_double_precision,
416-
std::variant_alternative_t<7, detail::specialization_variant>::has_large_trees>>;
384+
using decision_forest_variant = std::variant<
385+
detail::preset_decision_forest<
386+
std::variant_alternative_t<0, detail::specialization_variant>::layout,
387+
std::variant_alternative_t<0, detail::specialization_variant>::is_double_precision,
388+
std::variant_alternative_t<0, detail::specialization_variant>::has_large_trees>,
389+
detail::preset_decision_forest<
390+
std::variant_alternative_t<1, detail::specialization_variant>::layout,
391+
std::variant_alternative_t<1, detail::specialization_variant>::is_double_precision,
392+
std::variant_alternative_t<1, detail::specialization_variant>::has_large_trees>,
393+
detail::preset_decision_forest<
394+
std::variant_alternative_t<2, detail::specialization_variant>::layout,
395+
std::variant_alternative_t<2, detail::specialization_variant>::is_double_precision,
396+
std::variant_alternative_t<2, detail::specialization_variant>::has_large_trees>,
397+
detail::preset_decision_forest<
398+
std::variant_alternative_t<3, detail::specialization_variant>::layout,
399+
std::variant_alternative_t<3, detail::specialization_variant>::is_double_precision,
400+
std::variant_alternative_t<3, detail::specialization_variant>::has_large_trees>,
401+
detail::preset_decision_forest<
402+
std::variant_alternative_t<4, detail::specialization_variant>::layout,
403+
std::variant_alternative_t<4, detail::specialization_variant>::is_double_precision,
404+
std::variant_alternative_t<4, detail::specialization_variant>::has_large_trees>,
405+
detail::preset_decision_forest<
406+
std::variant_alternative_t<5, detail::specialization_variant>::layout,
407+
std::variant_alternative_t<5, detail::specialization_variant>::is_double_precision,
408+
std::variant_alternative_t<5, detail::specialization_variant>::has_large_trees>,
409+
detail::preset_decision_forest<
410+
std::variant_alternative_t<6, detail::specialization_variant>::layout,
411+
std::variant_alternative_t<6, detail::specialization_variant>::is_double_precision,
412+
std::variant_alternative_t<6, detail::specialization_variant>::has_large_trees>,
413+
detail::preset_decision_forest<
414+
std::variant_alternative_t<7, detail::specialization_variant>::layout,
415+
std::variant_alternative_t<7, detail::specialization_variant>::is_double_precision,
416+
std::variant_alternative_t<7, detail::specialization_variant>::has_large_trees>,
417+
detail::preset_decision_forest<
418+
std::variant_alternative_t<8, detail::specialization_variant>::layout,
419+
std::variant_alternative_t<8, detail::specialization_variant>::is_double_precision,
420+
std::variant_alternative_t<8, detail::specialization_variant>::has_large_trees>,
421+
detail::preset_decision_forest<
422+
std::variant_alternative_t<9, detail::specialization_variant>::layout,
423+
std::variant_alternative_t<9, detail::specialization_variant>::is_double_precision,
424+
std::variant_alternative_t<9, detail::specialization_variant>::has_large_trees>,
425+
detail::preset_decision_forest<
426+
std::variant_alternative_t<10, detail::specialization_variant>::layout,
427+
std::variant_alternative_t<10, detail::specialization_variant>::is_double_precision,
428+
std::variant_alternative_t<10, detail::specialization_variant>::has_large_trees>,
429+
detail::preset_decision_forest<
430+
std::variant_alternative_t<11, detail::specialization_variant>::layout,
431+
std::variant_alternative_t<11, detail::specialization_variant>::is_double_precision,
432+
std::variant_alternative_t<11, detail::specialization_variant>::has_large_trees>>;
417433

418434
/**
419435
* Determine the variant index of the decision_forest type to use based on

cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp

+33-38
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -58,32 +58,13 @@ struct decision_forest_builder {
5858
/* The type for nodes in the given decision_forest type */
5959
using node_type = typename decision_forest_t::node_type;
6060

61-
/* Add a root node, indicating the beginning of a new tree */
62-
void start_new_tree()
63-
{
64-
if (root_node_indexes_.empty()) {
65-
root_node_indexes_.emplace_back();
66-
} else {
67-
max_tree_size_ = std::max(cur_tree_size_, max_tree_size_);
68-
if (alignment_ != index_type{}) {
69-
if (cur_tree_size_ % alignment_ != index_type{}) {
70-
auto padding = (alignment_ - cur_tree_size_ % alignment_);
71-
for (auto i = index_type{}; i < padding; ++i) {
72-
add_node(typename node_type::threshold_type{}, std::nullopt);
73-
}
74-
}
75-
}
76-
root_node_indexes_.push_back(root_node_indexes_.back() + cur_tree_size_);
77-
cur_tree_size_ = index_type{};
78-
}
79-
}
80-
8161
/* Add a node with a categorical split */
8262
template <typename iter_t>
8363
void add_categorical_node(
8464
iter_t vec_begin,
8565
iter_t vec_end,
8666
std::optional<int> tl_node_id = std::nullopt,
67+
std::size_t depth = std::size_t{1},
8768
bool default_to_distant_child = false,
8869
typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
8970
typename node_type::offset_type offset = typename node_type::offset_type{})
@@ -103,46 +84,62 @@ struct decision_forest_builder {
10384
auto set = bitset{set_storage, max_node_categories};
10485
std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); });
10586

106-
add_node(node_value, tl_node_id, false, default_to_distant_child, true, feature, offset, false);
87+
add_node(
88+
node_value, tl_node_id, depth, false, default_to_distant_child, true, feature, offset, false);
10789
}
10890

10991
/* Add a leaf node with vector output */
11092
template <typename iter_t>
11193
void add_leaf_vector_node(iter_t vec_begin,
11294
iter_t vec_end,
113-
std::optional<int> tl_node_id = std::nullopt)
95+
std::optional<int> tl_node_id = std::nullopt,
96+
std::size_t depth = std::size_t{1})
11497
{
11598
auto leaf_index = typename node_type::index_type(vector_output_.size() / output_size_);
11699
std::copy(vec_begin, vec_end, std::back_inserter(vector_output_));
117-
nodes_.emplace_back(leaf_index,
118-
true,
119-
false,
120-
false,
121-
typename node_type::metadata_storage_type{},
122-
typename node_type::offset_type{});
123-
// 0 indicates the lack of ID mapping for a particular node
124-
node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
125-
++cur_tree_size_;
100+
101+
add_node(leaf_index,
102+
tl_node_id,
103+
depth,
104+
true,
105+
false,
106+
false,
107+
typename node_type::metadata_storage_type{},
108+
typename node_type::offset_type{},
109+
false);
126110
}
127111

128112
/* Add a node to the model */
129113
template <typename value_t>
130114
void add_node(
131115
value_t val,
132116
std::optional<int> tl_node_id = std::nullopt,
117+
std::size_t depth = std::size_t{1},
133118
bool is_leaf_node = true,
134119
bool default_to_distant_child = false,
135120
bool is_categorical_node = false,
136121
typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
137122
typename node_type::offset_type offset = typename node_type::offset_type{},
138123
bool is_inclusive = false)
139124
{
125+
if (depth == std::size_t{}) {
126+
if (alignment_ != index_type{}) {
127+
if (cur_node_index_ % alignment_ != index_type{}) {
128+
auto padding = (alignment_ - cur_node_index_ % alignment_);
129+
for (auto i = index_type{}; i < padding; ++i) {
130+
add_node(typename node_type::threshold_type{}, std::nullopt);
131+
}
132+
}
133+
}
134+
root_node_indexes_.push_back(cur_node_index_);
135+
}
136+
140137
if (is_inclusive) { val = std::nextafter(val, std::numeric_limits<value_t>::infinity()); }
141138
nodes_.emplace_back(
142139
val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset);
143140
// 0 indicates the lack of ID mapping for a particular node
144141
node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
145-
++cur_tree_size_;
142+
++cur_node_index_;
146143
}
147144

148145
/* Set the element-wise postprocessing operation for this model */
@@ -167,16 +164,15 @@ struct decision_forest_builder {
167164

168165
decision_forest_builder(index_type max_num_categories = index_type{},
169166
index_type align_bytes = index_type{})
170-
: cur_tree_size_{},
167+
: cur_node_index_{},
171168
max_num_categories_{max_num_categories},
172169
alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))},
173170
output_size_{1},
171+
row_postproc_{},
174172
element_postproc_{},
175173
average_factor_{},
176-
row_postproc_{},
177174
bias_{},
178175
postproc_constant_{},
179-
max_tree_size_{},
180176
nodes_{},
181177
root_node_indexes_{},
182178
vector_output_{}
@@ -233,7 +229,7 @@ struct decision_forest_builder {
233229
}
234230

235231
private:
236-
index_type cur_tree_size_;
232+
index_type cur_node_index_;
237233
index_type max_num_categories_;
238234
index_type alignment_;
239235
index_type output_size_;
@@ -242,7 +238,6 @@ struct decision_forest_builder {
242238
double average_factor_;
243239
double bias_;
244240
double postproc_constant_;
245-
index_type max_tree_size_;
246241

247242
std::vector<node_type> nodes_;
248243
std::vector<index_type> root_node_indexes_;

cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -242,6 +242,10 @@ CUML_FIL_INITIALIZE_DEVICE(extern template, 4)
242242
CUML_FIL_INITIALIZE_DEVICE(extern template, 5)
243243
CUML_FIL_INITIALIZE_DEVICE(extern template, 6)
244244
CUML_FIL_INITIALIZE_DEVICE(extern template, 7)
245+
CUML_FIL_INITIALIZE_DEVICE(extern template, 8)
246+
CUML_FIL_INITIALIZE_DEVICE(extern template, 9)
247+
CUML_FIL_INITIALIZE_DEVICE(extern template, 10)
248+
CUML_FIL_INITIALIZE_DEVICE(extern template, 11)
245249

246250
} // namespace device_initialization
247251
} // namespace detail

cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -147,6 +147,10 @@ CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
147147
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
148148
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
149149
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
150+
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 8)
151+
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 9)
152+
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 10)
153+
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 11)
150154

151155
} // namespace inference
152156
} // namespace detail

0 commit comments

Comments
 (0)