Random Forest GPU kernel update (uxlfoundation#2464)
* Update GPU kernel for Decision Forest (uxlfoundation#2402)


* Fast and scalable version; code not yet cleaned up

* Remove redundant structures

* Minor updates: add comments, remove redundant methods

* Apply suggestions from PR uxlfoundation#2402: use ndview instead of ndarray, add check_mul_overflow, mark immutable data const, and other minor changes.

* Add scalability among bins.

* Fix bin offset calculation

* Update bin offset for more scalable calculations

* Update according to PR uxlfoundation#2402 suggestions

* Reserve more memory depending on double or float precision

* Fix memory capacity calculation for the double precision case

* Optimized double precision computing + fixed minor bugs

* More optimized version for regression

* Add ONEDAL_ASSERT for checking bin_count

* Add kernel description

* Merge after cherry picking

* Cherry pick the commit

* Minor code updates to fix test cases

* Update comments

* Kernel fixes

* Update dependencies for kernel

* Fix typo

* Add weighted case support

* Rebase on master

* Apply clang-format

* Replace std::log2 and std::exp2

* Rebase onto fresh master and minor naming updates

* Minor kernel updates

* Draft of fixed kernel

* Accuracy fixed

* Optimized (not yet refactored) kernel

* Code cleaning

* Minor updates

* Minor optimizations

* Kernel refactoring

* Apply clang-format

* Bug fix

* Regression precision fixed

* Fix spmd test failures

* Avoid using double precision type inside the kernel

* Check extremal case and throw exception

* Fix double

* Update license header year

* Code refactoring
inteldimitrius authored Dec 13, 2023
1 parent 97274fc commit 14ad1ab
Showing 17 changed files with 898 additions and 3,865 deletions.
@@ -37,6 +37,10 @@ template <typename Float, typename Index, typename Task>
void infer_kernel_impl<Float, Index, Task>::validate_input(const descriptor_t& desc,
const model_t& model,
const table& data) const {
if (data.get_row_count() == 0) {
throw domain_error(msg::invalid_range_of_rows());
}

if (data.get_row_count() > de::limits<Index>::max()) {
throw domain_error(dal::detail::error_messages::invalid_range_of_rows());
}
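
This hunk rejects empty input tables up front, so inference fails with a domain_error instead of reaching the GPU kernel with zero rows. Below is a minimal standalone sketch of the same guard pattern; the function name and the 32-bit index limit are illustrative assumptions, not oneDAL API.

    #include <cstdint>
    #include <limits>
    #include <stdexcept>

    // Hypothetical guard mirroring validate_input above: reject an empty table
    // first, then a row count that does not fit the (assumed) 32-bit index type
    // used by the device kernels.
    void validate_row_count(std::int64_t row_count) {
        if (row_count == 0) {
            throw std::domain_error("input table has no rows");
        }
        if (row_count > std::numeric_limits<std::int32_t>::max()) {
            throw std::domain_error("row count exceeds the supported index range");
        }
    }
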
@@ -449,6 +449,7 @@ sycl::event indexed_features<Float, Bin, Index>::operator()(const table& tbl,
for (Index i = 0; i < column_count_; i++) {
last_event =
store_column(column_bin_vec_[i], full_data_nd_, i, column_count_, { last_event });
last_event.wait_and_throw();
bin_offsets[i] = total;
entries_[i].offset_ = total;
total += entries_[i].bin_count_;
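
The single added line makes the loop wait on (and rethrow asynchronous errors from) each store_column submission before the host-side offset bookkeeping for that column runs. The self-contained SYCL sketch below shows this per-iteration synchronization pattern; the queue setup and the per-column kernel body are placeholders, not the oneDAL code.

    #include <sycl/sycl.hpp>
    #include <vector>

    int main() {
        sycl::queue queue;
        constexpr int column_count = 4;
        constexpr int row_count = 1024;

        // Device-visible buffer holding one per-column result (placeholder work).
        int* per_column = sycl::malloc_shared<int>(column_count, queue);

        std::vector<int> offsets(column_count, 0);
        int total = 0;
        for (int i = 0; i < column_count; ++i) {
            sycl::event last_event = queue.submit([&](sycl::handler& cgh) {
                cgh.single_task([=]() {
                    per_column[i] = row_count / (i + 1); // stand-in for store_column
                });
            });
            // Wait for this column's kernel (and surface any async errors) before
            // the host reads its result and accumulates the running offset.
            last_event.wait_and_throw();
            offsets[i] = total;
            total += per_column[i];
        }

        sycl::free(per_column, queue);
        return 0;
    }
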
@@ -29,18 +29,23 @@ using result_t = train_result<task::classification>;
using descriptor_t = detail::descriptor_base<task::classification>;

template <typename Float>
static result_t call_daal_kernel(const context_gpu& ctx,
const descriptor_t& desc,
const table& data,
const table& responses) {
static result_t call_train_kernel(const context_gpu& ctx,
const descriptor_t& desc,
const table& data,
const table& responses,
const table& weights) {
train_kernel_hist_impl<Float, std::uint32_t, std::int32_t, task::classification>
train_hist_impl(ctx);
return train_hist_impl(desc, data, responses);
return train_hist_impl(desc, data, responses, weights);
}

template <typename Float>
static result_t train(const context_gpu& ctx, const descriptor_t& desc, const input_t& input) {
return call_daal_kernel<Float>(ctx, desc, input.get_data(), input.get_responses());
return call_train_kernel<Float>(ctx,
desc,
input.get_data(),
input.get_responses(),
input.get_weights());
}

template <typename Float, typename Task>
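
With this change the GPU classification entry point forwards the optional sample-weights table from input.get_weights() into the histogram training implementation. As an illustration of what per-sample weights change downstream, the sketch below shows weighted histogram accumulation for a single binned feature; the function and container layout are hypothetical and are not the oneDAL kernel.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustrative only: each sample adds its weight (rather than a unit count)
    // to the bin it falls into, so weights of 1.0f reduce to the unweighted case.
    std::vector<float> build_weighted_histogram(const std::vector<std::int32_t>& bins,
                                                const std::vector<float>& weights,
                                                std::size_t bin_count) {
        std::vector<float> hist(bin_count, 0.0f);
        for (std::size_t i = 0; i < bins.size(); ++i) {
            hist[static_cast<std::size_t>(bins[i])] += weights[i];
        }
        return hist;
    }
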
@@ -29,7 +29,6 @@
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type.hpp"
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_model_manager.hpp"
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl.hpp"
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_sp_opt_impl.hpp"

namespace oneapi::dal::decision_forest::backend {

@@ -56,21 +55,20 @@ class train_kernel_hist_impl {
using msg = dal::detail::error_messages;
using comm_t = bk::communicator<spmd::device_memory_access::usm>;
using node_t = node<Index>;
using node_list_t = node_list<Index>;
using node_group_t = node_group<Index>;
using node_group_list_t = node_group_list<Index>;

public:
using hist_type_t = typename task_types<Float, Index, Task>::hist_type_t;

train_kernel_hist_impl(const bk::context_gpu& ctx)
: queue_(ctx.get_queue()),
comm_(ctx.get_communicator()),
train_service_kernels_(queue_),
node_group_list_(queue_) {}
train_service_kernels_(queue_) {}
~train_kernel_hist_impl() = default;

result_t operator()(const descriptor_t& desc, const table& data, const table& labels);
result_t operator()(const descriptor_t& desc,
const table& data,
const table& labels,
const table& weights);

private:
std::int64_t get_part_hist_required_mem_size(Index selected_ftr_count,
@@ -103,7 +101,8 @@
void init_params(train_context_t& ctx,
const descriptor_t& desc,
const table& data,
const table& labels);
const table& labels,
const table& weights);
/// Allocates all buffers that are used for training.
/// @param[in] ctx a training context structure for a GPU backend
void allocate_buffers(const train_context_t& ctx);
@@ -288,6 +287,7 @@
sycl::event compute_best_split(const train_context_t& ctx,
const pr::ndarray<Bin, 2>& data,
const pr::ndview<Float, 1>& response,
const pr::ndview<Float, 1>& weights,
const pr::ndarray<Index, 1>& tree_order,
const pr::ndarray<Index, 1>& selected_ftr_list,
const pr::ndarray<Float, 1>& random_bins_com,
@@ -618,6 +618,7 @@
pr::ndarray<Index, 1> ftr_bin_offsets_nd_;
std::vector<pr::ndarray<Float, 1>> bin_borders_host_;
pr::ndarray<Float, 1> response_nd_;
pr::ndarray<Float, 1> weights_nd_;
pr::ndarray<Float, 1> response_host_;
pr::ndarray<Float, 1> data_host_;

@@ -632,8 +633,6 @@
pr::ndarray<Float, 1> var_imp_variance_host_;

pr::ndarray<Float, 1> res_var_imp_;

node_group_list_t node_group_list_;
};

#endif
(Remaining changed files not shown.)