Random Forest GPU kernel update (uxlfoundation#2464)
* Update GPU kernel for Decision Forest (uxlfoundation#2402)


* Fast and scalable version; code not yet cleaned up

* Remove redundant structures

* Minor updates: add comments, remove redundant methods

* Apply suggestions from PR uxlfoundation#2402: use ndview instead of ndarray, add check_mul_overflow, mark immutable data const, and other minor changes.

* Add scalability among bins.

* Fix bin offset calculation

* Update bin offset for more scalable calculations

* Update according to PR uxlfoundation#2402 suggestions

* Reserve more memory depending on double or float precision

* Fix memory capacity calculation for the double precision case

* Optimized double precision computing + fixed minor bugs

* More optimized version for regression

* Add ONEDAL_ASSERT for checking bin_count

* Add kernel description

* Merge after cherry picking

* Cherry pick the commit

* Minor code updates to fix test cases

* Update comments

* Kernel fixes

* Update dependencies for kernel

* Fix typo

* Add weighted case support

* Rebase on master

* Apply clang-format

* Replace std::log2 and std::exp2

* Rebase onto fresh master and minor naming updates

* Minor kernel updates

* Draft of fixed kernel

* Accuracy fixed

* Optimized (not yet refactored) kernel

* Code cleaning

* Minor updates

* Minor optimizations

* Kernel refactoring

* Apply clang-format

* Bug fix

* Regression precision fixed

* Fix spmd test failures

* Avoid using double precision type inside the kernel

* Check extremal case and throw exception

* Fix double

* Update license header year

* Code refactoring
inteldimitrius authored Dec 13, 2023
1 parent 97274fc commit 14ad1ab
Showing 17 changed files with 898 additions and 3,865 deletions.
@@ -37,6 +37,10 @@ template <typename Float, typename Index, typename Task>
void infer_kernel_impl<Float, Index, Task>::validate_input(const descriptor_t& desc,
const model_t& model,
const table& data) const {
if (data.get_row_count() == 0) {
throw domain_error(msg::invalid_range_of_rows());
}

if (data.get_row_count() > de::limits<Index>::max()) {
throw domain_error(dal::detail::error_messages::invalid_range_of_rows());
}
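
This hunk rejects empty input tables up front, so inference fails with a domain_error instead of reaching the GPU kernel with zero rows. Below is a minimal standalone sketch of the same guard pattern; the function name and the 32-bit index limit are illustrative assumptions, not oneDAL API.

    #include <cstdint>
    #include <limits>
    #include <stdexcept>

    // Hypothetical guard mirroring validate_input above: reject an empty table
    // first, then a row count that does not fit the (assumed) 32-bit index type
    // used by the device kernels.
    void validate_row_count(std::int64_t row_count) {
        if (row_count == 0) {
            throw std::domain_error("input table has no rows");
        }
        if (row_count > std::numeric_limits<std::int32_t>::max()) {
            throw std::domain_error("row count exceeds the supported index range");
        }
    }
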
@@ -449,6 +449,7 @@ sycl::event indexed_features<Float, Bin, Index>::operator()(const table& tbl,
for (Index i = 0; i < column_count_; i++) {
last_event =
store_column(column_bin_vec_[i], full_data_nd_, i, column_count_, { last_event });
last_event.wait_and_throw();
bin_offsets[i] = total;
entries_[i].offset_ = total;
total += entries_[i].bin_count_;
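
The single added line makes the loop wait on (and rethrow asynchronous errors from) each store_column submission before the host-side offset bookkeeping for that column runs. The self-contained SYCL sketch below shows this per-iteration synchronization pattern; the queue setup and the per-column kernel body are placeholders, not the oneDAL code.

    #include <sycl/sycl.hpp>
    #include <vector>

    int main() {
        sycl::queue queue;
        constexpr int column_count = 4;
        constexpr int row_count = 1024;

        // Device-visible buffer holding one per-column result (placeholder work).
        int* per_column = sycl::malloc_shared<int>(column_count, queue);

        std::vector<int> offsets(column_count, 0);
        int total = 0;
        for (int i = 0; i < column_count; ++i) {
            sycl::event last_event = queue.submit([&](sycl::handler& cgh) {
                cgh.single_task([=]() {
                    per_column[i] = row_count / (i + 1); // stand-in for store_column
                });
            });
            // Wait for this column's kernel (and surface any async errors) before
            // the host reads its result and accumulates the running offset.
            last_event.wait_and_throw();
            offsets[i] = total;
            total += per_column[i];
        }

        sycl::free(per_column, queue);
        return 0;
    }
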
@@ -29,18 +29,23 @@ using result_t = train_result<task::classification>;
using descriptor_t = detail::descriptor_base<task::classification>;

template <typename Float>
static result_t call_daal_kernel(const context_gpu& ctx,
const descriptor_t& desc,
const table& data,
const table& responses) {
static result_t call_train_kernel(const context_gpu& ctx,
const descriptor_t& desc,
const table& data,
const table& responses,
const table& weights) {
train_kernel_hist_impl<Float, std::uint32_t, std::int32_t, task::classification>
train_hist_impl(ctx);
return train_hist_impl(desc, data, responses);
return train_hist_impl(desc, data, responses, weights);
}

template <typename Float>
static result_t train(const context_gpu& ctx, const descriptor_t& desc, const input_t& input) {
return call_daal_kernel<Float>(ctx, desc, input.get_data(), input.get_responses());
return call_train_kernel<Float>(ctx,
desc,
input.get_data(),
input.get_responses(),
input.get_weights());
}

template <typename Float, typename Task>
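
With this change the GPU classification entry point forwards the optional sample-weights table from input.get_weights() into the histogram training implementation. As an illustration of what per-sample weights change downstream, the sketch below shows weighted histogram accumulation for a single binned feature; the function and container layout are hypothetical and are not the oneDAL kernel.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustrative only: each sample adds its weight (rather than a unit count)
    // to the bin it falls into, so weights of 1.0f reduce to the unweighted case.
    std::vector<float> build_weighted_histogram(const std::vector<std::int32_t>& bins,
                                                const std::vector<float>& weights,
                                                std::size_t bin_count) {
        std::vector<float> hist(bin_count, 0.0f);
        for (std::size_t i = 0; i < bins.size(); ++i) {
            hist[static_cast<std::size_t>(bins[i])] += weights[i];
        }
        return hist;
    }
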
@@ -29,7 +29,6 @@
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type.hpp"
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_model_manager.hpp"
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl.hpp"
#include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_sp_opt_impl.hpp"

namespace oneapi::dal::decision_forest::backend {

@@ -56,21 +55,20 @@ class train_kernel_hist_impl {
using msg = dal::detail::error_messages;
using comm_t = bk::communicator<spmd::device_memory_access::usm>;
using node_t = node<Index>;
using node_list_t = node_list<Index>;
using node_group_t = node_group<Index>;
using node_group_list_t = node_group_list<Index>;

public:
using hist_type_t = typename task_types<Float, Index, Task>::hist_type_t;

train_kernel_hist_impl(const bk::context_gpu& ctx)
: queue_(ctx.get_queue()),
comm_(ctx.get_communicator()),
train_service_kernels_(queue_),
node_group_list_(queue_) {}
train_service_kernels_(queue_) {}
~train_kernel_hist_impl() = default;

result_t operator()(const descriptor_t& desc, const table& data, const table& labels);
result_t operator()(const descriptor_t& desc,
const table& data,
const table& labels,
const table& weights);

private:
std::int64_t get_part_hist_required_mem_size(Index selected_ftr_count,
@@ -103,7 +101,8 @@
void init_params(train_context_t& ctx,
const descriptor_t& desc,
const table& data,
const table& labels);
const table& labels,
const table& weights);
/// Allocates all buffers that are used for training.
/// @param[in] ctx a training context structure for a GPU backend
void allocate_buffers(const train_context_t& ctx);
@@ -288,6 +287,7 @@
sycl::event compute_best_split(const train_context_t& ctx,
const pr::ndarray<Bin, 2>& data,
const pr::ndview<Float, 1>& response,
const pr::ndview<Float, 1>& weights,
const pr::ndarray<Index, 1>& tree_order,
const pr::ndarray<Index, 1>& selected_ftr_list,
const pr::ndarray<Float, 1>& random_bins_com,
@@ -618,6 +618,7 @@
pr::ndarray<Index, 1> ftr_bin_offsets_nd_;
std::vector<pr::ndarray<Float, 1>> bin_borders_host_;
pr::ndarray<Float, 1> response_nd_;
pr::ndarray<Float, 1> weights_nd_;
pr::ndarray<Float, 1> response_host_;
pr::ndarray<Float, 1> data_host_;

@@ -632,8 +633,6 @@
pr::ndarray<Float, 1> var_imp_variance_host_;

pr::ndarray<Float, 1> res_var_imp_;

node_group_list_t node_group_list_;
};

#endif
(Remaining changed files not shown.)