rapidsai
diff --git a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
+1 b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
+1
diff --git a/cpp/CMakeLists.txt
+10-2 b/cpp/CMakeLists.txt
+10-2
diff --git a/cpp/bench/sg/filex.cu
+3-3 b/cpp/bench/sg/filex.cu
+3-3
diff --git a/cpp/include/cuml/experimental/fil/decision_forest.hpp
+50-34 b/cpp/include/cuml/experimental/fil/decision_forest.hpp
+50-34
diff --git a/cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp
+33-38 b/cpp/include/cuml/experimental/fil/detail/decision_forest_builder.hpp
+33-38
diff --git a/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh
+5-1 b/cpp/include/cuml/experimental/fil/detail/device_initialization/gpu.cuh
+5-1
diff --git a/cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp
+5-1 b/cpp/include/cuml/experimental/fil/detail/infer/cpu.hpp
+5-1
@@ -29,6 +29,7 @@ dependencies:
 - libcuvs==25.4.*,>=0.0.0a0
 - libraft-headers==25.4.*,>=0.0.0a0
 - librmm==25.4.*,>=0.0.0a0
+- llvm-openmp==15.0.7
 - ninja
 - nvcc_linux-64=11.8
 - sysroot_linux-64==2.28
 
@@ -383,7 +383,11 @@ if(BUILD_CUML_CPP_LIBRARY)
           src/experimental/fil/infer4.cu
           src/experimental/fil/infer5.cu
           src/experimental/fil/infer6.cu
-          src/experimental/fil/infer7.cu)
+          src/experimental/fil/infer7.cu
+          src/experimental/fil/infer8.cu
+          src/experimental/fil/infer9.cu
+          src/experimental/fil/infer10.cu
+          src/experimental/fil/infer11.cu)
     endif()
     target_sources(${CUML_CPP_TARGET}
       PRIVATE
@@ -395,7 +399,11 @@ if(BUILD_CUML_CPP_LIBRARY)
         src/experimental/fil/infer4.cpp
         src/experimental/fil/infer5.cpp
         src/experimental/fil/infer6.cpp
-        src/experimental/fil/infer7.cpp)
+        src/experimental/fil/infer7.cpp
+        src/experimental/fil/infer8.cpp
+        src/experimental/fil/infer9.cpp
+        src/experimental/fil/infer10.cpp
+        src/experimental/fil/infer11.cpp)
   endif()
 
   # todo: organize linear models better
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,9 +101,9 @@ class FILEX : public RegressionFixture<float> {
       allowed_storage_types.push_back(ML::fil::storage_type_t::SPARSE8);
     }
     auto allowed_layouts = std::vector<ML::experimental::fil::tree_layout>{
-      ML::experimental::fil::tree_layout::breadth_first,
       ML::experimental::fil::tree_layout::depth_first,
-    };
+      ML::experimental::fil::tree_layout::breadth_first,
+      ML::experimental::fil::tree_layout::layered_children_together};
     auto min_time = std::numeric_limits<std::int64_t>::max();
 
     // Iterate through storage type, algorithm type, and chunk sizes and find optimum
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -381,39 +381,55 @@ using preset_decision_forest = decision_forest<
 }  // namespace detail
 
 /** A variant containing all standard decision_forest instantiations */
-using decision_forest_variant =
-  std::variant<detail::preset_decision_forest<
-                 std::variant_alternative_t<0, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<0, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<0, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<1, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<1, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<1, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<2, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<2, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<2, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<3, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<3, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<3, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<4, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<4, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<4, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<5, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<5, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<5, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<6, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<6, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<6, detail::specialization_variant>::has_large_trees>,
-               detail::preset_decision_forest<
-                 std::variant_alternative_t<7, detail::specialization_variant>::layout,
-                 std::variant_alternative_t<7, detail::specialization_variant>::is_double_precision,
-                 std::variant_alternative_t<7, detail::specialization_variant>::has_large_trees>>;
+using decision_forest_variant = std::variant<  // alternative i is built from alternative i of detail::specialization_variant (0-11)
+  detail::preset_decision_forest<
+    std::variant_alternative_t<0, detail::specialization_variant>::layout,
+    std::variant_alternative_t<0, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<0, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<1, detail::specialization_variant>::layout,
+    std::variant_alternative_t<1, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<1, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<2, detail::specialization_variant>::layout,
+    std::variant_alternative_t<2, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<2, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<3, detail::specialization_variant>::layout,
+    std::variant_alternative_t<3, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<3, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<4, detail::specialization_variant>::layout,
+    std::variant_alternative_t<4, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<4, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<5, detail::specialization_variant>::layout,
+    std::variant_alternative_t<5, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<5, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<6, detail::specialization_variant>::layout,
+    std::variant_alternative_t<6, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<6, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<7, detail::specialization_variant>::layout,
+    std::variant_alternative_t<7, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<7, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<8, detail::specialization_variant>::layout,
+    std::variant_alternative_t<8, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<8, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<9, detail::specialization_variant>::layout,
+    std::variant_alternative_t<9, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<9, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<10, detail::specialization_variant>::layout,
+    std::variant_alternative_t<10, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<10, detail::specialization_variant>::has_large_trees>,
+  detail::preset_decision_forest<
+    std::variant_alternative_t<11, detail::specialization_variant>::layout,
+    std::variant_alternative_t<11, detail::specialization_variant>::is_double_precision,
+    std::variant_alternative_t<11, detail::specialization_variant>::has_large_trees>>;
 
 /**
  * Determine the variant index of the decision_forest type to used based on
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -58,32 +58,13 @@ struct decision_forest_builder {
   /* The type for nodes in the given decision_forest type */
   using node_type = typename decision_forest_t::node_type;
 
-  /* Add a root node, indicating the beginning of a new tree */
-  void start_new_tree()
-  {
-    if (root_node_indexes_.empty()) {
-      root_node_indexes_.emplace_back();
-    } else {
-      max_tree_size_ = std::max(cur_tree_size_, max_tree_size_);
-      if (alignment_ != index_type{}) {
-        if (cur_tree_size_ % alignment_ != index_type{}) {
-          auto padding = (alignment_ - cur_tree_size_ % alignment_);
-          for (auto i = index_type{}; i < padding; ++i) {
-            add_node(typename node_type::threshold_type{}, std::nullopt);
-          }
-        }
-      }
-      root_node_indexes_.push_back(root_node_indexes_.back() + cur_tree_size_);
-      cur_tree_size_ = index_type{};
-    }
-  }
-
   /* Add a node with a categorical split */
   template <typename iter_t>
   void add_categorical_node(
     iter_t vec_begin,
     iter_t vec_end,
     std::optional<int> tl_node_id                     = std::nullopt,
+    std::size_t depth                                 = std::size_t{1},
     bool default_to_distant_child                     = false,
     typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
     typename node_type::offset_type offset            = typename node_type::offset_type{})
@@ -103,46 +84,62 @@ struct decision_forest_builder {
     auto set = bitset{set_storage, max_node_categories};
     std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); });
 
-    add_node(node_value, tl_node_id, false, default_to_distant_child, true, feature, offset, false);
+    add_node(
+      node_value, tl_node_id, depth, false, default_to_distant_child, true, feature, offset, false);
   }
 
   /* Add a leaf node with vector output */
   template <typename iter_t>
   void add_leaf_vector_node(iter_t vec_begin,
                             iter_t vec_end,
-                            std::optional<int> tl_node_id = std::nullopt)
+                            std::optional<int> tl_node_id = std::nullopt,
+                            std::size_t depth             = std::size_t{1})  // forwarded unchanged to add_node
   {
     auto leaf_index = typename node_type::index_type(vector_output_.size() / output_size_);
     std::copy(vec_begin, vec_end, std::back_inserter(vector_output_));
-    nodes_.emplace_back(leaf_index,
-                        true,
-                        false,
-                        false,
-                        typename node_type::metadata_storage_type{},
-                        typename node_type::offset_type{});
-    // 0 indicates the lack of ID mapping for a particular node
-    node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
-    ++cur_tree_size_;
+
+    add_node(leaf_index,  // val: vector_output_.size() / output_size_ taken before this leaf's values were appended
+             tl_node_id,
+             depth,
+             true,   // is_leaf_node
+             false,  // default_to_distant_child
+             false,  // is_categorical_node
+             typename node_type::metadata_storage_type{},  // feature
+             typename node_type::offset_type{},            // offset
+             false);  // is_inclusive
   }
 
   /* Add a node to the model */
   template <typename value_t>
   void add_node(
     value_t val,
     std::optional<int> tl_node_id                     = std::nullopt,
+    std::size_t depth                                 = std::size_t{1},  // 0 marks the root of a new tree
     bool is_leaf_node                                 = true,
     bool default_to_distant_child                     = false,
     bool is_categorical_node                          = false,
     typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
     typename node_type::offset_type offset            = typename node_type::offset_type{},
     bool is_inclusive                                 = false)
   {
+    if (depth == std::size_t{}) {  // this node starts a new tree
+      if (alignment_ != index_type{}) {  // a zero alignment_ disables padding
+        if (cur_node_index_ % alignment_ != index_type{}) {
+          auto padding = (alignment_ - cur_node_index_ % alignment_);  // nodes needed to reach the next multiple of alignment_
+          for (auto i = index_type{}; i < padding; ++i) {
+            add_node(typename node_type::threshold_type{}, std::nullopt);  // filler node; its default depth of 1 skips this branch
+          }
+        }
+      }
+      root_node_indexes_.push_back(cur_node_index_);  // the node counter after padding is recorded as this tree's root index
+    }
+
     if (is_inclusive) { val = std::nextafter(val, std::numeric_limits<value_t>::infinity()); }
     nodes_.emplace_back(
       val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset);
     // 0 indicates the lack of ID mapping for a particular node
     node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
-    ++cur_tree_size_;
+    ++cur_node_index_;  // incremented once for every node appended by this function
   }
 
   /* Set the element-wise postprocessing operation for this model */
@@ -167,16 +164,15 @@ struct decision_forest_builder {
 
   decision_forest_builder(index_type max_num_categories = index_type{},
                           index_type align_bytes        = index_type{})
-    : cur_tree_size_{},
+    : cur_node_index_{},
       max_num_categories_{max_num_categories},
       alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))},
       output_size_{1},
+      row_postproc_{},
       element_postproc_{},
       average_factor_{},
-      row_postproc_{},
       bias_{},
       postproc_constant_{},
-      max_tree_size_{},
       nodes_{},
       root_node_indexes_{},
       vector_output_{}
@@ -233,7 +229,7 @@ struct decision_forest_builder {
   }
 
  private:
-  index_type cur_tree_size_;
+  index_type cur_node_index_;
   index_type max_num_categories_;
   index_type alignment_;
   index_type output_size_;
@@ -242,7 +238,6 @@ struct decision_forest_builder {
   double average_factor_;
   double bias_;
   double postproc_constant_;
-  index_type max_tree_size_;
 
   std::vector<node_type> nodes_;
   std::vector<index_type> root_node_indexes_;
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -242,6 +242,10 @@ CUML_FIL_INITIALIZE_DEVICE(extern template, 4)
 CUML_FIL_INITIALIZE_DEVICE(extern template, 5)
 CUML_FIL_INITIALIZE_DEVICE(extern template, 6)
 CUML_FIL_INITIALIZE_DEVICE(extern template, 7)
+CUML_FIL_INITIALIZE_DEVICE(extern template, 8)
+CUML_FIL_INITIALIZE_DEVICE(extern template, 9)
+CUML_FIL_INITIALIZE_DEVICE(extern template, 10)
+CUML_FIL_INITIALIZE_DEVICE(extern template, 11)
 
 }  // namespace device_initialization
 }  // namespace detail
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -147,6 +147,10 @@ CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
+CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 8)
+CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 9)
+CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 10)
+CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 11)
 
 }  // namespace inference
 }  // namespace detail