@@ -508,16 +508,16 @@ struct HistogramKernel {
     // Find feature groups
     // This is a bin packing problem
     // We want to pack as many features as possible into a group
-    std::vector<int> split_proposal_row_pointers(split_proposals.num_features + 1);
+    std::vector<int> split_proposal_row_pointers(split_proposals.row_pointers.size());
     CHECK_CUDA(cudaMemcpyAsync(split_proposal_row_pointers.data(),
-                               split_proposals.row_pointers.ptr(0),
-                               (split_proposals.num_features + 1) * sizeof(int),
+                               split_proposals.row_pointers.data(),
+                               split_proposals.row_pointers.size() * sizeof(int),
                                cudaMemcpyDeviceToHost,
                                stream));
     CHECK_CUDA(cudaStreamSynchronize(stream));
     std::vector<int> feature_groups({0});
     int current_bins_in_group = 0;
-    for (int i = 0; i < split_proposals.num_features; i++) {
+    for (int i = 0; i < split_proposals.NumFeatures(); i++) {
       int const bins_in_feature =
         split_proposal_row_pointers[i + 1] - split_proposal_row_pointers[i];
       EXPECT(bins_in_feature <= kMaxSharedBins, "Too many bins in a feature");
@@ -527,9 +527,10 @@ struct HistogramKernel {
       }
       current_bins_in_group += bins_in_feature;
     }
-    feature_groups.push_back(split_proposals.num_features);
+    feature_groups.push_back(split_proposals.NumFeatures());
     num_groups = narrow<int>(feature_groups.size() - 1);
-    EXPECT(num_groups * kMaxSharedBins >= split_proposals.histogram_size, "Too few feature groups");
+    EXPECT(num_groups * kMaxSharedBins >= split_proposals.HistogramSize(),
+           "Too few feature groups");
     this->feature_groups = legate::create_buffer<int>(num_groups + 1);
     CHECK_CUDA(cudaMemcpyAsync(this->feature_groups.ptr(0),
                                feature_groups.data(),
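// A hypothetical standalone sketch of the greedy grouping in the two hunks above: walk the
// host copy of the CSR-style row pointers and start a new group whenever the next feature
// would overflow the shared-memory bin budget. The diff elides the middle of the loop, so
// the "close the current group" branch is a guess at that logic; kMaxSharedBins is assumed
// here and the helper name is made up, not the library's API.
#include <cstddef>
#include <vector>

constexpr int kMaxSharedBins = 2048;  // assumed shared-memory bin budget

// row_pointers has NumFeatures() + 1 entries; feature i owns bins [row_pointers[i], row_pointers[i + 1]).
std::vector<int> BuildFeatureGroups(const std::vector<int>& row_pointers)
{
  std::vector<int> feature_groups({0});
  int current_bins_in_group = 0;
  const int num_features    = static_cast<int>(row_pointers.size()) - 1;
  for (int i = 0; i < num_features; i++) {
    int const bins_in_feature = row_pointers[i + 1] - row_pointers[i];
    // Guessed branch: close the current group when this feature would not fit alongside
    // the bins already accumulated in it.
    if (current_bins_in_group + bins_in_feature > kMaxSharedBins) {
      feature_groups.push_back(i);
      current_bins_in_group = 0;
    }
    current_bins_in_group += bins_in_feature;
  }
  // Sentinel entry: group g covers features [feature_groups[g], feature_groups[g + 1]).
  feature_groups.push_back(num_features);
  return feature_groups;
}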
@@ -551,9 +552,9 @@ struct HistogramKernel {
                       int64_t seed,
                       cudaStream_t stream)
   {
-    if (batch.InstancesInBatch() == 0) return;
+    if (batch.InstancesInBatch() == 0) { return; }
 
-    int const average_features_per_group = split_proposals.num_features / num_groups;
+    int const average_features_per_group = split_proposals.NumFeatures() / num_groups;
     std::size_t const average_elements_per_group =
       batch.InstancesInBatch() * average_features_per_group;
     auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
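// Hedged back-of-envelope for the launch sizing above, with made-up numbers: a batch of
// 1'000'000 rows, 64 features packed into 8 groups, and kItemsPerTile == 8192 (assumed,
// not taken from the source).
#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t instances_in_batch = 1'000'000;
  const int num_features               = 64;
  const int num_groups                 = 8;
  const std::size_t kItemsPerTile      = 8192;                        // assumed tile size
  const int average_features_per_group = num_features / num_groups;   // 8
  const std::size_t average_elements_per_group =
    instances_in_batch * average_features_per_group;                  // 8'000'000
  // Ceiling division: enough tiles to cover the average group.
  const std::size_t min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
  std::printf("min_blocks = %zu\n", min_blocks);  // 977 with these numbers
  return 0;
}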
@@ -709,7 +710,7 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
   double thread_best_gain = 0;
   int thread_best_bin_idx = -1;
 
-  for (int bin_idx = narrow_cast<int>(threadIdx.x); bin_idx < split_proposals.histogram_size;
+  for (int bin_idx = narrow_cast<int>(threadIdx.x); bin_idx < split_proposals.HistogramSize();
        bin_idx += BLOCK_THREADS) {
     // Check if this feature is in the feature set
     if (optional_feature_set.has_value() &&
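// Hypothetical sketch of the block-strided scan pattern in the hunk above: each thread walks
// the flat bin range with a stride of BLOCK_THREADS, keeps its personal best, then a CUB block
// reduction with ArgMax picks the block-wide winner. The precomputed gains array stands in for
// the real per-bin gain computation; this is not the library's kernel.
#include <cub/cub.cuh>

template <int BLOCK_THREADS>
__global__ void __launch_bounds__(BLOCK_THREADS)
  BestBinKernel(const double* gains, int histogram_size, int* best_bin_out)
{
  using ArgMaxT     = cub::KeyValuePair<int, double>;
  using BlockReduce = cub::BlockReduce<ArgMaxT, BLOCK_THREADS>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  ArgMaxT thread_best{-1, 0.0};  // (bin index, gain); gain 0 means "no improving split found"
  for (int bin_idx = static_cast<int>(threadIdx.x); bin_idx < histogram_size;
       bin_idx += BLOCK_THREADS) {
    double const gain = gains[bin_idx];
    if (gain > thread_best.value) { thread_best = {bin_idx, gain}; }
  }

  ArgMaxT const block_best = BlockReduce(temp_storage).Reduce(thread_best, cub::ArgMax());
  if (threadIdx.x == 0) { *best_bin_out = block_best.key; }  // only thread 0 holds the result
}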
@@ -952,10 +953,11 @@ auto SelectSplitSamples(legate::TaskContext context,
   row_samples.destroy();
   draft_proposals.destroy();
   out_keys.destroy();
-  return SparseSplitProposals<T>(split_proposals, row_pointers, num_features, n_unique);
+  return SparseSplitProposals<T>({split_proposals.ptr(0), narrow<std::size_t>(n_unique)},
+                                 {row_pointers.ptr(0), narrow<std::size_t>(num_features + 1)});
 }
 
-// Can't put a device l1_regularization in constructor so make this a function
+// Can't put a lambda in constructor so make this a function
 void FillPositions(const legate::Buffer<cuda::std::tuple<int32_t, int32_t>>& sorted_positions,
                    std::size_t num_rows,
                    cudaStream_t stream)
@@ -993,8 +995,9 @@ struct TreeBuilder {
     // User a fixed reasonable upper bound on memory usage
     // CAUTION: all workers MUST have the same max_batch_size
     // Therefore we don't try to calculate this based on available memory
-    const std::size_t max_bytes      = 1000000000;  // 1 GB
-    const std::size_t bytes_per_node = num_outputs * split_proposals.histogram_size * sizeof(GPair);
+    const std::size_t max_bytes = 1000000000;  // 1 GB
+    const std::size_t bytes_per_node =
+      num_outputs * split_proposals.HistogramSize() * sizeof(GPair);
     const std::size_t max_histogram_nodes = std::max(1UL, max_bytes / bytes_per_node);
     int depth = 0;
     while (BinaryTree::LevelEnd(depth + 1) <= max_histogram_nodes && depth <= max_depth) {
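// Rough arithmetic for the 1 GB histogram cap above, with assumed sizes (not from the source):
// a single output, 64 features * 255 bins, and a 16-byte integer gradient pair per bin.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t max_bytes      = 1000000000;  // 1 GB, as in the diff
  const std::size_t num_outputs    = 1;           // assumed
  const std::size_t histogram_size = 64 * 255;    // assumed HistogramSize(): total bins over all features
  const std::size_t sizeof_gpair   = 16;          // assumed sizeof(GPair)
  const std::size_t bytes_per_node = num_outputs * histogram_size * sizeof_gpair;  // 261'120 bytes
  const std::size_t max_histogram_nodes = std::max<std::size_t>(1, max_bytes / bytes_per_node);
  // ~3829 nodes fit, so whole tree levels are cached while each level still fits in that count.
  std::printf("bytes_per_node = %zu, max_histogram_nodes = %zu\n", bytes_per_node, max_histogram_nodes);
  return 0;
}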
@@ -1003,7 +1006,7 @@ struct TreeBuilder {
     cached_histogram = Histogram<IntegerGPair>(BinaryTree::LevelBegin(0),
                                                BinaryTree::LevelEnd(depth),
                                                num_outputs,
-                                               split_proposals.histogram_size,
+                                               split_proposals.HistogramSize(),
                                                stream);
     max_batch_size = max_histogram_nodes;
   }
@@ -1100,7 +1103,7 @@ struct TreeBuilder {
       context,
       // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
       tcb::span<ReduceT>(reinterpret_cast<ReduceT*>(histogram.Ptr(batch.node_idx_begin)),
                          batch.NodesInBatch() * num_outputs * split_proposals.histogram_size * 2),
+                         batch.NodesInBatch() * num_outputs * split_proposals.HistogramSize() * 2),
       stream);
 
     const int kScanBlockThreads = 256;
@@ -1189,7 +1192,7 @@ struct TreeBuilder {
     cached_histogram = Histogram<IntegerGPair>(batch.node_idx_begin,
                                                batch.node_idx_end,
                                                num_outputs,
-                                               split_proposals.histogram_size,
+                                               split_proposals.HistogramSize(),
                                                stream);
     return cached_histogram;
   }