@@ -508,16 +508,16 @@ struct HistogramKernel {
     // Find feature groups
     // This is a bin packing problem
     // We want to pack as many features as possible into a group
-    std::vector<int> split_proposal_row_pointers(split_proposals.num_features + 1);
+    std::vector<int> split_proposal_row_pointers(split_proposals.row_pointers.size());
     CHECK_CUDA(cudaMemcpyAsync(split_proposal_row_pointers.data(),
-                               split_proposals.row_pointers.ptr(0),
-                               (split_proposals.num_features + 1) * sizeof(int),
+                               split_proposals.row_pointers.data(),
+                               split_proposals.row_pointers.size() * sizeof(int),
                                cudaMemcpyDeviceToHost,
                                stream));
     CHECK_CUDA(cudaStreamSynchronize(stream));
     std::vector<int> feature_groups({0});
     int current_bins_in_group = 0;
-    for (int i = 0; i < split_proposals.num_features; i++) {
+    for (int i = 0; i < split_proposals.NumFeatures(); i++) {
       int const bins_in_feature =
         split_proposal_row_pointers[i + 1] - split_proposal_row_pointers[i];
       EXPECT(bins_in_feature <= kMaxSharedBins, "Too many bins in a feature");
@@ -527,9 +527,10 @@ struct HistogramKernel {
       }
       current_bins_in_group += bins_in_feature;
     }
-    feature_groups.push_back(split_proposals.num_features);
+    feature_groups.push_back(split_proposals.NumFeatures());
     num_groups = narrow<int>(feature_groups.size() - 1);
-    EXPECT(num_groups * kMaxSharedBins >= split_proposals.histogram_size, "Too few feature groups");
+    EXPECT(num_groups * kMaxSharedBins >= split_proposals.HistogramSize(),
+           "Too few feature groups");
     this->feature_groups = legate::create_buffer<int>(num_groups + 1);
     CHECK_CUDA(cudaMemcpyAsync(this->feature_groups.ptr(0),
                                feature_groups.data(),
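// A hypothetical standalone sketch of the greedy grouping in the two hunks above: walk the
// host copy of the CSR-style row pointers and start a new group whenever the next feature
// would overflow the shared-memory bin budget. The diff elides the middle of the loop, so
// the "close the current group" branch is a guess at that logic; kMaxSharedBins is assumed
// here and the helper name is made up, not the library's API.
#include <cstddef>
#include <vector>

constexpr int kMaxSharedBins = 2048;  // assumed shared-memory bin budget

// row_pointers has NumFeatures() + 1 entries; feature i owns bins [row_pointers[i], row_pointers[i + 1]).
std::vector<int> BuildFeatureGroups(const std::vector<int>& row_pointers)
{
  std::vector<int> feature_groups({0});
  int current_bins_in_group = 0;
  const int num_features    = static_cast<int>(row_pointers.size()) - 1;
  for (int i = 0; i < num_features; i++) {
    int const bins_in_feature = row_pointers[i + 1] - row_pointers[i];
    // Guessed branch: close the current group when this feature would not fit alongside
    // the bins already accumulated in it.
    if (current_bins_in_group + bins_in_feature > kMaxSharedBins) {
      feature_groups.push_back(i);
      current_bins_in_group = 0;
    }
    current_bins_in_group += bins_in_feature;
  }
  // Sentinel entry: group g covers features [feature_groups[g], feature_groups[g + 1]).
  feature_groups.push_back(num_features);
  return feature_groups;
}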
@@ -551,9 +552,9 @@ struct HistogramKernel {
                       int64_t seed,
                       cudaStream_t stream)
   {
-    if (batch.InstancesInBatch() == 0) return;
+    if (batch.InstancesInBatch() == 0) { return; }
 
-    int const average_features_per_group = split_proposals.num_features / num_groups;
+    int const average_features_per_group = split_proposals.NumFeatures() / num_groups;
     std::size_t const average_elements_per_group =
       batch.InstancesInBatch() * average_features_per_group;
     auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
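// Hedged back-of-envelope for the launch sizing above, with made-up numbers: a batch of
// 1'000'000 rows, 64 features packed into 8 groups, and kItemsPerTile == 8192 (assumed,
// not taken from the source).
#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t instances_in_batch = 1'000'000;
  const int num_features               = 64;
  const int num_groups                 = 8;
  const std::size_t kItemsPerTile      = 8192;                        // assumed tile size
  const int average_features_per_group = num_features / num_groups;   // 8
  const std::size_t average_elements_per_group =
    instances_in_batch * average_features_per_group;                  // 8'000'000
  // Ceiling division: enough tiles to cover the average group.
  const std::size_t min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
  std::printf("min_blocks = %zu\n", min_blocks);  // 977 with these numbers
  return 0;
}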
@@ -709,7 +710,7 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
   double thread_best_gain = 0;
   int thread_best_bin_idx = -1;
 
-  for (int bin_idx = narrow_cast<int>(threadIdx.x); bin_idx < split_proposals.histogram_size;
+  for (int bin_idx = narrow_cast<int>(threadIdx.x); bin_idx < split_proposals.HistogramSize();
        bin_idx += BLOCK_THREADS) {
     // Check if this feature is in the feature set
     if (optional_feature_set.has_value() &&
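// Hypothetical sketch of the block-strided scan pattern in the hunk above: each thread walks
// the flat bin range with a stride of BLOCK_THREADS, keeps its personal best, then a CUB block
// reduction with ArgMax picks the block-wide winner. The precomputed gains array stands in for
// the real per-bin gain computation; this is not the library's kernel.
#include <cub/cub.cuh>

template <int BLOCK_THREADS>
__global__ void __launch_bounds__(BLOCK_THREADS)
  BestBinKernel(const double* gains, int histogram_size, int* best_bin_out)
{
  using ArgMaxT     = cub::KeyValuePair<int, double>;
  using BlockReduce = cub::BlockReduce<ArgMaxT, BLOCK_THREADS>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  ArgMaxT thread_best{-1, 0.0};  // (bin index, gain); gain 0 means "no improving split found"
  for (int bin_idx = static_cast<int>(threadIdx.x); bin_idx < histogram_size;
       bin_idx += BLOCK_THREADS) {
    double const gain = gains[bin_idx];
    if (gain > thread_best.value) { thread_best = {bin_idx, gain}; }
  }

  ArgMaxT const block_best = BlockReduce(temp_storage).Reduce(thread_best, cub::ArgMax());
  if (threadIdx.x == 0) { *best_bin_out = block_best.key; }  // only thread 0 holds the result
}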
@@ -952,10 +953,11 @@ auto SelectSplitSamples(legate::TaskContext context,
   row_samples.destroy();
   draft_proposals.destroy();
   out_keys.destroy();
-  return SparseSplitProposals<T>(split_proposals, row_pointers, num_features, n_unique);
+  return SparseSplitProposals<T>({split_proposals.ptr(0), narrow<std::size_t>(n_unique)},
+                                 {row_pointers.ptr(0), narrow<std::size_t>(num_features + 1)});
 }
 
-// Can't put a device l1_regularization in constructor so make this a function
+// Can't put a lambda in constructor so make this a function
 void FillPositions(const legate::Buffer<cuda::std::tuple<int32_t, int32_t>>& sorted_positions,
                    std::size_t num_rows,
                    cudaStream_t stream)
@@ -993,8 +995,9 @@ struct TreeBuilder {
     // User a fixed reasonable upper bound on memory usage
     // CAUTION: all workers MUST have the same max_batch_size
     // Therefore we don't try to calculate this based on available memory
-    const std::size_t max_bytes      = 1000000000;  // 1 GB
-    const std::size_t bytes_per_node = num_outputs * split_proposals.histogram_size * sizeof(GPair);
+    const std::size_t max_bytes = 1000000000;  // 1 GB
+    const std::size_t bytes_per_node =
+      num_outputs * split_proposals.HistogramSize() * sizeof(GPair);
     const std::size_t max_histogram_nodes = std::max(1UL, max_bytes / bytes_per_node);
     int depth = 0;
     while (BinaryTree::LevelEnd(depth + 1) <= max_histogram_nodes && depth <= max_depth) {
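// Rough arithmetic for the 1 GB histogram cap above, with assumed sizes (not from the source):
// a single output, 64 features * 255 bins, and a 16-byte integer gradient pair per bin.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t max_bytes      = 1000000000;  // 1 GB, as in the diff
  const std::size_t num_outputs    = 1;           // assumed
  const std::size_t histogram_size = 64 * 255;    // assumed HistogramSize(): total bins over all features
  const std::size_t sizeof_gpair   = 16;          // assumed sizeof(GPair)
  const std::size_t bytes_per_node = num_outputs * histogram_size * sizeof_gpair;  // 261'120 bytes
  const std::size_t max_histogram_nodes = std::max<std::size_t>(1, max_bytes / bytes_per_node);
  // ~3829 nodes fit, so whole tree levels are cached while each level still fits in that count.
  std::printf("bytes_per_node = %zu, max_histogram_nodes = %zu\n", bytes_per_node, max_histogram_nodes);
  return 0;
}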
@@ -1003,7 +1006,7 @@ struct TreeBuilder {
     cached_histogram = Histogram<IntegerGPair>(BinaryTree::LevelBegin(0),
                                                BinaryTree::LevelEnd(depth),
                                                num_outputs,
-                                               split_proposals.histogram_size,
+                                               split_proposals.HistogramSize(),
                                                stream);
     max_batch_size = max_histogram_nodes;
   }
@@ -1100,7 +1103,7 @@ struct TreeBuilder {
       context,
       // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
       tcb::span<ReduceT>(reinterpret_cast<ReduceT*>(histogram.Ptr(batch.node_idx_begin)),
                          batch.NodesInBatch() * num_outputs * split_proposals.histogram_size * 2),
+                         batch.NodesInBatch() * num_outputs * split_proposals.HistogramSize() * 2),
       stream);
 
     const int kScanBlockThreads = 256;
@@ -1189,7 +1192,7 @@ struct TreeBuilder {
     cached_histogram = Histogram<IntegerGPair>(batch.node_idx_begin,
                                                batch.node_idx_end,
                                                num_outputs,
-                                               split_proposals.histogram_size,
+                                               split_proposals.HistogramSize(),
                                                stream);
     return cached_histogram;
   }