Skip to content

Commit 8921091

Browse files
committed
fmt
1 parent ee0aee5 commit 8921091

File tree

3 files changed

+147
-73
lines changed

3 files changed

+147
-73
lines changed

icicle/backend/cuda/src/msm/cuda_msm.cu

+66-32
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,13 @@
33
namespace msm {
44

55
namespace {
6-
__global__ void
7-
precompute_points_kernel(const bn254::affine_t* points, int shift, int prec_factor, int count, bn254::affine_t* points_out, bool is_montgomery)
6+
__global__ void precompute_points_kernel(
7+
const bn254::affine_t* points,
8+
int shift,
9+
int prec_factor,
10+
int count,
11+
bn254::affine_t* points_out,
12+
bool is_montgomery)
813
{
914
int tid = blockIdx.x * blockDim.x + threadIdx.x;
1015
if (tid >= count) return;
@@ -192,7 +197,8 @@ namespace msm {
192197
unsigned point_ind = point_indices[bucket_offset + i];
193198
bn254::affine_t point = points[point_ind];
194199
bucket = i || !init_buckets ? (point == bn254::affine_t::zero() ? bucket : bucket + point)
195-
: (point == bn254::affine_t::zero() ? bn254::projective_t::zero() : bn254::projective_t::from_affine(point));
200+
: (point == bn254::affine_t::zero() ? bn254::projective_t::zero()
201+
: bn254::projective_t::from_affine(point));
196202
}
197203
buckets[bucket_index] = bucket;
198204
}
@@ -224,8 +230,9 @@ namespace msm {
224230
i++) { // add the relevant points starting from the relevant offset up to the bucket size
225231
unsigned point_ind = point_indices[bucket_offset + i];
226232
bn254::affine_t point = points[point_ind];
227-
bucket =
228-
i ? (point == bn254::affine_t::zero() ? bucket : bucket + point) : (point == bn254::affine_t::zero() ? bn254::projective_t::zero() : bn254::projective_t::from_affine(point));
233+
bucket = i ? (point == bn254::affine_t::zero() ? bucket : bucket + point)
234+
: (point == bn254::affine_t::zero() ? bn254::projective_t::zero()
235+
: bn254::projective_t::from_affine(point));
229236
}
230237
buckets[tid] = run_length ? bucket : bn254::projective_t::zero();
231238
}
@@ -252,7 +259,8 @@ namespace msm {
252259

253260
// this kernel sums the entire bucket module
254261
// each thread deals with a single bucket module
255-
__global__ void big_triangle_sum_kernel(const bn254::projective_t* buckets, bn254::projective_t* final_sums, unsigned nof_bms, unsigned c)
262+
__global__ void big_triangle_sum_kernel(
263+
const bn254::projective_t* buckets, bn254::projective_t* final_sums, unsigned nof_bms, unsigned c)
256264
{
257265
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
258266
if (tid >= nof_bms) return;
@@ -293,7 +301,11 @@ namespace msm {
293301
// this kernel computes the final result using the double and add algorithm
294302
// it is done by a single thread
295303
__global__ void final_accumulation_kernel(
296-
const bn254::projective_t* final_sums, bn254::projective_t* final_results, unsigned nof_msms, unsigned nof_results, unsigned c)
304+
const bn254::projective_t* final_sums,
305+
bn254::projective_t* final_results,
306+
unsigned nof_msms,
307+
unsigned nof_results,
308+
unsigned c)
297309
{
298310
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
299311
if (tid >= nof_msms) return;
@@ -310,7 +322,7 @@ namespace msm {
310322
final_results[tid] = final_result + final_sums[tid * nof_results];
311323
}
312324

313-
template<typename E>
325+
template <typename E>
314326
static cudaError_t from_montgomery_on_device(const E* d_input, int n, cudaStream_t stream, E* d_output)
315327
{
316328
auto config = default_vec_ops_config();
@@ -321,7 +333,6 @@ namespace msm {
321333
return montgomery::ConvertMontgomery<E, false>(d_input, n, config, d_output);
322334
}
323335

324-
325336
static cudaError_t split_and_sort_scalars(
326337
cudaStream_t stream,
327338
unsigned nof_scalars,
@@ -420,8 +431,8 @@ namespace msm {
420431
if (!are_scalars_on_device) {
421432
// copy scalars to gpu
422433
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_scalars, sizeof(bn254::scalar_t) * nof_scalars, stream));
423-
CHK_IF_RETURN(
424-
cudaMemcpyAsync(d_allocated_scalars, scalars, sizeof(bn254::scalar_t) * nof_scalars, cudaMemcpyHostToDevice, stream));
434+
CHK_IF_RETURN(cudaMemcpyAsync(
435+
d_allocated_scalars, scalars, sizeof(bn254::scalar_t) * nof_scalars, cudaMemcpyHostToDevice, stream));
425436

426437
if (are_scalars_montgomery_form) {
427438
CHK_IF_RETURN(from_montgomery_on_device(d_allocated_scalars, nof_scalars, stream, d_allocated_scalars));
@@ -456,8 +467,8 @@ namespace msm {
456467
if (!are_points_on_device) {
457468
// copy points to gpu
458469
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_points, sizeof(bn254::affine_t) * nof_points, stream_points));
459-
CHK_IF_RETURN(
460-
cudaMemcpyAsync(d_allocated_points, points, sizeof(bn254::affine_t) * nof_points, cudaMemcpyHostToDevice, stream_points));
470+
CHK_IF_RETURN(cudaMemcpyAsync(
471+
d_allocated_points, points, sizeof(bn254::affine_t) * nof_points, cudaMemcpyHostToDevice, stream_points));
461472

462473
if (are_points_montgomery_form) {
463474
CHK_IF_RETURN(from_montgomery_on_device(d_allocated_points, nof_points, stream_points, d_allocated_points));
@@ -496,7 +507,8 @@ namespace msm {
496507
unsigned* d_bucket_offsets)
497508
{
498509
if (init_buckets) {
499-
CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(bn254::projective_t) * (total_nof_buckets + nof_bms_in_batch), stream));
510+
CHK_IF_RETURN(
511+
cudaMallocAsync(&buckets, sizeof(bn254::projective_t) * (total_nof_buckets + nof_bms_in_batch), stream));
500512

501513
// launch the bucket initialization kernel with maximum threads
502514
unsigned NUM_THREADS = 1 << 10;
@@ -642,7 +654,8 @@ namespace msm {
642654
large_bucket_indices);
643655

644656
bn254::projective_t* large_buckets;
645-
CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(bn254::projective_t) * large_buckets_nof_threads, stream_large_buckets));
657+
CHK_IF_RETURN(
658+
cudaMallocAsync(&large_buckets, sizeof(bn254::projective_t) * large_buckets_nof_threads, stream_large_buckets));
646659

647660
NUM_THREADS = max(1, min(1 << 8, large_buckets_nof_threads));
648661
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
@@ -779,7 +792,8 @@ namespace msm {
779792
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up
780793
target_windows_count = source_windows_count << 1; // twice the number of bms
781794
const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c
782-
CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(bn254::projective_t) * target_buckets_count * batch_size, stream));
795+
CHK_IF_RETURN(
796+
cudaMallocAsync(&target_buckets, sizeof(bn254::projective_t) * target_buckets_count * batch_size, stream));
783797
CHK_IF_RETURN(cudaMallocAsync(
784798
&temp_buckets1, sizeof(bn254::projective_t) * source_buckets_count * batch_size,
785799
stream)); // for type1 reduction (strided, bottom window - evens)
@@ -832,7 +846,8 @@ namespace msm {
832846
nof_bms_per_msm = target_windows_count;
833847
unsigned total_nof_final_results = nof_final_results_per_msm * batch_size;
834848

835-
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(bn254::projective_t) * total_nof_final_results, stream));
849+
CHK_IF_RETURN(
850+
cudaMallocAsync(&final_results, sizeof(bn254::projective_t) * total_nof_final_results, stream));
836851

837852
unsigned NUM_THREADS = 32;
838853
unsigned NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS;
@@ -1008,7 +1023,8 @@ namespace msm {
10081023

10091024
if (!are_results_on_device)
10101025
CHK_IF_RETURN(cudaMemcpyAsync(
1011-
final_result, d_allocated_final_result, sizeof(bn254::projective_t) * batch_size, cudaMemcpyDeviceToHost, stream));
1026+
final_result, d_allocated_final_result, sizeof(bn254::projective_t) * batch_size, cudaMemcpyDeviceToHost,
1027+
stream));
10121028

10131029
// (7) cleaunp - free memory and release streams+events (possibly async)
10141030
if (d_allocated_scalars) CHK_IF_RETURN(cudaFreeAsync(d_allocated_scalars, stream));
@@ -1062,7 +1078,8 @@ namespace msm {
10621078
indices_mem = 7 * sizeof(unsigned) * msm_size * batch_size *
10631079
nof_bms; // factor 7 as an estimation for the sorting extra memory. can be reduced by
10641080
// sorting separately or changing sort algorithm
1065-
points_mem = sizeof(bn254::affine_t) * msm_size * config.precompute_factor * (config.are_points_shared_in_batch ? 1 : batch_size);
1081+
points_mem = sizeof(bn254::affine_t) * msm_size * config.precompute_factor *
1082+
(config.are_points_shared_in_batch ? 1 : batch_size);
10661083
buckets_mem = 4 * sizeof(bn254::projective_t) * (1 << c) * batch_size *
10671084
nof_bms_after_precomputation; // factor 3 for the extra memory in the iterative reduction algorithm.
10681085
// +1 for large buckets. can be reduced with some optimizations.
@@ -1126,8 +1143,9 @@ namespace msm {
11261143
fixed_c = floor(std::log2(
11271144
static_cast<double>(reduced_gpu_memory) /
11281145
static_cast<double>(
1129-
3 * sizeof(bn254::projective_t) * nof_bms_after_precomputation))); // nof_bms_after_precomputation is a function of c so
1130-
// there is no analytical solution, hence the while loop
1146+
3 * sizeof(bn254::projective_t) *
1147+
nof_bms_after_precomputation))); // nof_bms_after_precomputation is a function of c so
1148+
// there is no analytical solution, hence the while loop
11311149
compute_required_memory(
11321150
config, msm_size, fixed_c, 1, bitsize, nof_bms_after_precomputation, scalars_mem, indices_mem, points_mem,
11331151
buckets_mem, reduced_gpu_memory);
@@ -1308,7 +1326,12 @@ namespace msm {
13081326
return CHK_LAST();
13091327
}
13101328

1311-
cudaError_t msm_cuda(const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::projective_t* results)
1329+
cudaError_t msm_cuda(
1330+
const bn254::scalar_t* scalars,
1331+
const bn254::affine_t* points,
1332+
int msm_size,
1333+
const MSMConfig& config,
1334+
bn254::projective_t* results)
13121335
{
13131336
const int bitsize = (config.bitsize == 0) ? bn254::scalar_t::NBITS : config.bitsize;
13141337
cudaStream_t stream = reinterpret_cast<cudaStream_t>(config.stream);
@@ -1371,7 +1394,8 @@ namespace msm {
13711394
bn254::affine_t* points_d;
13721395
if (!are_points_on_device) {
13731396
CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(bn254::affine_t) * points_size, stream));
1374-
CHK_IF_RETURN(cudaMemcpyAsync(points_d, points, sizeof(bn254::affine_t) * points_size, cudaMemcpyHostToDevice, stream));
1397+
CHK_IF_RETURN(
1398+
cudaMemcpyAsync(points_d, points, sizeof(bn254::affine_t) * points_size, cudaMemcpyHostToDevice, stream));
13751399
}
13761400

13771401
unsigned total_nof_bms = (bn254::projective_t::SCALAR_FF_NBITS - 1) / c + 1;
@@ -1406,16 +1430,17 @@ namespace msm {
14061430
points_precomputed_d = points_precomputed;
14071431
} else {
14081432
CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(bn254::affine_t) * chunk_size * 2, stream));
1409-
CHK_IF_RETURN(
1410-
cudaMallocAsync(&points_precomputed_d, sizeof(bn254::affine_t) * chunk_size * 2 * config.precompute_factor, stream));
1433+
CHK_IF_RETURN(cudaMallocAsync(
1434+
&points_precomputed_d, sizeof(bn254::affine_t) * chunk_size * 2 * config.precompute_factor, stream));
14111435
points_precomputed_h = points_precomputed;
14121436
}
14131437
for (int i = 0; i < nof_chunks; i++) {
14141438
int sub_msm_size = min(points_size - chunk_size * i, chunk_size);
14151439
if (sub_msm_size <= 0) break;
14161440
if (!config.are_points_on_device) {
14171441
CHK_IF_RETURN(cudaMemcpyAsync(
1418-
points_d + (i % 2) * chunk_size, points + i * chunk_size, sizeof(bn254::affine_t) * sub_msm_size, cudaMemcpyHostToDevice,
1442+
points_d + (i % 2) * chunk_size, points + i * chunk_size, sizeof(bn254::affine_t) * sub_msm_size,
1443+
cudaMemcpyHostToDevice,
14191444
stream)); // points are on host
14201445
CHK_IF_RETURN((precompute_msm_points_chunk(
14211446
points_d + (i % 2) * chunk_size, sub_msm_size, config.precompute_factor, c, true,
@@ -1454,14 +1479,14 @@ namespace msm {
14541479
return nof_chunks;
14551480
}
14561481

1457-
static cudaError_t
1458-
cuda_precompute_msm_points(const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::affine_t* output_points)
1482+
static cudaError_t cuda_precompute_msm_points(
1483+
const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::affine_t* output_points)
14591484
{
14601485
unsigned c = (config.c == 0) ? min(get_optimal_c(msm_size), MAX_C_FOR_PRECOMPUTATION)
14611486
: config.c; // limit precomputation c so we won't run into bucket memory overflow in
14621487
// msm (TODO - find better solution)
1463-
int nof_chunks = get_precomputation_nof_chunks(
1464-
config, msm_size, config.are_points_shared_in_batch ? 1 : config.batch_size);
1488+
int nof_chunks =
1489+
get_precomputation_nof_chunks(config, msm_size, config.are_points_shared_in_batch ? 1 : config.batch_size);
14651490
if (nof_chunks) {
14661491
return CHK_STICKY((chunked_precompute(
14671492
points, msm_size, c, config.are_points_shared_in_batch ? 1 : config.batch_size, config, output_points,
@@ -1481,14 +1506,23 @@ namespace msm {
14811506
}
14821507

14831508
static eIcicleError msm_cuda_wrapper(
1484-
const Device& device, const bn254::scalar_t* scalars, const bn254::affine_t* bases, int msm_size, const MSMConfig& config, bn254::projective_t* results)
1509+
const Device& device,
1510+
const bn254::scalar_t* scalars,
1511+
const bn254::affine_t* bases,
1512+
int msm_size,
1513+
const MSMConfig& config,
1514+
bn254::projective_t* results)
14851515
{
14861516
auto err = msm_cuda(scalars, bases, msm_size, config, results);
14871517
return translateCudaError(err);
14881518
}
14891519

14901520
static eIcicleError msm_precompute_bases_cuda_wrapper(
1491-
const Device& device, const bn254::affine_t* input_bases, int nof_bases, const MSMConfig& config, bn254::affine_t* output_bases)
1521+
const Device& device,
1522+
const bn254::affine_t* input_bases,
1523+
int nof_bases,
1524+
const MSMConfig& config,
1525+
bn254::affine_t* output_bases)
14921526
{
14931527
auto err = cuda_precompute_msm_points(input_bases, nof_bases, config, output_bases);
14941528
return translateCudaError(err);

icicle/backend/cuda/src/msm/cuda_msm.cuh

+10-5
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,13 @@
2828
#define MAX_C_FOR_PRECOMPUTATION 22
2929

3030
namespace msm {
31-
static cudaError_t
32-
cuda_precompute_msm_points(const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::affine_t* output_points);
33-
34-
cudaError_t msm_cuda(const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::projective_t* results);
35-
}
31+
static cudaError_t cuda_precompute_msm_points(
32+
const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::affine_t* output_points);
33+
34+
cudaError_t msm_cuda(
35+
const bn254::scalar_t* scalars,
36+
const bn254::affine_t* points,
37+
int msm_size,
38+
const MSMConfig& config,
39+
bn254::projective_t* results);
40+
} // namespace msm

0 commit comments

Comments
 (0)