namespace msm {

namespace {
- __global__ void
- precompute_points_kernel(const bn254::affine_t* points, int shift, int prec_factor, int count, bn254::affine_t* points_out, bool is_montgomery)
+ __global__ void precompute_points_kernel(
+   const bn254::affine_t* points,
+   int shift,
+   int prec_factor,
+   int count,
+   bn254::affine_t* points_out,
+   bool is_montgomery)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= count) return;
@@ -192,7 +197,8 @@ namespace msm {
unsigned point_ind = point_indices[bucket_offset + i];
bn254::affine_t point = points[point_ind];
bucket = i || !init_buckets ? (point == bn254::affine_t::zero() ? bucket : bucket + point)
-   : (point == bn254::affine_t::zero() ? bn254::projective_t::zero() : bn254::projective_t::from_affine(point));
+   : (point == bn254::affine_t::zero() ? bn254::projective_t::zero()
+     : bn254::projective_t::from_affine(point));
}
buckets[bucket_index] = bucket;
}
@@ -224,8 +230,9 @@ namespace msm {
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
bn254::affine_t point = points[point_ind];
- bucket =
-   i ? (point == bn254::affine_t::zero() ? bucket : bucket + point) : (point == bn254::affine_t::zero() ? bn254::projective_t::zero() : bn254::projective_t::from_affine(point));
+ bucket = i ? (point == bn254::affine_t::zero() ? bucket : bucket + point)
+   : (point == bn254::affine_t::zero() ? bn254::projective_t::zero()
+     : bn254::projective_t::from_affine(point));
}
buckets[tid] = run_length ? bucket : bn254::projective_t::zero();
}
@@ -252,7 +259,8 @@ namespace msm {

// this kernel sums the entire bucket module
// each thread deals with a single bucket module
- __global__ void big_triangle_sum_kernel(const bn254::projective_t* buckets, bn254::projective_t* final_sums, unsigned nof_bms, unsigned c)
+ __global__ void big_triangle_sum_kernel(
+   const bn254::projective_t* buckets, bn254::projective_t* final_sums, unsigned nof_bms, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_bms) return;
@@ -293,7 +301,11 @@ namespace msm {
// this kernel computes the final result using the double and add algorithm
// it is done by a single thread
__global__ void final_accumulation_kernel(
-   const bn254::projective_t* final_sums, bn254::projective_t* final_results, unsigned nof_msms, unsigned nof_results, unsigned c)
+   const bn254::projective_t* final_sums,
+   bn254::projective_t* final_results,
+   unsigned nof_msms,
+   unsigned nof_results,
+   unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_msms) return;
@@ -310,7 +322,7 @@ namespace msm {
final_results[tid] = final_result + final_sums[tid * nof_results];
}

- template <typename E>
+ template <typename E>
static cudaError_t from_montgomery_on_device(const E* d_input, int n, cudaStream_t stream, E* d_output)
{
auto config = default_vec_ops_config();
@@ -321,7 +333,6 @@ namespace msm {
return montgomery::ConvertMontgomery<E, false>(d_input, n, config, d_output);
}

-
static cudaError_t split_and_sort_scalars(
  cudaStream_t stream,
  unsigned nof_scalars,
@@ -420,8 +431,8 @@ namespace msm {
if (!are_scalars_on_device) {
// copy scalars to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_scalars, sizeof(bn254::scalar_t) * nof_scalars, stream));
- CHK_IF_RETURN(
-   cudaMemcpyAsync(d_allocated_scalars, scalars, sizeof(bn254::scalar_t) * nof_scalars, cudaMemcpyHostToDevice, stream));
+ CHK_IF_RETURN(cudaMemcpyAsync(
+   d_allocated_scalars, scalars, sizeof(bn254::scalar_t) * nof_scalars, cudaMemcpyHostToDevice, stream));

if (are_scalars_montgomery_form) {
CHK_IF_RETURN(from_montgomery_on_device(d_allocated_scalars, nof_scalars, stream, d_allocated_scalars));
@@ -456,8 +467,8 @@ namespace msm {
if (!are_points_on_device) {
// copy points to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_points, sizeof(bn254::affine_t) * nof_points, stream_points));
- CHK_IF_RETURN(
-   cudaMemcpyAsync(d_allocated_points, points, sizeof(bn254::affine_t) * nof_points, cudaMemcpyHostToDevice, stream_points));
+ CHK_IF_RETURN(cudaMemcpyAsync(
+   d_allocated_points, points, sizeof(bn254::affine_t) * nof_points, cudaMemcpyHostToDevice, stream_points));

if (are_points_montgomery_form) {
CHK_IF_RETURN(from_montgomery_on_device(d_allocated_points, nof_points, stream_points, d_allocated_points));
@@ -496,7 +507,8 @@ namespace msm {
unsigned* d_bucket_offsets)
{
if (init_buckets) {
- CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(bn254::projective_t) * (total_nof_buckets + nof_bms_in_batch), stream));
+ CHK_IF_RETURN(
+   cudaMallocAsync(&buckets, sizeof(bn254::projective_t) * (total_nof_buckets + nof_bms_in_batch), stream));

// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
@@ -642,7 +654,8 @@ namespace msm {
large_bucket_indices);

bn254::projective_t* large_buckets;
- CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(bn254::projective_t) * large_buckets_nof_threads, stream_large_buckets));
+ CHK_IF_RETURN(
+   cudaMallocAsync(&large_buckets, sizeof(bn254::projective_t) * large_buckets_nof_threads, stream_large_buckets));

NUM_THREADS = max(1, min(1 << 8, large_buckets_nof_threads));
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
@@ -779,7 +792,8 @@ namespace msm {
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up
target_windows_count = source_windows_count << 1; // twice the number of bms
const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c
- CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(bn254::projective_t) * target_buckets_count * batch_size, stream));
+ CHK_IF_RETURN(
+   cudaMallocAsync(&target_buckets, sizeof(bn254::projective_t) * target_buckets_count * batch_size, stream));
CHK_IF_RETURN(cudaMallocAsync(
  &temp_buckets1, sizeof(bn254::projective_t) * source_buckets_count * batch_size,
  stream)); // for type1 reduction (strided, bottom window - evens)
@@ -832,7 +846,8 @@ namespace msm {
nof_bms_per_msm = target_windows_count;
unsigned total_nof_final_results = nof_final_results_per_msm * batch_size;

- CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(bn254::projective_t) * total_nof_final_results, stream));
+ CHK_IF_RETURN(
+   cudaMallocAsync(&final_results, sizeof(bn254::projective_t) * total_nof_final_results, stream));

unsigned NUM_THREADS = 32;
unsigned NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS;
@@ -1008,7 +1023,8 @@ namespace msm {
1008
1023
1009
1024
if (!are_results_on_device)
1010
1025
CHK_IF_RETURN (cudaMemcpyAsync (
1011
- final_result, d_allocated_final_result, sizeof (bn254::projective_t ) * batch_size, cudaMemcpyDeviceToHost, stream));
1026
+ final_result, d_allocated_final_result, sizeof (bn254::projective_t ) * batch_size, cudaMemcpyDeviceToHost,
1027
+ stream));
1012
1028
1013
1029
// (7) cleaunp - free memory and release streams+events (possibly async)
1014
1030
if (d_allocated_scalars) CHK_IF_RETURN (cudaFreeAsync (d_allocated_scalars, stream));
@@ -1062,7 +1078,8 @@ namespace msm {
indices_mem = 7 * sizeof(unsigned) * msm_size * batch_size *
  nof_bms; // factor 7 as an estimation for the sorting extra memory. can be reduced by
  // sorting separately or changing sort algorithm
- points_mem = sizeof(bn254::affine_t) * msm_size * config.precompute_factor * (config.are_points_shared_in_batch ? 1 : batch_size);
+ points_mem = sizeof(bn254::affine_t) * msm_size * config.precompute_factor *
+   (config.are_points_shared_in_batch ? 1 : batch_size);
buckets_mem = 4 * sizeof(bn254::projective_t) * (1 << c) * batch_size *
  nof_bms_after_precomputation; // factor 3 for the extra memory in the iterative reduction algorithm.
  // +1 for large buckets. can be reduced with some optimizations.
@@ -1126,8 +1143,9 @@ namespace msm {
fixed_c = floor(std::log2(
  static_cast<double>(reduced_gpu_memory) /
  static_cast<double>(
-   3 * sizeof(bn254::projective_t) * nof_bms_after_precomputation))); // nof_bms_after_precomputation is a function of c so
-   // there is no analytical solution, hence the while loop
+   3 * sizeof(bn254::projective_t) *
+     nof_bms_after_precomputation))); // nof_bms_after_precomputation is a function of c so
+     // there is no analytical solution, hence the while loop
compute_required_memory(
  config, msm_size, fixed_c, 1, bitsize, nof_bms_after_precomputation, scalars_mem, indices_mem, points_mem,
  buckets_mem, reduced_gpu_memory);
@@ -1308,7 +1326,12 @@ namespace msm {
return CHK_LAST();
}

- cudaError_t msm_cuda(const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::projective_t* results)
+ cudaError_t msm_cuda(
+   const bn254::scalar_t* scalars,
+   const bn254::affine_t* points,
+   int msm_size,
+   const MSMConfig& config,
+   bn254::projective_t* results)
{
const int bitsize = (config.bitsize == 0) ? bn254::scalar_t::NBITS : config.bitsize;
cudaStream_t stream = reinterpret_cast<cudaStream_t>(config.stream);
@@ -1371,7 +1394,8 @@ namespace msm {
bn254::affine_t* points_d;
if (!are_points_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(bn254::affine_t) * points_size, stream));
- CHK_IF_RETURN(cudaMemcpyAsync(points_d, points, sizeof(bn254::affine_t) * points_size, cudaMemcpyHostToDevice, stream));
+ CHK_IF_RETURN(
+   cudaMemcpyAsync(points_d, points, sizeof(bn254::affine_t) * points_size, cudaMemcpyHostToDevice, stream));
}

unsigned total_nof_bms = (bn254::projective_t::SCALAR_FF_NBITS - 1) / c + 1;
@@ -1406,16 +1430,17 @@ namespace msm {
points_precomputed_d = points_precomputed;
} else {
CHK_IF_RETURN(cudaMallocAsync(&points_d, sizeof(bn254::affine_t) * chunk_size * 2, stream));
- CHK_IF_RETURN(
-   cudaMallocAsync(&points_precomputed_d, sizeof(bn254::affine_t) * chunk_size * 2 * config.precompute_factor, stream));
+ CHK_IF_RETURN(cudaMallocAsync(
+   &points_precomputed_d, sizeof(bn254::affine_t) * chunk_size * 2 * config.precompute_factor, stream));
points_precomputed_h = points_precomputed;
}
for (int i = 0; i < nof_chunks; i++) {
int sub_msm_size = min(points_size - chunk_size * i, chunk_size);
if (sub_msm_size <= 0) break;
if (!config.are_points_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(
-   points_d + (i % 2) * chunk_size, points + i * chunk_size, sizeof(bn254::affine_t) * sub_msm_size, cudaMemcpyHostToDevice,
+   points_d + (i % 2) * chunk_size, points + i * chunk_size, sizeof(bn254::affine_t) * sub_msm_size,
+   cudaMemcpyHostToDevice,
  stream)); // points are on host
CHK_IF_RETURN((precompute_msm_points_chunk(
  points_d + (i % 2) * chunk_size, sub_msm_size, config.precompute_factor, c, true,
@@ -1454,14 +1479,14 @@ namespace msm {
return nof_chunks;
}

- static cudaError_t
- cuda_precompute_msm_points(const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::affine_t* output_points)
+ static cudaError_t cuda_precompute_msm_points(
+   const bn254::affine_t* points, int msm_size, const MSMConfig& config, bn254::affine_t* output_points)
{
unsigned c = (config.c == 0) ? min(get_optimal_c(msm_size), MAX_C_FOR_PRECOMPUTATION)
  : config.c; // limit precomputation c so we won't run into bucket memory overflow in
  // msm (TODO - find better solution)
- int nof_chunks = get_precomputation_nof_chunks(
-   config, msm_size, config.are_points_shared_in_batch ? 1 : config.batch_size);
+ int nof_chunks =
+   get_precomputation_nof_chunks(config, msm_size, config.are_points_shared_in_batch ? 1 : config.batch_size);
if (nof_chunks) {
return CHK_STICKY((chunked_precompute(
  points, msm_size, c, config.are_points_shared_in_batch ? 1 : config.batch_size, config, output_points,
@@ -1481,14 +1506,23 @@ namespace msm {
}

static eIcicleError msm_cuda_wrapper(
-   const Device& device, const bn254::scalar_t* scalars, const bn254::affine_t* bases, int msm_size, const MSMConfig& config, bn254::projective_t* results)
+   const Device& device,
+   const bn254::scalar_t* scalars,
+   const bn254::affine_t* bases,
+   int msm_size,
+   const MSMConfig& config,
+   bn254::projective_t* results)
{
auto err = msm_cuda(scalars, bases, msm_size, config, results);
return translateCudaError(err);
}

static eIcicleError msm_precompute_bases_cuda_wrapper(
-   const Device& device, const bn254::affine_t* input_bases, int nof_bases, const MSMConfig& config, bn254::affine_t* output_bases)
+   const Device& device,
+   const bn254::affine_t* input_bases,
+   int nof_bases,
+   const MSMConfig& config,
+   bn254::affine_t* output_bases)
{
auto err = cuda_precompute_msm_points(input_bases, nof_bases, config, output_bases);
return translateCudaError(err);
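Note on the launch idiom that recurs in these hunks: every kernel launch sizes its grid with a ceiling division (NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS) and every kernel guards itself with an early return (if (tid >= n) return;). Below is a minimal, self-contained CUDA sketch of that pattern for reference only; the scale_kernel, run_scale, and CHECK_CUDA names are illustrative stand-ins and not ICICLE code (the real CHK_IF_RETURN macro is defined elsewhere in the repository and is not shown in this diff).

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative stand-in for an error-propagating check such as CHK_IF_RETURN (assumption, not the ICICLE macro).
#define CHECK_CUDA(call)                  \
  do {                                    \
    cudaError_t err_ = (call);            \
    if (err_ != cudaSuccess) return err_; \
  } while (0)

// Toy kernel: one thread per element, guarded by the same early-return bounds check used in the diff.
__global__ void scale_kernel(const int* in, int* out, int count, int factor)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= count) return;
  out[tid] = in[tid] * factor;
}

static cudaError_t run_scale(const int* h_in, int* h_out, int count, cudaStream_t stream)
{
  int* d_in = nullptr;
  int* d_out = nullptr;
  CHECK_CUDA(cudaMallocAsync(&d_in, sizeof(int) * count, stream));
  CHECK_CUDA(cudaMallocAsync(&d_out, sizeof(int) * count, stream));
  CHECK_CUDA(cudaMemcpyAsync(d_in, h_in, sizeof(int) * count, cudaMemcpyHostToDevice, stream));

  // Ceil-divide grid sizing, as in the diff: enough blocks to cover `count` threads.
  unsigned NUM_THREADS = 1 << 8;
  unsigned NUM_BLOCKS = (count + NUM_THREADS - 1) / NUM_THREADS;
  scale_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_in, d_out, count, 3);
  CHECK_CUDA(cudaGetLastError());

  CHECK_CUDA(cudaMemcpyAsync(h_out, d_out, sizeof(int) * count, cudaMemcpyDeviceToHost, stream));
  CHECK_CUDA(cudaFreeAsync(d_in, stream));
  CHECK_CUDA(cudaFreeAsync(d_out, stream));
  return cudaStreamSynchronize(stream);
}

int main()
{
  int in[1000], out[1000];
  for (int i = 0; i < 1000; i++) in[i] = i;
  cudaError_t err = run_scale(in, out, 1000, /*stream=*/0);
  printf("status=%s out[999]=%d\n", cudaGetErrorString(err), out[999]);
  return err == cudaSuccess ? 0 : 1;
}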