Skip to content

Commit 436f401

Browse files
authored
Add FRI folding for M31 (#618)
This PR adds FRI folding for M31 field.
1 parent 98dd414 commit 436f401

File tree

6 files changed

+593
-0
lines changed

6 files changed

+593
-0
lines changed

icicle/include/fri/fri.cuh

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#pragma once
#ifndef FRI_H
#define FRI_H

#include <cstdint> // uint64_t used in the declarations below
#include <cuda_runtime.h>

#include "gpu-utils/device_context.cuh"

namespace fri {

  /**
   * @struct FriConfig
   * Runtime configuration for the FRI folding routines: where the buffers live
   * and which device context (stream) to run on.
   */
  struct FriConfig {
    device_context::DeviceContext ctx;  // Device context; its stream is used for all GPU work.
    bool are_evals_on_device;           // True if the input evaluations already reside in device memory.
    bool are_domain_elements_on_device; // True if the domain elements already reside in device memory.
    bool are_results_on_device;         // True if the folded results should stay in device memory.
    bool is_async; // NOTE(review): not consulted by the current implementations (sync is keyed off stream == 0) — confirm intended semantics.
  };

  /**
   * @brief Folds a layer's evaluation into a degree d/2 evaluation using the provided folding factor alpha.
   *
   * @param eval Pointer to the array of evaluations in the current FRI layer.
   * @param domain_xs Pointer to a subset of line domain values (n/2 elements).
   * @param alpha The folding factor used in the FRI protocol.
   * @param folded_eval Pointer to the array where the folded evaluations will be stored.
   * @param n The number of evaluations in the original layer (before folding).
   * @param cfg Configuration controlling buffer placement and the stream to run on.
   *
   * @tparam S The scalar field type used for domain_xs.
   * @tparam E The evaluation type (also the type of alpha), typically the same as the field element type.
   *
   * @note The size of the output array 'folded_eval' should be half of 'n', as folding reduces the number of
   * evaluations by half.
   *
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename S, typename E>
  cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

  /**
   * @brief Folds a layer of FRI evaluations from a circle into a line.
   *
   * This function performs the folding operation in the FRI (Fast Reed-Solomon IOP of Proximity) protocol,
   * specifically for evaluations on a circle domain. It takes a layer of evaluations on a circle and folds
   * them into a line using the provided folding factor alpha.
   *
   * @param eval Pointer to the array of evaluations in the current FRI layer, representing points on a circle.
   * @param domain_ys Pointer to the array of y-coordinates of the circle points in the domain of the circle
   * that eval represents (n/2 elements).
   * @param alpha The folding factor used in the FRI protocol.
   * @param folded_eval Pointer to the array where the folded evaluations (now on a line) will be stored.
   * @param n The number of evaluations in the original layer (before folding).
   * @param cfg Configuration controlling buffer placement and the stream to run on.
   *
   * @tparam S The scalar field type used for domain_ys.
   * @tparam E The evaluation type (also the type of alpha), typically the same as the field element type.
   *
   * @note The size of the output array 'folded_eval' should be half of 'n', as folding reduces the number of
   * evaluations by half.
   * @note This function is specifically designed for folding evaluations from a circular domain to a linear domain.
   *
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename S, typename E>
  cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

} // namespace fri

#endif

icicle/src/fields/CMakeLists.txt

+6
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ endif ()
44

55
SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31)
66
SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31)
7+
SET(SUPPORTED_FIELDS_WITH_FRI m31)
78

89
set(TARGET icicle_field)
910

@@ -42,6 +43,11 @@ if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
4243
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT
4344
endif()
4445

46+
if (FIELD IN_LIST SUPPORTED_FIELDS_WITH_FRI)
47+
list(APPEND FIELD_SOURCE ${SRC}/fri/extern.cu)
48+
list(APPEND FIELD_SOURCE ${SRC}/fri/fri.cu)
49+
endif()
50+
4551
add_library(${TARGET} STATIC ${FIELD_SOURCE})
4652
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
4753
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")

icicle/src/fri/extern.cu

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#include "fields/field_config.cuh"
2+
using namespace field_config;
3+
4+
#include "fri.cu"
5+
#include "utils/utils.h"
6+
7+
namespace fri {
  /**
   * Extern "C" version of [fold_line](@ref fold_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param line_eval Pointer to the array of evaluations on the line
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg The FRI configuration; carries the device context — if its stream is not 0,
   *            everything is run async
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
    extension_t* line_eval,
    scalar_t* domain_elements,
    extension_t alpha,
    extension_t* folded_evals,
    uint64_t n,
    FriConfig& cfg)
  {
    return fri::fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
  } // no trailing ';' — function definitions don't take one

  /**
   * Extern "C" version of [fold_circle_into_line](@ref fold_circle_into_line) function with the following values of
   * template parameters (where the field is given by `-DFIELD` env variable during build):
   * - `E` is the extension field type used for evaluations and alpha
   * - `S` is the scalar field type used for domain elements
   * @param circle_evals Pointer to the array of evaluations on the circle
   * @param domain_elements Pointer to the array of domain elements
   * @param alpha The folding factor
   * @param folded_line_evals Pointer to the array where folded evaluations will be stored
   * @param n The number of evaluations
   * @param cfg The FRI configuration; carries the device context — if its stream is not 0,
   *            everything is run async
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
    extension_t* circle_evals,
    scalar_t* domain_elements,
    extension_t alpha,
    extension_t* folded_line_evals,
    uint64_t n,
    FriConfig& cfg)
  {
    return fri::fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
  }
} // namespace fri

icicle/src/fri/fri.cu

+154
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#include <cuda_runtime.h>
2+
3+
#include "fri/fri.cuh"
4+
5+
#include "fields/field.cuh"
6+
#include "gpu-utils/error_handler.cuh"
7+
#include "gpu-utils/device_context.cuh"
8+
9+
namespace fri {
10+
11+
namespace {
  // In-place inverse butterfly: (v0, v1) <- (v0 + v1, (v0 - v1) * itwid).
  template <typename S, typename E>
  __device__ void ibutterfly(E& v0, E& v1, const S& itwid)
  {
    E tmp = v0;
    v0 = tmp + v1;
    v1 = (tmp - v1) * itwid;
  }

  // Folds pairs of line evaluations: thread `idx` folds (eval[2*idx], eval[2*idx+1])
  // into folded_eval[idx] using domain_xs[idx]. The launcher sizes the grid for n/2 threads.
  //
  // BUG FIX: the previous guard `idx % 2 == 0 && idx < n`, combined with an n/2-thread
  // launch, only processed even idx below n/2 — so outputs [n/4, n/2) were never written.
  // One-thread-per-output-pair indexing covers all n/2 outputs with the same launch.
  template <typename S, typename E>
  __global__ void fold_line_kernel(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n)
  {
    uint64_t idx = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x; // 64-bit to avoid overflow for large n
    if (idx < n / 2) {
      E f_x = eval[2 * idx];         // even point
      E f_x_neg = eval[2 * idx + 1]; // odd point
      S x_domain = domain_xs[idx];
      ibutterfly(f_x, f_x_neg, S::inverse(x_domain));
      folded_eval[idx] = f_x + alpha * f_x_neg;
    }
  }

  // Same pairing as fold_line_kernel, but accumulates into the destination:
  //   folded_eval[idx] = folded_eval[idx] * alpha^2 + (f0' + alpha * f1')
  // so folded_eval must hold meaningful data on entry.
  template <typename S, typename E>
  __global__ void fold_circle_into_line_kernel(E* eval, S* domain_ys, E alpha, E alpha_sq, E* folded_eval, uint64_t n)
  {
    uint64_t idx = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < n / 2) {
      E f0_px = eval[2 * idx];
      E f1_px = eval[2 * idx + 1];
      ibutterfly(f0_px, f1_px, S::inverse(domain_ys[idx]));
      E f_prime = f0_px + alpha * f1_px;
      folded_eval[idx] = folded_eval[idx] * alpha_sq + f_prime;
    }
  }
} // namespace
48+
49+
// Host launcher for fold_line_kernel: stages inputs on device as needed, folds n
// evaluations into n/2, and stages the result back if requested.
// See fri.cuh for the full parameter contract.
template <typename S, typename E>
cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
{
  CHK_INIT_IF_RETURN();

  cudaStream_t stream = cfg.ctx.stream;
  uint64_t half_n = n / 2; // folding halves the evaluation count

  // Allocate and move line domain evals to device if necessary
  E* d_eval;
  if (!cfg.are_evals_on_device) {
    auto data_size = sizeof(E) * n;
    CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
    CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
  } else {
    d_eval = eval;
  }

  // Allocate and move domain's elements to device if necessary
  S* d_domain_xs;
  if (!cfg.are_domain_elements_on_device) {
    auto data_size = sizeof(S) * half_n;
    CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream));
    CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream));
  } else {
    d_domain_xs = domain_xs;
  }

  // Allocate folded_eval if pointer is not a device pointer
  E* d_folded_eval;
  if (!cfg.are_results_on_device) {
    CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * half_n, stream));
  } else {
    d_folded_eval = folded_eval;
  }

  // Guard against a zero-block launch (invalid configuration) when n < 2.
  if (half_n > 0) {
    uint64_t num_threads = 256;
    uint64_t num_blocks = (half_n + num_threads - 1) / num_threads; // one thread per output element
    fold_line_kernel<<<num_blocks, num_threads, 0, stream>>>(d_eval, d_domain_xs, alpha, d_folded_eval, n);
  }

  // Move folded_eval back to host if requested
  if (!cfg.are_results_on_device) {
    CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * half_n, cudaMemcpyDeviceToHost, stream));
    CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
  }
  if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream)); }
  if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }

  // Sync if stream is default stream
  // NOTE(review): cfg.is_async is never consulted here — confirm whether non-default
  // streams are always meant to be fire-and-forget.
  if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));

  return CHK_LAST();
}
100+
101+
// Host launcher for fold_circle_into_line_kernel: stages inputs on device as needed,
// folds n circle evaluations into n/2 line evaluations, ACCUMULATING into folded_eval
// (folded = folded * alpha^2 + f'), and stages the result back if requested.
// See fri.cuh for the full parameter contract.
template <typename S, typename E>
cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
{
  CHK_INIT_IF_RETURN();

  cudaStream_t stream = cfg.ctx.stream;
  uint64_t half_n = n / 2; // folding halves the evaluation count

  // Allocate and move circle domain evals to device if necessary
  E* d_eval;
  if (!cfg.are_evals_on_device) {
    auto data_size = sizeof(E) * n;
    CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
    CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
  } else {
    d_eval = eval;
  }

  // Allocate and move domain's elements to device if necessary
  S* d_domain_ys;
  if (!cfg.are_domain_elements_on_device) {
    auto data_size = sizeof(S) * half_n;
    CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream));
    CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream));
  } else {
    d_domain_ys = domain_ys;
  }

  // Allocate folded_evals if pointer is not a device pointer
  E* d_folded_eval;
  if (!cfg.are_results_on_device) {
    CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * half_n, stream));
    // BUG FIX: the kernel accumulates into folded_eval (reads its prior contents),
    // so the caller's current values must be staged on device. Previously the
    // freshly-allocated buffer was read uninitialized.
    CHK_IF_RETURN(cudaMemcpyAsync(d_folded_eval, folded_eval, sizeof(E) * half_n, cudaMemcpyHostToDevice, stream));
  } else {
    d_folded_eval = folded_eval;
  }

  E alpha_sq = alpha * alpha;
  // Guard against a zero-block launch (invalid configuration) when n < 2.
  if (half_n > 0) {
    uint64_t num_threads = 256;
    uint64_t num_blocks = (half_n + num_threads - 1) / num_threads; // one thread per output element
    fold_circle_into_line_kernel<<<num_blocks, num_threads, 0, stream>>>(
      d_eval, d_domain_ys, alpha, alpha_sq, d_folded_eval, n);
  }

  // Move folded_evals back to host if requested
  if (!cfg.are_results_on_device) {
    CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * half_n, cudaMemcpyDeviceToHost, stream));
    CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
  }
  if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream)); }
  if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }

  // Sync if stream is default stream
  // NOTE(review): cfg.is_async is never consulted here — confirm whether non-default
  // streams are always meant to be fire-and-forget.
  if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));

  return CHK_LAST();
}
154+
} // namespace fri

0 commit comments

Comments
 (0)