Add FRI folding for M31 (#618)

jeremyfelder · web-flow · commit 436f40178b07 · 2024-09-29T13:48:51.000+03:00
This PR adds FRI folding for M31 field.
diff --git a/icicle/include/fri/fri.cuh b/icicle/include/fri/fri.cuh
@@ -0,0 +1,64 @@
+#pragma once
+#ifndef FRI_H
+#define FRI_H
+
+#include <cuda_runtime.h>
+
+#include "gpu-utils/device_context.cuh"
+
+namespace fri {
+
+  struct FriConfig {
+    device_context::DeviceContext ctx;
+    bool are_evals_on_device;
+    bool are_domain_elements_on_device;
+    bool are_results_on_device;
+    bool is_async;
+  };
+
+  /**
+   * @brief Folds a layer's evaluation into a degree d/2 evaluation using the provided folding factor alpha.
+   *
+   * @param evals Pointer to the array of evaluation in the current FRI layer.
+   * @param domain_xs Pointer to a subset of line domain values.
+   * @param alpha The folding factor used in the FRI protocol.
+   * @param folded_evals Pointer to the array where the folded evaluations will be stored.
+   * @param n The number of evaluations in the original layer (before folding).
+   *
+   * @tparam S The scalar field type used for domain_xs.
+   * @tparam E The evaluation type, typically the same as the field element type.
+   *
+   * @note The size of the output array 'folded_evals' should be half of 'n', as folding reduces the number of
+   * evaluations by half.
+   */
+  template <typename S, typename E>
+  cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);
+
+  /**
+   * @brief Folds a layer of FRI evaluations from a circle into a line.
+   *
+   * This function performs the folding operation in the FRI (Fast Reed-Solomon IOP of Proximity) protocol,
+   * specifically for evaluations on a circle domain. It takes a layer of evaluations on a circle and folds
+   * them into a line using the provided folding factor alpha.
+   *
+   * @param evals Pointer to the array of evaluations in the current FRI layer, representing points on a circle.
+   * @param domain_ys Pointer to the array of y-coordinates of the circle points in the domain of the circle that evals
+   * represents.
+   * @param alpha The folding factor used in the FRI protocol.
+   * @param folded_evals Pointer to the array where the folded evaluations (now on a line) will be stored.
+   * @param n The number of evaluations in the original layer (before folding).
+   *
+   * @tparam S The scalar field type used for alpha and domain_ys.
+   * @tparam E The evaluation type, typically the same as the field element type.
+   *
+   * @note The size of the output array 'folded_evals' should be half of 'n', as folding reduces the number of
+   * evaluations by half.
+   * @note This function is specifically designed for folding evaluations from a circular domain to a linear domain.
+   */
+
+  template <typename S, typename E>
+  cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);
+
+} // namespace fri
+
+#endif
diff --git a/icicle/src/fields/CMakeLists.txt b/icicle/src/fields/CMakeLists.txt
@@ -4,6 +4,7 @@ endif ()
 
 SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31)
 SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31)
+SET(SUPPORTED_FIELDS_WITH_FRI m31)
 
 set(TARGET icicle_field)
 
@@ -42,6 +43,11 @@ if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
   list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT  
 endif()
 
+if (FIELD IN_LIST SUPPORTED_FIELDS_WITH_FRI)
+  list(APPEND FIELD_SOURCE ${SRC}/fri/extern.cu)
+  list(APPEND FIELD_SOURCE ${SRC}/fri/fri.cu)
+endif()
+
 add_library(${TARGET} STATIC ${FIELD_SOURCE})
 target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
diff --git a/icicle/src/fri/extern.cu b/icicle/src/fri/extern.cu
@@ -0,0 +1,55 @@
+#include "fields/field_config.cuh"
+using namespace field_config;
+
+#include "fri.cu"
+#include "utils/utils.h"
+
+namespace fri {
+  /**
+   * Extern "C" version of [fold_line](@ref fold_line) function with the following values of
+   * template parameters (where the field is given by `-DFIELD` env variable during build):
+   *  - `E` is the extension field type used for evaluations and alpha
+   *  - `S` is the scalar field type used for domain elements
+   * @param line_eval Pointer to the array of evaluations on the line
+   * @param domain_elements Pointer to the array of domain elements
+   * @param alpha The folding factor
+   * @param folded_evals Pointer to the array where folded evaluations will be stored
+   * @param n The number of evaluations
+   * @param ctx The device context; if the stream is not 0, then everything is run async
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
+    extension_t* line_eval,
+    scalar_t* domain_elements,
+    extension_t alpha,
+    extension_t* folded_evals,
+    uint64_t n,
+    FriConfig& cfg)
+  {
+    return fri::fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
+  };
+
+  /**
+   * Extern "C" version of [fold_circle_into_line](@ref fold_circle_into_line) function with the following values of
+   * template parameters (where the field is given by `-DFIELD` env variable during build):
+   *  - `E` is the extension field type used for evaluations and alpha
+   *  - `S` is the scalar field type used for domain elements
+   * @param circle_evals Pointer to the array of evaluations on the circle
+   * @param domain_elements Pointer to the array of domain elements
+   * @param alpha The folding factor
+   * @param folded_line_evals Pointer to the array where folded evaluations will be stored
+   * @param n The number of evaluations
+   * @param ctx The device context; if the stream is not 0, then everything is run async
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
+    extension_t* circle_evals,
+    scalar_t* domain_elements,
+    extension_t alpha,
+    extension_t* folded_line_evals,
+    uint64_t n,
+    FriConfig& cfg)
+  {
+    return fri::fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
+  };
+} // namespace fri
diff --git a/icicle/src/fri/fri.cu b/icicle/src/fri/fri.cu
@@ -0,0 +1,154 @@
+#include <cuda_runtime.h>
+
+#include "fri/fri.cuh"
+
+#include "fields/field.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/device_context.cuh"
+
+namespace fri {
+
+  namespace {
+    template <typename S, typename E>
+    __device__ void ibutterfly(E& v0, E& v1, const S& itwid)
+    {
+      E tmp = v0;
+      v0 = tmp + v1;
+      v1 = (tmp - v1) * itwid;
+    }
+
+    template <typename S, typename E>
+    __global__ void fold_line_kernel(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n)
+    {
+      unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+      if (idx % 2 == 0 && idx < n) {
+        E f_x = eval[idx];         // even
+        E f_x_neg = eval[idx + 1]; // odd
+        S x_domain = domain_xs[idx / 2];
+        ibutterfly(f_x, f_x_neg, S::inverse(x_domain));
+        auto folded_eval_idx = idx / 2;
+        folded_eval[folded_eval_idx] = f_x + alpha * f_x_neg;
+      }
+    }
+
+    template <typename S, typename E>
+    __global__ void fold_circle_into_line_kernel(E* eval, S* domain_ys, E alpha, E alpha_sq, E* folded_eval, uint64_t n)
+    {
+      unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+      if (idx % 2 == 0 && idx < n) {
+        E f0_px = eval[idx];
+        E f1_px = eval[idx + 1];
+        ibutterfly(f0_px, f1_px, S::inverse(domain_ys[idx / 2]));
+        E f_prime = f0_px + alpha * f1_px;
+        auto folded_eval_idx = idx / 2;
+        folded_eval[folded_eval_idx] = folded_eval[folded_eval_idx] * alpha_sq + f_prime;
+      }
+    }
+  } // namespace
+
+  template <typename S, typename E>
+  cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
+  {
+    CHK_INIT_IF_RETURN();
+
+    cudaStream_t stream = cfg.ctx.stream;
+    // Allocate and move line domain evals to device if necessary
+    E* d_eval;
+    if (!cfg.are_evals_on_device) {
+      auto data_size = sizeof(E) * n;
+      CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
+    } else {
+      d_eval = eval;
+    }
+
+    // Allocate and move domain's elements to device if necessary
+    S* d_domain_xs;
+    if (!cfg.are_domain_elements_on_device) {
+      auto data_size = sizeof(S) * n / 2;
+      CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream));
+    } else {
+      d_domain_xs = domain_xs;
+    }
+
+    // Allocate folded_eval if pointer is not a device pointer
+    E* d_folded_eval;
+    if (!cfg.are_results_on_device) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * n / 2, stream));
+    } else {
+      d_folded_eval = folded_eval;
+    }
+
+    uint64_t num_threads = 256;
+    uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads;
+    fold_line_kernel<<<num_blocks, num_threads, 0, stream>>>(d_eval, d_domain_xs, alpha, d_folded_eval, n);
+
+    // Move folded_eval back to host if requested
+    if (!cfg.are_results_on_device) {
+      CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream));
+      CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
+    }
+    if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream)); }
+    if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }
+
+    // Sync if stream is default stream
+    if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));
+
+    return CHK_LAST();
+  }
+
+  template <typename S, typename E>
+  cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
+  {
+    CHK_INIT_IF_RETURN();
+
+    cudaStream_t stream = cfg.ctx.stream;
+    // Allocate and move circle domain evals to device if necessary
+    E* d_eval;
+    if (!cfg.are_evals_on_device) {
+      auto data_size = sizeof(E) * n;
+      CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
+    } else {
+      d_eval = eval;
+    }
+
+    // Allocate and move domain's elements to device if necessary
+    S* d_domain_ys;
+    if (!cfg.are_domain_elements_on_device) {
+      auto data_size = sizeof(S) * n / 2;
+      CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream));
+      CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream));
+    } else {
+      d_domain_ys = domain_ys;
+    }
+
+    // Allocate folded_evals if pointer is not a device pointer
+    E* d_folded_eval;
+    if (!cfg.are_results_on_device) {
+      CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * n / 2, stream));
+    } else {
+      d_folded_eval = folded_eval;
+    }
+
+    E alpha_sq = alpha * alpha;
+    uint64_t num_threads = 256;
+    uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads;
+    fold_circle_into_line_kernel<<<num_blocks, num_threads, 0, stream>>>(
+      d_eval, d_domain_ys, alpha, alpha_sq, d_folded_eval, n);
+
+    // Move folded_evals back to host if requested
+    if (!cfg.are_results_on_device) {
+      CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream));
+      CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
+    }
+    if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream)); }
+    if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }
+
+    // Sync if stream is default stream
+    if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));
+
+    return CHK_LAST();
+  }
+} // namespace fri
diff --git a/wrappers/rust/icicle-fields/icicle-m31/src/fri/mod.rs b/wrappers/rust/icicle-fields/icicle-m31/src/fri/mod.rs
diff --git a/wrappers/rust/icicle-fields/icicle-m31/src/lib.rs b/wrappers/rust/icicle-fields/icicle-m31/src/lib.rs