Commit ee0aee5

add cuda backend for msm
1 parent da16082 commit ee0aee5

21 files changed, +4380 -1 lines changed

.gitignore

-1
@@ -19,4 +19,3 @@
 **/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
 **/build/*
 **tar.gz
-icicle/backend/cuda

icicle/backend/cuda/.clang-format

+39
@@ -0,0 +1,39 @@
Language: Cpp
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: false
BreakBeforeBraces: Custom
BraceWrapping:
  AfterClass: true
  AfterFunction: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
ColumnLimit: 120
ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
DisableFormat: false
IndentFunctionDeclarationAfterType: false
IndentWidth: 2
KeepEmptyLinesAtTheStartOfBlocks: false
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
PointerAlignment: Left
SortIncludes: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
Standard: c++17
UseTab: Never

icicle/backend/cuda/.gitignore

+35
@@ -0,0 +1,35 @@
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# VScode configs
**/.vscode

icicle/backend/cuda/CMakeLists.txt

+78
@@ -0,0 +1,78 @@
cmake_minimum_required(VERSION 3.18)

include(cmake/Common.cmake)
find_cuda_compiler()

project(icicle_cuda_backend LANGUAGES CUDA CXX)

set_env()
set_gpu_env()

find_package(CUDAToolkit REQUIRED)

# device API library
add_library(icicle_backend_cuda_device SHARED src/cuda_device_api.cu)
target_include_directories(icicle_backend_cuda_device PRIVATE include)
target_include_directories(icicle_backend_cuda_device PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(icicle_backend_cuda_device PUBLIC pthread)

install(TARGETS icicle_backend_cuda_device
  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/backend/cuda
  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/backend/cuda
  ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/backend/cuda)

if(CMAKE_BUILD_TYPE STREQUAL "Release")
  add_custom_command(TARGET icicle_backend_cuda_device POST_BUILD
    COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:icicle_backend_cuda_device>
  )
endif()

# field API library
if (FIELD)
  add_library(icicle_cuda_field SHARED
    src/field/cuda_mont.cu
  )
  target_include_directories(icicle_cuda_field PRIVATE include)
  set_target_properties(icicle_cuda_field PROPERTIES OUTPUT_NAME "icicle_backend_cuda_field_${FIELD}")
  target_include_directories(icicle_cuda_field PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  target_link_libraries(icicle_cuda_field PRIVATE icicle_field ${CUDA_LIBRARIES} pthread) # Link to CUDA

  install(TARGETS icicle_cuda_field
    RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${FIELD}/cuda"
    LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${FIELD}/cuda"
    ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${FIELD}/cuda")

  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    add_custom_command(TARGET icicle_cuda_field POST_BUILD
      COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:icicle_cuda_field>
    )
  endif()
endif() # FIELD

# curve API library
if (CURVE)
  add_library(icicle_cuda_curve SHARED
    src/curve/cuda_mont.cu
  )
  if(MSM)
    target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm.cu)
  endif()
  if(G2)
    target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm_g2.cu)
  endif()
  target_include_directories(icicle_cuda_curve PRIVATE include)
  set_target_properties(icicle_cuda_curve PROPERTIES OUTPUT_NAME "icicle_backend_cuda_curve_${CURVE}")
  target_include_directories(icicle_cuda_curve PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  target_link_libraries(icicle_cuda_curve PRIVATE icicle_cuda_field icicle_curve ${CUDA_LIBRARIES} pthread) # Link to CUDA

  install(TARGETS icicle_cuda_curve
    RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${CURVE}/cuda"
    LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${CURVE}/cuda"
    ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${CURVE}/cuda")

  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    add_custom_command(TARGET icicle_cuda_curve POST_BUILD
      COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:icicle_cuda_curve>
    )
  endif()
endif()
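Note on the install layout: the targets above are built as shared libraries and installed under ${CMAKE_INSTALL_PREFIX}/lib/backend/... so a frontend can pick the CUDA backend up at runtime. The loading mechanism itself is not part of this commit; the sketch below only illustrates how such a library could be opened with the standard dynamic loader, and the path and library name (shown for CURVE=bn254) are hypothetical examples.

// Illustrative sketch only (not part of this diff): dynamically load one of the
// installed backend shared libraries. The path and curve name are hypothetical.
#include <dlfcn.h>
#include <cstdio>

int main()
{
  const char* path = "/usr/local/lib/backend/bn254/cuda/libicicle_backend_cuda_curve_bn254.so"; // hypothetical install path
  void* handle = dlopen(path, RTLD_NOW | RTLD_GLOBAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "failed to load CUDA backend: %s\n", dlerror());
    return 1;
  }
  std::printf("CUDA backend loaded\n");
  dlclose(handle);
  return 0;
}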
icicle/backend/cuda/cmake/Common.cmake

+75
@@ -0,0 +1,75 @@
function(set_env)
  set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
  set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE)
  set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE)
  set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE)

  if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
    message(WARNING "Note that PIC (position-independent code) is disabled.")
  else()
    set(CMAKE_POSITION_INDEPENDENT_CODE ON PARENT_SCOPE)
  endif()
endfunction()

function(set_gpu_env)
  # add the target cuda architectures
  # each additional architecture increases the compilation time and output file size
  if(DEFINED CUDA_ARCH) # user defined arch takes priority
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
  elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch
    set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE)
  else()
    find_program(_nvidia_smi "nvidia-smi")

    if(_nvidia_smi)
      execute_process(
        COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader
        OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES
        OUTPUT_STRIP_TRAILING_WHITESPACE
      )
      # Process the output to form the CUDA architectures string
      string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}")

      set(CUDA_ARCHITECTURES "")
      foreach(CAPABILITY ${GPU_COMPUTE_CAPABILITIES_LIST})
        # Remove the dot in compute capability to match CMake format
        string(REPLACE "." "" CAPABILITY "${CAPABILITY}")
        if(CUDA_ARCHITECTURES)
          set(CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES};${CAPABILITY}")
        else()
          set(CUDA_ARCHITECTURES "${CAPABILITY}")
        endif()
      endforeach()

      message("Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}")
      set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE)
    else()
      # no GPUs found, like on GitHub CI runners
      message("Setting CMAKE_CUDA_ARCHITECTURES to: 50")
      set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
    endif()
  endif()

  # Check CUDA version and, if possible, enable multi-threaded compilation
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
    message(STATUS "Using multi-threaded CUDA compilation.")
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0" PARENT_SCOPE)
  else()
    message(STATUS "Can't use multi-threaded CUDA compilation.")
  endif()
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
endfunction()

function(find_cuda_compiler)
  # Find the CUDA compiler
  execute_process(
    COMMAND which nvcc
    OUTPUT_VARIABLE CUDA_COMPILER_PATH
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )

  # Set the CUDA compiler
  set(CMAKE_CUDA_COMPILER ${CUDA_COMPILER_PATH} PARENT_SCOPE)
endfunction()
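The nvidia-smi query in set_gpu_env exists because CMake versions older than 3.24 cannot resolve the "native" architecture by themselves. The same compute-capability information can also be read from the CUDA runtime; the sketch below is illustrative only (not part of this commit) and prints the "XY" form that CMAKE_CUDA_ARCHITECTURES expects (e.g. 8.6 becomes 86).

// Illustrative sketch: query the compute capability of each visible GPU via the CUDA runtime.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  int device_count = 0;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
    std::printf("no CUDA devices found\n"); // the CMake code above falls back to arch 50 in this case
    return 0;
  }
  for (int i = 0; i < device_count; ++i) {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, i);
    std::printf("device %d: compute capability %d%d\n", i, prop.major, prop.minor);
  }
  return 0;
}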
+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#pragma once
2+
#include <cuda.h>
3+
#include <stdexcept>
4+
5+
#include "icicle/errors.h"
6+
#include "icicle/vec_ops.h"
7+
#include "gpu-utils/error_handler.h"
8+
9+
namespace montgomery {
10+
#define MAX_THREADS_PER_BLOCK 256
11+
12+
template <typename E, bool is_into>
13+
__global__ void MontgomeryKernel(const E* input, int n, E* output)
14+
{
15+
int tid = blockIdx.x * blockDim.x + threadIdx.x;
16+
if (tid < n) { output[tid] = is_into ? E::to_montgomery(input[tid]) : E::from_montgomery(input[tid]); }
17+
}
18+
19+
template <typename E, bool is_into>
20+
cudaError_t ConvertMontgomery(const E* input, size_t n, const VecOpsConfig& config, E* output)
21+
{
22+
cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(config.stream);
23+
24+
E *d_alloc_out = nullptr, *d_alloc_in = nullptr, *d_out;
25+
const E* d_in;
26+
if (!config.is_a_on_device) {
27+
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_in, n * sizeof(E), cuda_stream));
28+
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_in, input, n * sizeof(E), cudaMemcpyHostToDevice, cuda_stream));
29+
d_in = d_alloc_in;
30+
} else {
31+
d_in = input;
32+
}
33+
34+
if (!config.is_result_on_device) {
35+
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_out, n * sizeof(E), cuda_stream));
36+
d_out = d_alloc_out;
37+
} else {
38+
d_out = output;
39+
}
40+
41+
int num_threads = MAX_THREADS_PER_BLOCK;
42+
int num_blocks = (n + num_threads - 1) / num_threads;
43+
MontgomeryKernel<E, is_into><<<num_blocks, num_threads, 0, cuda_stream>>>(d_in, n, d_out);
44+
45+
if (d_alloc_in) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_in, cuda_stream)); }
46+
if (d_alloc_out) {
47+
CHK_IF_RETURN(cudaMemcpyAsync(output, d_out, n * sizeof(E), cudaMemcpyDeviceToHost, cuda_stream));
48+
CHK_IF_RETURN(cudaFreeAsync(d_out, cuda_stream));
49+
}
50+
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(cuda_stream));
51+
52+
return CHK_LAST();
53+
}
54+
55+
} // namespace montgomery
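For orientation, a host-side call into ConvertMontgomery might look like the sketch below. The VecOpsConfig fields are the ones the helper reads above; the scalar_t element type and the include path are assumptions made only for illustration.

// Hypothetical usage sketch (scalar_t and the include path are assumptions, not part of this diff).
// Converts a host buffer of field elements into Montgomery form and blocks until it is done.
#include "cuda_mont.cuh" // hypothetical path to the header above

cudaError_t to_montgomery_on_host(const scalar_t* host_in, size_t n, scalar_t* host_out)
{
  VecOpsConfig config = {};           // zero-initialized: null (default) stream
  config.is_a_on_device = false;      // input is in host memory, so the helper stages it on the device
  config.is_result_on_device = false; // result is copied back into host_out
  config.is_async = false;            // synchronize the stream before returning

  // is_into = true selects E::to_montgomery inside MontgomeryKernel
  return montgomery::ConvertMontgomery<scalar_t, true>(host_in, n, config, host_out);
}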
@@ -0,0 +1,33 @@
#pragma once

#include "icicle/errors.h"
#include "cuda_runtime.h"

static eIcicleError translateCudaError(cudaError_t cudaErr)
{
  switch (cudaErr) {
  case cudaSuccess:
    return eIcicleError::SUCCESS;
  case cudaErrorInvalidDevice:
    return eIcicleError::INVALID_DEVICE;
  case cudaErrorMemoryAllocation:
    return eIcicleError::OUT_OF_MEMORY;
  case cudaErrorInvalidDevicePointer:
  case cudaErrorInvalidHostPointer:
    return eIcicleError::INVALID_POINTER;
  case cudaErrorInitializationError:
  case cudaErrorInvalidResourceHandle:
    return eIcicleError::ALLOCATION_FAILED;
  case cudaErrorInvalidMemcpyDirection:
    return eIcicleError::COPY_FAILED;
  case cudaErrorSyncDepthExceeded:
  case cudaErrorLaunchTimeout:
  case cudaErrorLaunchIncompatibleTexturing:
  case cudaErrorLaunchFailure:
    return eIcicleError::SYNCHRONIZATION_FAILED;
  case cudaErrorInvalidValue:
    return eIcicleError::INVALID_ARGUMENT;
  default:
    return eIcicleError::UNKNOWN_ERROR;
  }
}
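A typical call site simply forwards the raw CUDA status through the translator. The helper below (its name and the wrapped cudaMalloc call) is a hedged illustration, not code from this commit.

// Illustrative sketch: map a CUDA runtime status to the backend-agnostic eIcicleError codes.
static eIcicleError allocate_device_buffer(void** ptr, size_t size_bytes)
{
  cudaError_t err = cudaMalloc(ptr, size_bytes); // any CUDA runtime call can be wrapped the same way
  return translateCudaError(err);                // e.g. cudaErrorMemoryAllocation -> OUT_OF_MEMORY
}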
@@ -0,0 +1,35 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H

#include <cuda_runtime.h>

namespace device_context {

  constexpr std::size_t MAX_DEVICES = 32;

  /**
   * Properties of the device used in icicle functions.
   */
  struct DeviceContext {
    cudaStream_t& stream;  /**< Stream to use. Default value: 0. */
    std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
    cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
  };

  /**
   * Return the default device context, which corresponds to using the default stream of the first GPU.
   */
  inline DeviceContext get_default_device_context() // TODO: naming convention ?
  {
    static cudaStream_t default_stream = (cudaStream_t)0;
    return DeviceContext{
      (cudaStream_t&)default_stream, // stream
      0,                             // device_id
      0,                             // mempool
    };
  }

} // namespace device_context

#endif
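To illustrate how the default context is meant to be consumed, here is a hedged sketch; the include path and the copy helper are assumptions for illustration only, not part of this commit.

// Illustrative sketch: run an async H2D copy on the stream carried by the default DeviceContext.
#include <cstddef>
#include <cuda_runtime.h>
#include "device_context.h" // hypothetical path to the header above

void copy_with_default_context(void* d_dst, const void* h_src, std::size_t bytes)
{
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  cudaSetDevice(static_cast<int>(ctx.device_id));                           // first GPU (device_id = 0)
  cudaMemcpyAsync(d_dst, h_src, bytes, cudaMemcpyHostToDevice, ctx.stream); // default (null) stream
  cudaStreamSynchronize(ctx.stream);                                        // wait for the copy to finish
}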
