Skip to content

Commit b3aa3cb

Browse files
authored
Merge pull request #30 from GPUEngineering/hf/28-compilation-on-orin
C++17-proofing testTensor.cu
2 parents f97ee4e + 8d7c49d commit b3aa3cb

File tree

5 files changed

+147
-106
lines changed

5 files changed

+147
-106
lines changed

.github/workflows/ci.yml

+4-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@ name: Continuous integration
77

88
jobs:
99
ci:
10-
runs-on: self-hosted
10+
runs-on: ${{ matrix.runner }}
11+
strategy:
12+
matrix:
13+
runner: [orin, a40]
1114
steps:
1215
- name: checkout code
1316
uses: actions/checkout@v4

CMakeLists.txt

+45-31
Original file line numberDiff line numberDiff line change
@@ -2,35 +2,49 @@
22
# GPUtils
33
# ====================================================================
44
cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
5+
6+
# CMP0135 was introduced in CMake 3.24; test for the policy itself instead of a (too new) version number
if(POLICY CMP0135)
7+
cmake_policy(SET CMP0135 NEW)
8+
endif()
9+
10+
# Set C++ version and SM architecture
11+
if (NOT DEFINED CPPVERSION)
12+
set(CPPVERSION 20) # A40: 20, Orin: 17
13+
endif()
14+
if (NOT DEFINED SM_ARCH)
15+
set(SM_ARCH 86)# A40: 86, Orin: 87
16+
endif()
17+
18+
519
project(GPUtils
6-
DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
7-
HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
8-
LANGUAGES CXX
9-
)
20+
DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
21+
HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
22+
LANGUAGES CXX
23+
)
1024
# ----
1125
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # required for calling cuda kernels from cuda kernels
12-
set(CMAKE_CUDA_COMPILER "/usr/local/cuda-12.3/bin/nvcc")
13-
set(CMAKE_CUDA_ARCHITECTURES 86)
14-
set(CMAKE_CUDA_STANDARD 20)
15-
set(CMAKE_CXX_STANDARD 20)
16-
set(CMAKE_CUDA_FLAGS "-std=c++20")
17-
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=c++20)
26+
# Fall back to the default CUDA toolkit location only when the user has not chosen a compiler
# (via -DCMAKE_CUDA_COMPILER=... or the CUDACXX environment variable)
if(NOT DEFINED CMAKE_CUDA_COMPILER AND NOT DEFINED ENV{CUDACXX})
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
endif()
27+
set(CMAKE_CUDA_ARCHITECTURES ${SM_ARCH})
28+
set(CMAKE_CUDA_STANDARD ${CPPVERSION})
29+
set(CMAKE_CXX_STANDARD ${CPPVERSION})
30+
set(CMAKE_CUDA_FLAGS "-std=c++${CPPVERSION}")
31+
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "-std=c++${CPPVERSION}")
1832
enable_language(CUDA)
1933
# ----
2034
add_library(device_compiler_flags INTERFACE)
21-
target_compile_features(device_compiler_flags INTERFACE cxx_std_20)
35+
target_compile_features(device_compiler_flags INTERFACE cxx_std_${CPPVERSION})
2236
set(CMAKE_CXX_EXTENSIONS OFF)
2337
# ----
2438
add_library(developer_flags INTERFACE)
2539
set(cxx_flags -Wall)
26-
set(cuda_flags -arch=sm_60 -std=c++20 -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
40+
set(cuda_flags -arch=sm_${SM_ARCH} -std=c++${CPPVERSION} -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
2741
target_compile_options(developer_flags
28-
INTERFACE
29-
# flags for CXX builds
30-
$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>
31-
# flags for CUDA builds
32-
$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
33-
)
42+
INTERFACE
43+
# flags for CXX builds
44+
$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>
45+
# flags for CUDA builds
46+
$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
47+
)
3448
target_link_libraries(device_compiler_flags INTERFACE $<BUILD_INTERFACE:developer_flags>)
3549
# ----
3650

@@ -40,21 +54,21 @@ target_link_libraries(device_compiler_flags INTERFACE $<BUILD_INTERFACE:develope
4054
# ====================================================================
4155
add_executable(main)
4256
target_sources(main
43-
PRIVATE
44-
main.cu
45-
)
57+
PRIVATE
58+
main.cu
59+
)
4660
target_link_libraries(main
47-
PRIVATE
48-
device_compiler_flags
49-
cublas
50-
cusolver
51-
cudadevrt
52-
)
61+
PRIVATE
62+
device_compiler_flags
63+
cublas
64+
cusolver
65+
cudadevrt
66+
)
5367
target_include_directories(main
54-
PRIVATE
55-
"${PROJECT_BINARY_DIR}"
56-
"${PROJECT_SOURCE_DIR}/include"
57-
)
68+
PRIVATE
69+
"${PROJECT_BINARY_DIR}"
70+
"${PROJECT_SOURCE_DIR}/include"
71+
)
5872
# ----
5973
add_subdirectory(test)
6074
# ----

ci/script.sh

+38-20
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,62 @@
11
#!/bin/bash
22
set -euxo pipefail
33

4+
45
tests() {
6+
# Where are we? (A40 or Orin?)
7+
cpp_version=17 # default
8+
sm_arch=86 # default
9+
hwInfoOrin=$(lshw | grep Orin) || true # '|| true' ends the list here: no abort under 'set -e'/pipefail when grep finds nothing, and the following 'if' always runs
10+
if [ ! -z "${hwInfoOrin}" ]; then
11+
echo "Running on Orin";
12+
sm_arch=87
13+
cpp_version=17
14+
else
15+
echo "Not running on Orin";
16+
sm_arch=86
17+
cpp_version=20
18+
fi
19+
520
# ------------------------------------
621
# Run tensor gtests
722
# ------------------------------------
823

924
# -- create build files
10-
cmake -S . -B ./build -Wno-dev
25+
cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev
1126

1227
# -- build files in build folder
1328
cmake --build ./build
1429

1530
# -- run tests
1631
ctest --test-dir ./build/test --output-on-failure
1732

18-
# -- run compute sanitizer
19-
cd ./build/test
20-
mem=$(/usr/local/cuda-12.3/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
21-
grep "0 errors" <<< "$mem"
22-
cd ../..
33+
if [ -z "${hwInfoOrin}" ]; then
2334

24-
# ------------------------------------
25-
# Run example executable
26-
# ------------------------------------
35+
# -- run compute sanitizer
36+
cd ./build/test
37+
mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
38+
grep "0 errors" <<< "$mem"
39+
cd ../..
2740

28-
# -- create build files
29-
cd example
30-
cmake -S . -B ./build -Wno-dev
41+
# ------------------------------------
42+
# Run example executable
43+
# ------------------------------------
3144

32-
# -- build files in build folder
33-
cmake --build ./build
45+
# -- create build files
46+
cd example
47+
cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev
48+
49+
# -- build files in build folder
50+
cmake --build ./build
3451

35-
# -- run main.cu
36-
./build/example_main
52+
# -- run main.cu
53+
./build/example_main
3754

38-
# -- run compute sanitizer
39-
cd ./build
40-
mem=$(/usr/local/cuda-12.3/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
41-
grep "0 errors" <<< "$mem"
55+
# -- run compute sanitizer
56+
cd ./build
57+
mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
58+
grep "0 errors" <<< "$mem"
59+
fi
4260
}
4361

4462

include/tensor.cuh

+10-4
Original file line numberDiff line numberDiff line change
@@ -1326,7 +1326,7 @@ public:
13261326
};
13271327

13281328
template<>
1329-
void CholeskyBatchFactoriser<double>::factorise() {
1329+
inline void CholeskyBatchFactoriser<double>::factorise() {
13301330
if (m_factorisationDone) return;
13311331
DTensor<double *> ptrA = m_matrix->pointersToMatrices();
13321332
gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
@@ -1340,7 +1340,7 @@ void CholeskyBatchFactoriser<double>::factorise() {
13401340
}
13411341

13421342
template<>
1343-
void CholeskyBatchFactoriser<float>::factorise() {
1343+
inline void CholeskyBatchFactoriser<float>::factorise() {
13441344
if (m_factorisationDone) return;
13451345
DTensor<float *> ptrA = m_matrix->pointersToMatrices();
13461346
gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
@@ -1354,8 +1354,11 @@ void CholeskyBatchFactoriser<float>::factorise() {
13541354
}
13551355

13561356
template<>
1357-
void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
1357+
inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
13581358
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
1359+
if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
1360+
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
1361+
}
13591362
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
13601363
DTensor<double *> ptrA = m_matrix->pointersToMatrices();
13611364
DTensor<double *> ptrB = b.pointersToMatrices();
@@ -1372,8 +1375,11 @@ void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
13721375
}
13731376

13741377
template<>
1375-
void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
1378+
inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
13761379
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
1380+
if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
1381+
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
1382+
}
13771383
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
13781384
DTensor<float *> ptrA = m_matrix->pointersToMatrices();
13791385
DTensor<float *> ptrB = b.pointersToMatrices();

0 commit comments

Comments
 (0)