-
Notifications
You must be signed in to change notification settings - Fork 18
/
2_1_pageable_basic.cu
155 lines (127 loc) · 4.24 KB
/
2_1_pageable_basic.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include <algorithm>
#include <nvToolsExt.h>
#include <argparse/argparse.hpp>
#include "common.hpp"
/* NOTE: A and C are column major, B is row major
*/
__global__ void mygemm(float *__restrict__ c, //<! [out] and MxN matrix
const float *a, //<! [in] an MxK matrix
const float *b, //<! [in] an KxN matrix
const int M, const int N, const int K) {
#define A(_i, _j) a[(_i) + (_j)*M]
#define B(_i, _j) b[(_i)*N + (_j)]
#define C(_i, _j) c[(_i) + (_j)*M]
int gidx = blockDim.x * blockIdx.x + threadIdx.x;
int gidy = blockDim.y * blockIdx.y + threadIdx.y;
for (int i = gidy; i < M; i += gridDim.y * blockDim.y) {
for (int j = gidx; j < N; j += gridDim.x * blockDim.x) {
float acc = 0;
for (int k = 0; k < K; ++k) {
acc += A(i, k) * B(k, j);
}
C(i, j) = acc;
}
}
#undef A
#undef B
#undef C
}
/* Time the total transfer & matrix-multiplication time
*/
int main(int argc, char **argv) {
argparse::Parser parser;
// default matrix sizes:
// A: 1600 x 1500
// B: 1500 x 1400
// C: 1600 x 1400
int m = 1600;
int n = 1400;
int k = 1500;
int nIters = 5;
int nWarmup = 5;
parser.add_positional(m);
parser.add_positional(n);
parser.add_positional(k);
parser.add_option(nIters, "--iters");
parser.add_option(nWarmup, "--warmup");
if (!parser.parse(argc, argv)) {
parser.help();
exit(EXIT_FAILURE);
}
const int64_t flop = int64_t(m) * int64_t(n) * int64_t(k) * 2 * nIters;
// initialize host data
std::cout << "generate data\n";
nvtxRangePush("generate data");
float *aHost, *bHost, *cHost;
aHost = new float[m * k];
bHost = new float[k * n];
cHost = new float[m * n];
std::generate(aHost, aHost + m * k, random_int);
std::generate(bHost, bHost + k * n, random_int);
nvtxRangePop();
// allocate device data
float *aDev, *bDev, *cDev;
CUDA_RUNTIME(cudaMalloc(&aDev, m * k * sizeof(float)));
CUDA_RUNTIME(cudaMalloc(&bDev, k * n * sizeof(float)));
CUDA_RUNTIME(cudaMalloc(&cDev, m * n * sizeof(float)));
// create events to time GPU kernel
cudaEvent_t start, stop;
CUDA_RUNTIME(cudaEventCreate(&start));
CUDA_RUNTIME(cudaEventCreate(&stop));
// GPU kernel launch parameters
dim3 dimBlock(32, 32);
dim3 dimGrid;
dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (m + dimBlock.y - 1) / dimBlock.y;
float kernelTime = 0;
float wallTime = 0;
for (int iter = 0; iter < nWarmup + nIters; ++iter) {
auto wallStart = Clock::now();
// copy data to device
nvtxRangePush("host-to-device");
CUDA_RUNTIME(
cudaMemcpy(aDev, aHost, m * k * sizeof(float), cudaMemcpyDefault));
CUDA_RUNTIME(
cudaMemcpy(bDev, bHost, k * n * sizeof(float), cudaMemcpyDefault));
nvtxRangePop();
// kernel time
float millis;
CUDA_RUNTIME(cudaEventRecord(start));
mygemm<<<dimGrid, dimBlock>>>(cDev, aDev, bDev, m, n, k);
CUDA_RUNTIME(cudaEventRecord(stop));
CUDA_RUNTIME(cudaEventSynchronize(stop));
CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop));
// copy data back to host
nvtxRangePush("device-to-host");
CUDA_RUNTIME(
cudaMemcpy(cHost, cDev, m * n * sizeof(float), cudaMemcpyDefault));
nvtxRangePop();
CUDA_RUNTIME(cudaDeviceSynchronize());
Duration wallElapsed = Clock::now() - wallStart;
std::cout << iter << " kernel=" << millis / 1000
<< " wall=" << wallElapsed.count()
<< (iter >= nWarmup ? " *" : " ") << "\n";
// track time if no longer during warmup
if (iter >= nWarmup) {
wallTime += wallElapsed.count();
kernelTime += millis / 1000; // seconds
}
}
// print results
double kernelGflops = flop / 1e9 / kernelTime;
std::cout << "kernel " << kernelGflops << "GFLOPS (" << flop << " flop, "
<< kernelTime << "s)\n";
double wallGflops = flop / 1e9 / wallTime;
std::cout << "wall " << wallGflops << "GFLOPS (" << flop << " flop, "
<< wallTime << "s)\n";
// release resources
CUDA_RUNTIME(cudaEventDestroy(start));
CUDA_RUNTIME(cudaEventDestroy(stop));
CUDA_RUNTIME(cudaFree(aDev));
CUDA_RUNTIME(cudaFree(bDev));
CUDA_RUNTIME(cudaFree(cDev));
delete[] aHost;
delete[] bHost;
delete[] cHost;
return 0;
}