add-vectors.cu
// This is a CUDA program that does the following:
//
// 1. On the host, fill the A and B arrays with random numbers
// 2. On the host, print the initial values of the A and B arrays
// 3. Copy the A and B arrays from the host to the device
// 4. On the device, add the A and B vectors and store the result in C
// 5. Copy the C array from the device to the host
// 6. On the host, print the result
//
// Author: Aaron Weeden, Shodor, 2016
// Import library so we can call printf()
#include <stdio.h>
// Import library so we can call exit(), malloc(), free(), random(), etc.
#include <stdlib.h>
// Import library so we can call time()
#include <time.h>
// Import library so we can call omp_get_wtime()
#include <omp.h>
// Define the number of numbers in each array
#define NUM_COUNT 10
// Define the number of bytes in each array
#define BYTE_COUNT ((NUM_COUNT) * sizeof(int))
// Define the number of CUDA threads in each CUDA warp (group of threads that
// execute instructions in lock-step)
#define THREADS_PER_WARP 32
// Define the maximum number of CUDA warps in each CUDA block
#define MAX_WARPS_PER_BLOCK 16
// Define the number of CUDA threads in each CUDA block
#define THREADS_PER_BLOCK ((THREADS_PER_WARP) * (MAX_WARPS_PER_BLOCK))
// Define the number of CUDA blocks in each CUDA grid
#define BLOCKS_PER_GRID 1
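// Note: a single block of 512 threads is more than enough for NUM_COUNT = 10.
// If NUM_COUNT could exceed THREADS_PER_BLOCK, the grid would need more
// blocks; a common pattern (shown only as a comment, not used here) is
//   #define BLOCKS_PER_GRID (((NUM_COUNT) + (THREADS_PER_BLOCK) - 1) / (THREADS_PER_BLOCK))
// i.e. a ceiling division. The bounds check inside AddVectors() keeps any
// leftover threads from writing out of range.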
// Declare functions that will be defined later
void TryMalloc(void * const err);
void TryCuda(cudaError_t const err);
__global__ void AddVectors(int * const deviceA, int * const deviceB,
                           int * const deviceC, int const count);
// Start the program
int main()
{
  // Declare variables for the host and device arrays
  int * hostA;
  int * hostB;
  int * hostC;
  int * deviceA;
  int * deviceB;
  int * deviceC;
  // Start a timer
  double startTime = omp_get_wtime();
  // Allocate memory for the host arrays
  TryMalloc(hostA = (int*)malloc(BYTE_COUNT));
  TryMalloc(hostB = (int*)malloc(BYTE_COUNT));
  TryMalloc(hostC = (int*)malloc(BYTE_COUNT));
  // Allocate memory for the device arrays
  TryCuda(cudaMalloc((void**)&deviceA, BYTE_COUNT));
  TryCuda(cudaMalloc((void**)&deviceB, BYTE_COUNT));
  TryCuda(cudaMalloc((void**)&deviceC, BYTE_COUNT));
  // Initialize the random number generator
  srandom(time(NULL));
  // On the host, fill the A and B arrays with random numbers
  printf("Expected Result:\n");
  for (int i = 0; i < NUM_COUNT; i++)
  {
    hostA[i] = 100 * random() / RAND_MAX;
    hostB[i] = 100 * random() / RAND_MAX;
    printf("\thostC[%d] should be %d + %d\n", i, hostA[i], hostB[i]);
  }
  // Copy the A and B arrays from the host to the device
  TryCuda(cudaMemcpy(deviceA, hostA, BYTE_COUNT, cudaMemcpyHostToDevice));
  TryCuda(cudaMemcpy(deviceB, hostB, BYTE_COUNT, cudaMemcpyHostToDevice));
  // On the device, add the A and B vectors and store the result in C
  AddVectors<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(deviceA, deviceB, deviceC,
                                                     NUM_COUNT);
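  // Optional safety check: a kernel launch does not return a cudaError_t, so
  // a launch failure (e.g. an invalid configuration) would otherwise go
  // unreported until the next CUDA call. cudaGetLastError() surfaces it here.
  TryCuda(cudaGetLastError());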
  // Copy the C array from the device to the host
  TryCuda(cudaMemcpy(hostC, deviceC, BYTE_COUNT, cudaMemcpyDeviceToHost));
  // On the host, print the result
  printf("Result:\n");
  for (int i = 0; i < NUM_COUNT; i++)
  {
    printf("\thostC[%d] = %d\n", i, hostC[i]);
  }
  // De-allocate memory for the device arrays
  TryCuda(cudaFree(deviceC));
  TryCuda(cudaFree(deviceB));
  TryCuda(cudaFree(deviceA));
  // De-allocate memory for the host arrays
  free(hostC);
  free(hostB);
  free(hostA);
  // Stop the timer
  printf("Runtime: %f seconds\n", omp_get_wtime() - startTime);
  return 0;
}
// Define a function to check whether a malloc() call was successful
void TryMalloc(void * const err)
{
  if (err == NULL)
  {
    fprintf(stderr, "malloc error\n");
    exit(EXIT_FAILURE);
  }
}
// Define a function to check whether a CUDA call was successful
void TryCuda(cudaError_t const err)
{
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}
// Define a function which will be executed on a CUDA device
__global__ void AddVectors(int * const deviceA, int * const deviceB,
                           int * const deviceC, int const count)
{
  // Calculate the unique ID for the current CUDA thread
  int const threadId = blockIdx.x * blockDim.x + threadIdx.x;
  // All threads whose thread ID is >= count will NOT do the following, thus
  // avoiding writing into un-allocated space.
  if (threadId < count)
  {
    deviceC[threadId] = deviceA[threadId] + deviceB[threadId];
  }
}
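// Build note (assuming the CUDA toolkit's nvcc and a host compiler with
// OpenMP support, which omp_get_wtime() requires):
//   nvcc -Xcompiler -fopenmp add-vectors.cu -o add-vectors
//   ./add-vectors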