-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdiffusion_2d_cuda.jl
75 lines (68 loc) · 2.39 KB
/
diffusion_2d_cuda.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# 2D linear diffusion solver - GPU cuda version
using Printf
using JLD2
using CUDA
include(joinpath(@__DIR__, "../shared.jl"))
# convenience macros simply to avoid writing nested finite-difference expression
macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(ds))) end
macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(ds))) end
function diffusion_step_kernel!(params, C2, C)
(; ds, dt, D) = params
#
# !! TODO !!
#
# We want to replace the nested loop over spatial dimensions by "vecotized" CUDA indices.
# Based off of the serial kernel (see README.md or diffusion_2d_loop.jl) implement
# the CUDA variant using CUDA.jl taking care the handle to range in an appropriate
# manner (see "TODO..." below).
#
ix = # TODO # CUDA vectorised unique index
iy = # TODO # CUDA vectorised unique index
if # TODO select correct range
@inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(ds) +
(@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(ds))
end
return nothing
end
function diffusion_step!(params, C2, C)
(; nthreads, nblocks) = params
#
# !! TODO !!
#
# Complete the CPU wrapper function calling the `diffusion_step_kernel!`
# using the `@cuda` macro and appropriate launch parameters (see "TODO..." below).
#
@cuda # TODO
return nothing
end
function run_diffusion(; ns=64, nt=100, do_save=false)
params = init_params_gpu(; ns, nt, do_save)
C, C2 = init_arrays_gpu(params)
t_tic = 0.0
# Time loop
for it in 1:nt
# time after warmup (ignore first 10 iterations)
(it == 11) && (t_tic = Base.time())
# diffusion
diffusion_step!(params, C2, C)
C, C2 = C2, C # pointer swap
end
# synchronize the gpu before querying the final time
# TODO # Add synchronization
t_toc = (Base.time() - t_tic)
print_perf(params, t_toc)
do_save && jldsave(joinpath(@__DIR__, "out_gpu.jld2"); C = Array(C), l = params.L)
return nothing
end
# Running things...
# enable visualization by default
(!@isdefined do_save) && (do_save = true)
# enable execution by default
(!@isdefined do_run) && (do_run = true)
if do_run
if !isempty(ARGS)
run_diffusion(; ns=parse(Int, ARGS[1]), do_save)
else
run_diffusion(; ns=256, do_save)
end
end