Skip to content

Commit 50e55ab

Browse files
committed
Fortran version of AMReX kernels
Note that swm_fortran_driver is much slower than swm_fortran_driver, presumably because we are passing large arrays to the kernels and then only changing a single value in the arrays
1 parent 7eda333 commit 50e55ab

File tree

4 files changed

+400
-10
lines changed

4 files changed

+400
-10
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ swm_c/shallow_swap
3131
swm_c/shallow_swap.acc
3232
swm_c/shallow_swap.acc.Tile
3333
swm_fortran/swm_fortran
34+
swm_fortran/swm_fortran_driver
3435

3536
# Output
3637
*.out

swm_fortran/Makefile

+11-10
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
# Define the compiler
22
FC = gfortran
33

4-
# Source files for each executable
5-
EXEC_SRCS = swm_fortran.F90
6-
74
# Common source files used by all executables
85
COMMON_SRCS =
96

@@ -14,18 +11,22 @@ ifdef GPU
1411
FFLAGS += gpu=cc70 -Minfo=accel -Mnofma
1512
endif
1613

17-
# Define the executable names based on source files
18-
EXECS = $(basename $(EXEC_SRCS))
19-
2014
# Libraries to link to
2115
LDLIBS = -lm
2216

2317
# Default target
24-
all: $(EXECS)
18+
all: swm_fortran
2519

2620
# Implicit rule to compile the executables
27-
$(EXECS):
28-
$(FC) $(FFLAGS) -o $(EXECS) $(EXEC_SRCS)
21+
swm_fortran: swm_fortran.F90
22+
$(FC) $(FFLAGS) -o $@ swm_fortran.F90
23+
24+
# Implicit rule to compile the executables
25+
swm_fortran_driver: swm_fortran_driver.F90 swm_fortran_kernels.o
26+
$(FC) $(FFLAGS) -o $@ swm_fortran_kernels.o swm_fortran_driver.F90
27+
28+
swm_fortran_kernels.o: swm_fortran_kernels.F90
29+
$(FC) $(FFLAGS) -c $@ swm_fortran_kernels.F90
2930

3031
clean:
31-
rm -f $(EXECS) *.o
32+
rm -f swm_fortran swm_fortran_driver *.o

swm_fortran/swm_fortran_driver.F90

+332
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
#define M 256
2+
#define N 256
3+
#define M_LEN (M + 1)
4+
#define N_LEN (N + 1)
5+
#define L_OUT .true.
6+
#define VAL_OUT .false.
7+
#define ITMAX 4000
8+
9+
Program SWM_Fortran_Driver
10+
11+
use swm_fortran_kernels, only : UpdateIntermediateVariablesKernel
12+
use swm_fortran_kernels, only : UpdateNewVariablesKernel
13+
use swm_fortran_kernels, only : UpdateOldVariablesKernel
14+
15+
implicit none
16+
17+
! Solution arrays
18+
real, dimension(M_LEN,N_LEN,1), target :: u1, u2, u3, &
19+
v1, v2, v3, &
20+
p1, p2, p3
21+
22+
real, dimension(M_LEN,N_LEN,1) :: cu, &
23+
cv, &
24+
z, &
25+
h, &
26+
psi
27+
28+
real, dimension(:,:,:), pointer :: u => NULL(), &
29+
v => NULL(), &
30+
p => NULL(), &
31+
unew => NULL(), &
32+
vnew => NULL(), &
33+
pnew => NULL(), &
34+
uold => NULL(), &
35+
vold => NULL(), &
36+
pold => NULL()
37+
38+
real :: dt, tdt, dx, dy, a, alpha, el, pi
39+
real :: tpi, di, dj, pcf
40+
real :: tdts8, tdtsdx, tdtsdy, fsdx, fsdy
41+
42+
integer, parameter :: mnmin = min(M,N)
43+
integer :: ncycle
44+
integer :: i,j
45+
46+
! Timer variables
47+
real :: mfs100, mfs200, mfs300
48+
real :: t100, t200, t300
49+
real :: tstart, ctime, tcyc, time, ptime
50+
real :: c1, c2
51+
52+
! Set up pointers
53+
u => u1
54+
v => v1
55+
p => p1
56+
unew => u2
57+
vnew => v2
58+
pnew => p2
59+
uold => u3
60+
vold => v3
61+
pold => p3
62+
63+
! Initialization
64+
dt = 90.
65+
tdt = dt
66+
67+
dx = 1.e5
68+
dy = 1.e5
69+
fsdx = 4. / dx
70+
fsdy = 4. / dy
71+
72+
a = 1.e6
73+
alpha = 0.001
74+
75+
el = N * dx
76+
pi = atan2(0., -1.)
77+
tpi = pi + pi
78+
di = tpi / M
79+
dj = tpi / N
80+
pcf = pi * pi * a * a / (el * el)
81+
82+
! Initial values of the stream function and p
83+
do j=0,N_LEN-1
84+
do i=0,M_LEN-1
85+
psi(i+1,j+1,1) = a * sin((i + .5) * di) * sin((j + .5) * dj)
86+
p(i+1,j+1,1) = pcf * (cos(2. * (i) * di) + cos(2. * (j) * dj)) + 50000.
87+
end do
88+
end do
89+
90+
! Initialize velocities
91+
do j=1,N
92+
do i=1,M
93+
u(i+1,j,1) = -(psi(i+1,j+1,1) - psi(i+1,j,1)) / dy
94+
v(i,j+1,1) = (psi(i+1,j+1,1) - psi(i,j+1,1)) / dx
95+
end do
96+
end do
97+
98+
! Periodic continuation
99+
do j=1,N
100+
u(1,j,1) = u(M_LEN,j,1)
101+
v(M_LEN,j,1) = v(1,j,1)
102+
end do
103+
104+
do i=1,M
105+
u(i,N_LEN,1) = u(i,1,1)
106+
v(i,1,1) = v(i,N_LEN,1)
107+
end do
108+
109+
u(1,N_LEN,1) = u(M_LEN,1,1)
110+
v(M_LEN,1,1) = v(1,N_LEN,1)
111+
112+
do j=1,N_LEN
113+
do i=1,M_LEN
114+
uold(i,j,1) = u(i,j,1)
115+
vold(i,j,1) = v(i,j,1)
116+
pold(i,j,1) = p(i,j,1)
117+
end do
118+
end do
119+
120+
if ( L_OUT ) then
121+
write(*,"(A,I0)") " number of points in the x direction ", N
122+
write(*,"(A,I0)") " number of points in the y direction ", M
123+
write(*,"(A,F0.6)") " grid spacing in the x direction ", dx
124+
write(*,"(A,F0.6)") " grid spacing in the y direction ", dy
125+
write(*,"(A,F0.6)") " time step ", dt
126+
write(*,"(A,F0.6)") " time filter parameter ", alpha
127+
128+
write(*, "(A)") " initial diagonal elements of p"
129+
do i=1,mnmin
130+
write(*, "(F0.6, 1X)", advance="no") p(i,i,1)
131+
end do
132+
133+
write(*, "(/,A)") " initial diagonal elements of u"
134+
do i=1,mnmin
135+
write(*, "(F0.6, 1X)", advance="no") u(i,i,1)
136+
end do
137+
138+
write(*, "(/,A)") " initial diagonal elements of v"
139+
do i=1,mnmin
140+
write(*, "(F0.6, 1X)", advance="no") v(i,i,1)
141+
end do
142+
write(*,*)
143+
end if
144+
145+
! Start timer
146+
call cpu_time(tstart)
147+
time = 0.
148+
t100 = 0.
149+
t200 = 0.
150+
t300 = 0.
151+
152+
! Start of time loop
153+
do ncycle=1,ITMAX
154+
155+
call cpu_time(c1)
156+
do j=1,N
157+
do i=1,M
158+
call UpdateIntermediateVariablesKernel(i,j,1,fsdx,fsdy,p,u,v,cu,cv,h,z)
159+
end do
160+
end do
161+
call cpu_time(c2)
162+
t100 = t100 + (c2 - c1)
163+
164+
! Periodic continuation
165+
do j=1,N
166+
cu(1,j,1) = cu(M_LEN,j,1)
167+
cv(M_LEN,j+1,1) = cv(1,j+1,1)
168+
z(1,j+1,1) = z(M_LEN,j+1,1)
169+
h(M_LEN,j,1) = h(1,j,1)
170+
end do
171+
172+
do i=1,M
173+
cu(i+1, N_LEN,1) = cu(i+1,1,1)
174+
cv(i,1,1) = cv(i,N_LEN,1)
175+
z(i+1,1,1) = z(i+1,N_LEN,1)
176+
h(i,N_LEN,1) = h(i,1,1)
177+
end do
178+
179+
cu(1,N_LEN,1) = cu(M_LEN,1,1)
180+
cv(M_LEN,1,1) = cv(1,N_LEN,1)
181+
z(1,1,1) = z(M_LEN,N_LEN,1)
182+
h(M_LEN,N_LEN,1) = h(1,1,1)
183+
184+
! Compute new values of u, v, and p
185+
tdts8 = tdt / 8.
186+
tdtsdx = tdt / dx
187+
tdtsdy = tdt / dy
188+
189+
call cpu_time(c1)
190+
do j=1,N
191+
do i=1,M
192+
call UpdateNewVariablesKernel(i,j,1,tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z,pnew,unew,vnew)
193+
end do
194+
end do
195+
call cpu_time(c2)
196+
t200 = t200 + (c2-c1)
197+
198+
! Periodic continuation
199+
do j=1,N
200+
unew(1,j,1) = unew(M_LEN,j,1)
201+
vnew(M_LEN,j+1,1) = vnew(1,j+1,1)
202+
pnew(M_LEN,j,1) = pnew(1,j,1)
203+
end do
204+
205+
do i=1,M
206+
unew(i+1,N_LEN,1) = unew(i+1,1,1)
207+
vnew(i,1,1) = vnew(i,N_LEN,1)
208+
pnew(i,N_LEN,1) = pnew(i,1,1)
209+
end do
210+
211+
unew(1,N_LEN,1) = unew(M_LEN,1,1)
212+
vnew(M_LEN,1,1) = vnew(1,N_LEN,1)
213+
pnew(M_LEN,N_LEN,1) = pnew(1,1,1)
214+
215+
time = time + dt
216+
if (ncycle > 1) then
217+
call cpu_time(c1)
218+
do j=1,N_LEN
219+
do i=1,M_LEN
220+
call UpdateOldVariablesKernel(i,j,1,alpha,pnew,unew,vnew,p,u,v,pold,uold,vold)
221+
end do
222+
end do
223+
224+
#ifdef _COPY_
225+
do j=1,N_LEN
226+
do i=1,M_LEN
227+
u(i,j,1) = unew(i,j,1)
228+
v(i,j,1) = vnew(i,j,1)
229+
p(i,j,1) = pnew(i,j,1)
230+
end do
231+
end do
232+
#else
233+
call dswap(u, unew)
234+
call dswap(v, vnew)
235+
call dswap(p, pnew)
236+
#endif
237+
call cpu_time(c2)
238+
t300 = t300 + (c2 - c1)
239+
else ! ncycle = 1
240+
tdt = tdt + tdt
241+
do j=1,N_LEN
242+
do i=1,N_LEN
243+
uold(i,j,1) = u(i,j,1)
244+
vold(i,j,1) = v(i,j,1)
245+
pold(i,j,1) = p(i,j,1)
246+
end do
247+
end do
248+
call dswap(u, unew)
249+
call dswap(v, vnew)
250+
call dswap(p, pnew)
251+
end if
252+
end do ! End of time loop
253+
254+
call dswap(u, unew)
255+
call dswap(v, vnew)
256+
call dswap(p, pnew)
257+
258+
if ( VAL_OUT ) then
259+
call write_to_file(pnew, 'p.bin')
260+
call write_to_file(unew, 'u.bin')
261+
call write_to_file(vnew, 'v.bin')
262+
end if
263+
264+
if ( L_OUT ) then
265+
ptime = time / 3600.
266+
267+
write(*, "(A,I0,A,F0.6)") " cycle number ", ITMAX, &
268+
" model time in hours ", ptime
269+
write(*, "(A)") " diagonal elements of p"
270+
do i=1,mnmin
271+
write(*, "(F0.6,1X)", advance="no") pnew(i,i,1)
272+
end do
273+
write(*, "(/,A)") " diagonal elements of u"
274+
do i=1,mnmin
275+
write(*, "(F0.6,1X)", advance="no") unew(i,i,1)
276+
end do
277+
write(*, "(/,A)") " diagonal elements of v"
278+
do i=1,mnmin
279+
write(*, "(F0.6,1X)", advance="no") vnew(i,i,1)
280+
end do
281+
282+
mfs100 = 0.
283+
mfs200 = 0.
284+
mfs300 = 0.
285+
! gdr t100 etc. now an accumulation of all l100 time
286+
if ( t100 .gt. 0 ) mfs100 = real(ITMAX) * 24. * real(M*N) / t100 / 1000000.
287+
if ( t200 .gt. 0 ) mfs200 = real(ITMAX) * 26. * real(M*N) / t200 / 1000000.
288+
if ( t300 .gt. 0 ) mfs300 = real(ITMAX) * 15. * real(M*N) / t300 / 1000000.
289+
290+
call cpu_time(c2)
291+
ctime = c2 - tstart
292+
tcyc = ctime / real(ITMAX)
293+
294+
write(*, "(/,A,I0,A,F0.6,A,F0.6)") " cycle number ", ITMAX, " total computer time ", ctime, " time per cycle ", tcyc
295+
write(*, "(A,F0.6,1X,F0.6)") " time and megaflops for loop 100 ", t100, mfs100
296+
write(*, "(A,F0.6,1X,F0.6)") " time and megaflops for loop 200 ", t200, mfs200
297+
write(*, "(A,F0.6,1X,F0.6)") " time and megaflops for loop 300 ", t300, mfs300
298+
end if
299+
300+
contains
301+
302+
! This is a copy, not a pointer shuffle
303+
subroutine dswap(a, b)
304+
305+
real, dimension(:,:,:), pointer :: a, b
306+
307+
real, dimension(:,:,:), pointer :: c
308+
309+
c => a
310+
a => b
311+
b => c
312+
313+
end subroutine dswap
314+
315+
subroutine write_to_file(array, filename)
316+
real, dimension(M_LEN,N_LEN,1), intent(in) :: array
317+
character(len=*), intent(in) :: filename
318+
319+
integer :: i, j, id
320+
321+
open(newunit=id, access='stream', status='replace', file=filename)
322+
! Write this out in C ordering of array
323+
do i=1,N_LEN
324+
do j=1,M_LEN
325+
write(id) array(i,j,1)
326+
end do
327+
end do
328+
close(id)
329+
330+
end subroutine write_to_file
331+
332+
End Program SWM_Fortran_Driver

0 commit comments

Comments
 (0)