Skip to content

Commit c8a25c7

Browse files
committed
Add OpenACC directives to swm_fortran_kernels.F90
Also added the -acc=gpu flag to the Makefile. This may already be the default: adding it did not change performance, whereas -noacc slowed things down.
1 parent 40fd5ce commit c8a25c7

File tree

2 files changed

+14
-3
lines changed

2 files changed

+14
-3
lines changed

swm_fortran/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ LDLIBS = -lm
1818
all: swm_fortran swm_fortran_driver
1919

2020
nvfortran:
21-
make FC=nvfortran FFLAGS="-r8" swm_fortran_driver
21+
make FC=nvfortran FFLAGS="-r8 -acc=gpu" swm_fortran_driver
2222

2323
swm_fortran: swm_fortran.F90
2424
$(FC) $(FFLAGS) -o $@ swm_fortran.F90
@@ -37,4 +37,4 @@ swm_fortran_amrex_kernels.o: swm_fortran_amrex_kernels.F90
3737
$(FC) $(FFLAGS) -c $@ swm_fortran_amrex_kernels.F90
3838

3939
clean:
40-
rm -f swm_fortran swm_fortran_driver swm_fortran_amrex_driver *.o
40+
rm -f swm_fortran swm_fortran_driver swm_fortran_amrex_driver *.o *.mod

swm_fortran/swm_fortran_kernels.F90

+12-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ subroutine UpdateIntermediateVariablesKernel(fsdx,fsdy,p,u,v,cu,cv,h,z)
1212

1313
integer :: i,j
1414

15+
!$acc enter data copyin(p,u,v,fsdx,fsdy,cu,cv,h,z)
16+
!$acc parallel loop collapse(2) present(p,u,v,fsdx,fsdy)
1517
do j=1,size(cu,2)-1
1618
do i=1,size(cu,1)-1
1719
cu(i+1,j) = 0.5 * (p(i+1,j) + p(i,j)) * u(i+1,j)
@@ -22,6 +24,8 @@ subroutine UpdateIntermediateVariablesKernel(fsdx,fsdy,p,u,v,cu,cv,h,z)
2224
v(i,j+1) * v(i,j+1) + v(i,j) * v(i,j))
2325
end do
2426
end do
27+
!$acc exit data copyout(cu,cv,z,h)
28+
2529
end subroutine UpdateIntermediateVariablesKernel
2630

2731
subroutine UpdateNewVariablesKernel(tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z,pnew,unew,vnew)
@@ -32,6 +36,8 @@ subroutine UpdateNewVariablesKernel(tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z
3236

3337
integer :: i,j
3438

39+
!$acc enter data copyin(tdtsdx,tdtsdy,tdts8,cu,cv,z,h,pold,uold,vold,pnew,unew,vnew)
40+
!$acc parallel loop collapse(2) present(tdtsdx,tdtsdy,tdts8,cu,cv,z,h,pold,uold,vold)
3541
do j=1,size(unew,2)-1
3642
do i=1,size(unew,1)-1
3743
unew(i+1,j) = uold(i+1,j) + &
@@ -43,6 +49,8 @@ subroutine UpdateNewVariablesKernel(tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z
4349
pnew(i,j) = pold(i,j) - tdtsdx * (cu(i+1,j) - cu(i,j)) - tdtsdy * (cv(i,j+1) - cv(i,j))
4450
end do
4551
end do
52+
!$acc exit data copyout(unew,vnew,pnew)
53+
4654
end subroutine UpdateNewVariablesKernel
4755

4856
subroutine UpdateOldVariablesKernel(alpha,pnew,unew,vnew,p,u,v,pold,uold,vold)
@@ -52,14 +60,17 @@ subroutine UpdateOldVariablesKernel(alpha,pnew,unew,vnew,p,u,v,pold,uold,vold)
5260

5361
integer :: i,j
5462

63+
!$acc enter data copyin(alpha,pold,uold,vold,p,u,v,pnew,unew,vnew)
64+
!$acc parallel loop collapse(2) present(alpha,p,u,v,pnew,unew,vnew)
5565
do j=1,size(uold,2)-1
5666
do i=1,size(uold,1)-1
5767
uold(i,j) = u(i,j) + alpha*(unew(i,j) - 2. * u(i,j) + uold(i,j))
5868
vold(i,j) = v(i,j) + alpha*(vnew(i,j) - 2. * v(i,j) + vold(i,j))
5969
pold(i,j) = p(i,j) + alpha*(pnew(i,j) - 2. * p(i,j) + pold(i,j))
6070
end do
6171
end do
72+
!$acc exit data copyout(uold,vold,pold)
6273

6374
end subroutine UpdateOldVariablesKernel
6475

65-
end module SWM_Fortran_Kernels
76+
end module SWM_Fortran_Kernels

0 commit comments

Comments
 (0)