Add OpenACC directives to swm_fortran_kernels.F90

mnlevy1981 · mnlevy1981 · commit c8a25c7334a5 · 2025-02-26T11:08:30.000-07:00
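Also add the -acc=gpu flag to the nvfortran target in the Makefile. This may
already be the compiler's default: adding it did not change performance,
whereas building with -noacc slowed things down. The clean target now also
removes *.mod files.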
diff --git a/swm_fortran/Makefile b/swm_fortran/Makefile
@@ -18,7 +18,7 @@ LDLIBS = -lm
 all: swm_fortran swm_fortran_driver
 
 nvfortran:
-	make FC=nvfortran FFLAGS="-r8" swm_fortran_driver
+	make FC=nvfortran FFLAGS="-r8 -acc=gpu" swm_fortran_driver
 
 swm_fortran: swm_fortran.F90
 	$(FC) $(FFLAGS) -o $@ swm_fortran.F90
@@ -37,4 +37,4 @@ swm_fortran_amrex_kernels.o: swm_fortran_amrex_kernels.F90
 	$(FC) $(FFLAGS) -c $@ swm_fortran_amrex_kernels.F90
 
 clean:
-	rm -f swm_fortran swm_fortran_driver swm_fortran_amrex_driver *.o
+	rm -f swm_fortran swm_fortran_driver swm_fortran_amrex_driver *.o *.mod
diff --git a/swm_fortran/swm_fortran_kernels.F90 b/swm_fortran/swm_fortran_kernels.F90
@@ -12,6 +12,8 @@ subroutine UpdateIntermediateVariablesKernel(fsdx,fsdy,p,u,v,cu,cv,h,z)
 
     integer :: i,j
 
+    !$acc enter data copyin(p,u,v,fsdx,fsdy,cu,cv,h,z)
+    !$acc parallel loop collapse(2) present(p,u,v,fsdx,fsdy)
     do j=1,size(cu,2)-1
       do i=1,size(cu,1)-1
         cu(i+1,j) = 0.5 * (p(i+1,j) + p(i,j)) * u(i+1,j)
@@ -22,6 +24,8 @@ subroutine UpdateIntermediateVariablesKernel(fsdx,fsdy,p,u,v,cu,cv,h,z)
                                   v(i,j+1) * v(i,j+1) + v(i,j) * v(i,j))
       end do
     end do
+    !$acc exit data copyout(cu,cv,z,h)
+
   end subroutine UpdateIntermediateVariablesKernel
 
   subroutine UpdateNewVariablesKernel(tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z,pnew,unew,vnew)
@@ -32,6 +36,8 @@ subroutine UpdateNewVariablesKernel(tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z
 
     integer :: i,j
 
+    !$acc enter data copyin(tdtsdx,tdtsdy,tdts8,cu,cv,z,h,pold,uold,vold,pnew,unew,vnew)
+    !$acc parallel loop collapse(2) present(tdtsdx,tdtsdy,tdts8,cu,cv,z,h,pold,uold,vold)
     do j=1,size(unew,2)-1
       do i=1,size(unew,1)-1
         unew(i+1,j) = uold(i+1,j) + &
@@ -43,6 +49,8 @@ subroutine UpdateNewVariablesKernel(tdtsdx,tdtsdy,tdts8,pold,uold,vold,cu,cv,h,z
         pnew(i,j) = pold(i,j) - tdtsdx * (cu(i+1,j) - cu(i,j)) - tdtsdy * (cv(i,j+1) - cv(i,j))
       end do
     end do
+    !$acc exit data copyout(unew,vnew,pnew)
+
   end subroutine UpdateNewVariablesKernel
 
   subroutine UpdateOldVariablesKernel(alpha,pnew,unew,vnew,p,u,v,pold,uold,vold)
@@ -52,14 +60,17 @@ subroutine UpdateOldVariablesKernel(alpha,pnew,unew,vnew,p,u,v,pold,uold,vold)
 
     integer :: i,j
 
+    !$acc enter data copyin(alpha,pold,uold,vold,p,u,v,pnew,unew,vnew)
+    !$acc parallel loop collapse(2) present(alpha,p,u,v,pnew,unew,vnew)
     do j=1,size(uold,2)-1
       do i=1,size(uold,1)-1
         uold(i,j) = u(i,j) + alpha*(unew(i,j) - 2. * u(i,j) + uold(i,j))
         vold(i,j) = v(i,j) + alpha*(vnew(i,j) - 2. * v(i,j) + vold(i,j))
         pold(i,j) = p(i,j) + alpha*(pnew(i,j) - 2. * p(i,j) + pold(i,j))
       end do
     end do
+    !$acc exit data copyout(uold,vold,pold)
 
   end subroutine UpdateOldVariablesKernel
 
-end module SWM_Fortran_Kernels
+end module SWM_Fortran_Kernels