From 444e5f30848bca8846916be21a9beec3a6bb420d Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 15 Apr 2024 23:08:33 +0200 Subject: [PATCH 1/8] port trczdf to GPU --- src/General/memory.f90 | 9 +++++++-- src/General/step.f90 | 6 ++++++ src/IO/DIA_mem.f90 | 2 ++ src/PHYS/ZDF_mem.f90 | 29 ++++++++++++++++++++++++++- src/PHYS/trcadv.f90 | 8 +------- src/PHYS/trczdf.f90 | 45 +++++++++++++++++++++++++++--------------- 6 files changed, 73 insertions(+), 26 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 2af71f75..975c1480 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -516,7 +516,7 @@ subroutine alloc_tot() gphiv = huge(gphiv(1,1)) allocate(gphif(jpj,jpi)) gphif = huge(gphif(1,1)) - allocate(e1t(jpj,jpi)) + allocate(e1t(jpj,jpi)) e1t = huge(e1t(1,1)) allocate(e1u(jpj,jpi)) e1u = huge(e1u(1,1)) @@ -790,10 +790,13 @@ subroutine alloc_tot() allocate(DAY_LENGTH(jpj,jpi)) DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. + + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem #endif - + END subroutine alloc_tot @@ -802,6 +805,8 @@ subroutine clean_memory() ! myalloc (memory.f90) + !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + #ifdef key_mpp !$acc exit data delete(te_send, tw_send, tn_send, ts_send) finalize diff --git a/src/General/step.f90 b/src/General/step.f90 index 80c8ef31..e776aa14 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -265,6 +265,10 @@ SUBROUTINE trcstp ! with surface boundary condition ! with IMPLICIT vertical diffusion + ! XXX: to be removed + use DIA_mem, only: diaflx + use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,tmask,avt + IMPLICIT NONE integer jn,jk,ji,jj trcstpparttime = MPI_WTIME() ! cronometer-start @@ -296,7 +300,9 @@ SUBROUTINE trcstp IF (lbfm) CALL trcsms + !$acc update device(e1t,diaflx,e3t_back,e2t,trb,tmask,e3t,tra,avt,e3w) if (lzdf) IF (lzdf) CALL trczdf ! tracers: vertical diffusion + !$acc update host(diaflx,tra) if (lzdf) IF (lsnu) CALL snutel diff --git a/src/IO/DIA_mem.f90 b/src/IO/DIA_mem.f90 index e385f32e..223d6c27 100644 --- a/src/IO/DIA_mem.f90 +++ b/src/IO/DIA_mem.f90 @@ -50,6 +50,7 @@ SUBROUTINE alloc_DIA_local_flx() INDflxDUMP = huge(INDflxDUMP(1)) allocate(diaflx (7, Fsize, jptra )) diaflx = 0 + !$acc enter data create(diaflx) END SUBROUTINE alloc_DIA_local_flx @@ -95,6 +96,7 @@ subroutine clean_memory_dia() if (allocated(diaflx)) then deallocate(diaflx) + !$acc exit data delete(diaflx) endif if (allocated(INDflxDUMPZERO)) then diff --git a/src/PHYS/ZDF_mem.f90 b/src/PHYS/ZDF_mem.f90 index cb3fe00b..265e7004 100644 --- a/src/PHYS/ZDF_mem.f90 +++ b/src/PHYS/ZDF_mem.f90 @@ -46,6 +46,9 @@ subroutine myalloc_ZDF() jarr_zdf = huge(jarr_zdf(1,1)) allocate(jarr_zdf_flx(jpi*jpj,jpk)) jarr_zdf_flx = huge(jarr_zdf_flx(1,1)) + !$acc enter data create(jarr_zdf,jarr_zdf_flx) + !$acc update device(jarr_zdf,jarr_zdf_flx) +#ifndef _OPENACC allocate(zwd(jpk, ntids)) zwd = huge(zwd(1,1)) allocate(zws(jpk, ntids)) @@ -60,6 +63,7 @@ subroutine myalloc_ZDF() zwz = huge(zwz(1,1)) allocate(zwt(jpk, ntids)) zwt = huge(zwt(1,1)) +#endif #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -67,7 +71,28 @@ subroutine myalloc_ZDF() END subroutine myalloc_ZDF - + +#ifdef _OPENACC + subroutine myalloc_ZDF_gpu() + allocate(zwd(jpk, dimen_jvzdf)) + zwd = huge(zwd(1,1)) + allocate(zws(jpk, dimen_jvzdf)) + zws = huge(zws(1,1)) + allocate(zwi(jpk, dimen_jvzdf)) + zwi = huge(zwi(1,1)) + allocate(zwx(jpk, dimen_jvzdf)) + zwx = huge(zwx(1,1)) + allocate(zwy(jpk, dimen_jvzdf)) + zwy = huge(zwy(1,1)) + allocate(zwz(jpk, dimen_jvzdf)) + zwz = huge(zwz(1,1)) + allocate(zwt(jpk, dimen_jvzdf)) + zwt = huge(zwt(1,1)) + + !$acc enter data create(zwd,zwi,zwx,zws,zwz,zwy,zwt) + !$acc update device(zwd,zwi,zwx,zws,zwz,zwy,zwt) + END subroutine myalloc_ZDF_gpu +#endif subroutine clean_memory_zdf() @@ -82,6 +107,8 @@ subroutine clean_memory_zdf() deallocate(zwz) deallocate(zwt) + !$acc exit data delete(jarr_zdf,jarr_zdf_flx,zwd,zwi,zwx,zws,zwz,zwy,zwt) + end subroutine clean_memory_zdf diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 129613be..65d0a75b 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -174,10 +174,8 @@ SUBROUTINE trcadv !$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) @@ -353,10 +351,8 @@ SUBROUTINE trcadv !!trn could be allocate earlier !$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc enter data create(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) !$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) !$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - !$acc enter data create( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu) !$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) @@ -946,9 +942,7 @@ SUBROUTINE trcadv !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc exit data delete( tra) finalize if(use_gpu) !$acc exit data delete( trn, advmask ) finalize if(use_gpu) - !$acc exit data delete( flx_ridxt, diaflx ) finalize if(use_gpu) !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu) !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it @@ -963,7 +957,7 @@ SUBROUTINE trcadv deallocate(zbuf ) !$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu) - !$acc exit data delete( e1t, e2t, e3t, e1u, e2u, e3u, e1v, e2v, e3v, e3w, un, vn, wn ) finalize if(use_gpu) + !$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu) trcadvparttime = MPI_WTIME() - trcadvparttime trcadvtottime = trcadvtottime + trcadvparttime diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index 0a02d892..7dc67328 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -69,7 +69,7 @@ SUBROUTINE trczdf LOGICAL :: l1,l2,l3 - INTEGER :: jk,jj,ji, jn, jv, jf + INTEGER :: jk,jj,ji, jn, jv, jf, ntx ! omp variables @@ -113,7 +113,11 @@ SUBROUTINE trczdf END DO END DO END DO - + !$acc enter data create(delta_tra,int_tra) + !$acc update device(jarr_zdf,jarr_zdf_flx) +#ifdef _OPENACC + call myalloc_ZDF_gpu() +#endif ENDIF @@ -129,12 +133,19 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab + !$acc parallel loop gang vector default(present) async DO jv = 1, dimen_jvzdf ji = jarr_zdf(2,jv) jj = jarr_zdf(1,jv) Aij = e1t(jj,ji) * e2t(jj,ji) +#ifdef _OPENACC + ntx=jv +#else + ntx=1 +#endif + !! I. Vertical trends associated with lateral mixing !! ------------------------------------------------- !! (excluding the vertical flux proportional to dk[t] ) @@ -155,14 +166,14 @@ SUBROUTINE trczdf !! ... Euler time stepping when starting from rest DO jk = 1, jpkm1 z2dtt = zdt * rdt - zwi(jk, 1) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) - zws(jk, 1) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) - zwd(jk, 1) = 1. - zwi(jk, 1) - zws(jk, 1) + zwi(jk, ntx) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) + zws(jk, ntx) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) + zwd(jk, ntx) = 1. - zwi(jk, ntx) - zws(jk, ntx) END DO !! Surface boundary conditions - zwi(1,1) = 0.e0 - zwd(1,1) = 1. - zws(1,1) + zwi(1,ntx) = 0.e0 + zwd(1,ntx) = 1. - zws(1,ntx) !! II.1. Vertical diffusion on tr !! ------------------------------ @@ -170,7 +181,7 @@ SUBROUTINE trczdf !! ... Euler time stepping when starting from rest DO jk = 1, jpkm1 z2dtt = zdt * rdt - zwy(jk,1) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) + zwy(jk,ntx) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) END DO !! Matrix inversion from the first level @@ -208,22 +219,22 @@ SUBROUTINE trczdf ikstp1=ikst+1 ikenm2=jpk-2 - zwt(ikst,1)=zwd(ikst,1) + zwt(ikst,ntx)=zwd(ikst,ntx) DO jk=ikstp1,jpkm1 - zwt(jk,1)=zwd(jk,1)-zwi(jk,1)*zws(jk-1,1)/zwt(jk-1,1) + zwt(jk,ntx)=zwd(jk,ntx)-zwi(jk,ntx)*zws(jk-1,ntx)/zwt(jk-1,ntx) END DO - zwz(ikst,1)=zwy(ikst,1) + zwz(ikst,ntx)=zwy(ikst,ntx) DO jk=ikstp1,jpkm1 - zwz(jk,1)=zwy(jk,1)-zwi(jk, 1)/zwt(jk-1, 1)*zwz(jk-1, 1) + zwz(jk,ntx)=zwy(jk,ntx)-zwi(jk, ntx)/zwt(jk-1, ntx)*zwz(jk-1, ntx) END DO - zwx(jpkm1, 1)=zwz(jpkm1, 1)/zwt(jpkm1, 1) + zwx(jpkm1, ntx)=zwz(jpkm1, ntx)/zwt(jpkm1, ntx) DO jk=ikenm2,ikst,-1 - zwx(jk, 1)=( zwz(jk, 1)-zws(jk, 1)*zwx(jk+1, 1) )/zwt(jk, 1) + zwx(jk, ntx)=( zwz(jk, ntx)-zws(jk, ntx)*zwx(jk+1, ntx) )/zwt(jk, ntx) END DO ! calculate flux due to vertical diffusion (on top face of the grid cell jk) @@ -232,7 +243,7 @@ SUBROUTINE trczdf DO jk=1,jpkm1 z2dtt = zdt * rdt - delta_tra(jk) = ( zwx(jk,1) - zwy(jk,1) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) + delta_tra(jk) = ( zwx(jk,ntx) - zwy(jk,ntx) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) IF (jk .EQ. 1) THEN int_tra(1) = 0 @@ -258,16 +269,18 @@ SUBROUTINE trczdf !! (c a u t i o n: tracer not its trend, Leap-frog scheme done !! it will not be done in trcnxt) DO jk = 1, jpkm1 - tra(jk,jj,ji,jn) = zwx(jk,1) * tmask(jk,jj,ji) + tra(jk,jj,ji,jn) = zwx(jk,ntx) * tmask(jk,jj,ji) END DO END DO ! jv + !$acc end parallel loop ! end if END DO TRACER_LOOP !!!$omp end parallel do + !$acc wait trczdfparttime = MPI_WTIME() - trczdfparttime !cronometer-stop From 59e50e266253961a49007128431b7ebb8896836d Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 16 Apr 2024 09:13:08 +0200 Subject: [PATCH 2/8] reduce vector length --- src/PHYS/trczdf.f90 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index 7dc67328..a1b02ee3 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -133,7 +133,8 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab - !$acc parallel loop gang vector default(present) async + ! NOTE: kernel is too big, should be split + !$acc parallel loop gang vector default(present) async vector_length(32) DO jv = 1, dimen_jvzdf ji = jarr_zdf(2,jv) From c38170253b031c1d1ab2c7ec0ec50ccf268bf2f7 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 24 Apr 2024 08:48:10 +0200 Subject: [PATCH 3/8] move allocations, put async everywhere, use fsx macro --- src/General/memory.f90 | 7 +- src/IO/DIA_mem.f90 | 2 +- src/PHYS/ADV_mem.f90 | 6 ++ src/PHYS/trcadv.f90 | 231 +++++++++++++++++++---------------------- 4 files changed, 118 insertions(+), 128 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 975c1480..e8497bd9 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -791,7 +791,8 @@ subroutine alloc_tot() DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. - !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt,& + !$acc& e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn) #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -958,7 +959,7 @@ subroutine clean_memory() deallocate(tra_DIA_2d_IO_HIGH) deallocate(tra_PHYS_2d_IO) deallocate(tra_PHYS_2d_IO_HIGH) - + if(lwp) then deallocate(tottrn) @@ -1000,6 +1001,8 @@ subroutine clean_memory() deallocate(highfreq_table_dia) deallocate(highfreq_table_dia2d) + !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn) + end subroutine clean_memory INTEGER FUNCTION find_index_var(string) diff --git a/src/IO/DIA_mem.f90 b/src/IO/DIA_mem.f90 index 223d6c27..4b4a896a 100644 --- a/src/IO/DIA_mem.f90 +++ b/src/IO/DIA_mem.f90 @@ -50,7 +50,7 @@ SUBROUTINE alloc_DIA_local_flx() INDflxDUMP = huge(INDflxDUMP(1)) allocate(diaflx (7, Fsize, jptra )) diaflx = 0 - !$acc enter data create(diaflx) + !$acc enter data create(flx_ridxt,diaflx) END SUBROUTINE alloc_DIA_local_flx diff --git a/src/PHYS/ADV_mem.f90 b/src/PHYS/ADV_mem.f90 index 9d201de7..d1bba762 100644 --- a/src/PHYS/ADV_mem.f90 +++ b/src/PHYS/ADV_mem.f90 @@ -114,6 +114,9 @@ subroutine myalloc_ADV() mem_all=get_mem(err) - aux_mem #endif + !$acc enter data create(advmask,zaa,zbb,zcc,inv_eu,inv_ev,inv_et,& + !$acc& big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr) + END subroutine myalloc_ADV !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1 @@ -133,6 +136,9 @@ subroutine clean_memory_adv() deallocate(big_fact_zcc) deallocate(zbtr_arr) + !$acc exit data delete(advmask,zaa,zbb,zcc,inv_eu,inv_ev,inv_et,& + !$acc& big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr) + end subroutine clean_memory_adv diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 65d0a75b..79450638 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -1,3 +1,8 @@ +#ifdef _OPENACC +! BUG?: the fsx routine causes additional H2D copies +#define fsx(pfx1, pfx2, pfu) ((((pfu) + abs(pfu)) * (pfx1) + ((pfu) - abs(pfu)) * (pfx2)) * 0.5) +#endif + SUBROUTINE trcadv !!!--------------------------------------------------------------------- !!! @@ -68,7 +73,7 @@ SUBROUTINE trcadv !! monthly weather review, pp 479-486 !! LOGICAL :: MPI_CHECK,l1,l2,l3 - INTEGER :: jk,jj,ji,jt,jn,jf,ju + INTEGER :: jk,jj,ji,jt,jn,jf,ju,queue double precision :: zbtr,zdt double precision :: junk, junki, junkj, junkk double precision :: timer @@ -78,10 +83,16 @@ SUBROUTINE trcadv double precision, allocatable,dimension(:,:,:) :: zx,zy,zz,zbuf double precision, allocatable,dimension(:,:,:) :: zkx,zky,zkz logical :: use_gpu + + queue=1 trcadvparttime = MPI_WTIME() +#ifdef _OPENACC use_gpu=.true. +#else + use_gpu=.false. +#endif !------------------------------------------------------------------- @@ -159,8 +170,22 @@ SUBROUTINE trcadv adv_initialized=.true. + endif + !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it + allocate(zy(jpk,jpj,jpi)) + allocate(zx(jpk,jpj,jpi)) + allocate(zz(jpk,jpj,jpi)) + allocate(ztj(jpk,jpj,jpi)) + allocate(zti(jpk,jpj,jpi)) + allocate(zkx(jpk,jpj,jpi)) + allocate(zky(jpk,jpj,jpi)) + allocate(zkz(jpk,jpj,jpi)) + allocate(zbuf(jpk,jpj,jpi)) + + !$acc enter data create(zy,zx,zz,ztj,zti,zkx,zky,zkz,zbuf) + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! end initialization phase jk=1 @@ -168,28 +193,24 @@ SUBROUTINE trcadv zdt = rdt*ndttrc !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1) shared(zbtr_arr,e1t,e2t,e3t) default(none) + !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update device( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) + !$acc update device( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update device( zbtr_arr(1:jpk,1:jpj,1:jpi) ) - !$acc enter data create( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) + !$acc update device( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) + !$acc update device( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) + !$acc update device( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) + !$acc update device( e3w(1:jpk,1:jpj,1:jpi) ) + !$acc update device( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) - !$acc update device( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) + !$acc update device(tra(1:jpk,1:jpj,1:jpi,1:jptra)) + !$acc update device(trn(1:jpk,1:jpj,1:jpi,1:jptra)) + !$acc update device(advmask(1:jpk,1:jpj,1:jpi)) + !$acc update device(flx_ridxt(1:Fsize,1:4)) + !$acc update device( diaflx(1:7, 1:Fsize, 1:jptra)) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -204,7 +225,7 @@ SUBROUTINE trcadv !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1,jpi,jpj,jpk) default(none) & !$OMP shared(zdt,zaa,inv_eu,e1u,e2u,e3u,un,big_fact_zaa) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -216,7 +237,7 @@ SUBROUTINE trcadv !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -227,7 +248,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -244,7 +265,7 @@ SUBROUTINE trcadv !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1,jpi,jpj,jpk) default(none) & !$OMP shared(inv_ev,e1v,e2v,e3v,vn,zdt,zbb,big_fact_zbb) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -255,7 +276,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -266,7 +287,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -281,7 +302,7 @@ SUBROUTINE trcadv !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1,jpi,jpj,jpk) default(none) & !$OMP shared(inv_et,e1t,e2t,e3w,wn,zcc,zdt,big_fact_zcc) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -292,7 +313,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -303,7 +324,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -317,55 +338,10 @@ SUBROUTINE trcadv !$OMP TASKWAIT - !$acc update host( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !! tracer loop parallelized (macrotasking) !! ======================================= - - - !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it - allocate(zy(jpk,jpj,jpi)) - allocate(zx(jpk,jpj,jpi)) - allocate(zz(jpk,jpj,jpi)) - allocate(ztj(jpk,jpj,jpi)) - allocate(zti(jpk,jpj,jpi)) - allocate(zkx(jpk,jpj,jpi)) - allocate(zky(jpk,jpj,jpi)) - allocate(zkz(jpk,jpj,jpi)) - allocate(zbuf(jpk,jpj,jpi)) - - zy(:,:,:) = 0 - zz(:,:,:) = 0 - zx(:,:,:) = 0 - ztj(:,:,:)= 0 - zti(:,:,:)= 0 - zbuf(:,:,:) = 0. - zkx(:,:,:)=0. - zky(:,:,:)=0. - zkz(:,:,:)=0. - - !!trn could be allocate earlier - !$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) - !$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - - !$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !$acc update device(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc update device(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc update device(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) - !$acc update device(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - !$acc update device( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu) - - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1, jpi DO jj = 1, jpj DO jk = 1, jpk @@ -382,6 +358,7 @@ SUBROUTINE trcadv ENDDO ENDDO !$acc end kernels + !$acc wait(queue) !$omp taskloop default(none) private(jf,junk,junki,junkj,junkk,zbtr) & !$omp private(zkx,zky,zkz,zti,ztj,zx,zy,zz,zbuf) shared(diaflx,jarrt,tra,zdt) & @@ -400,8 +377,8 @@ SUBROUTINE trcadv !! and mass fluxes calculated above !! calcul of tracer flux in the i and j direction - !$acc kernels async default(present) if(use_gpu) - !$acc loop independent + + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !dir$ vector aligned DO jj = 2,jpjm1 @@ -410,7 +387,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !dir$ vector aligned !$acc loop independent @@ -420,7 +397,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -430,7 +407,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -440,7 +417,7 @@ SUBROUTINE trcadv END DO !$acc end kernels ! loop unfusion - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -450,7 +427,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -460,7 +437,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) !$acc loop independent DO ji = 2,jpim1 DO jj = 2,jpjm1 @@ -472,7 +449,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !$acc loop independent DO jj = 2,jpjm1 @@ -484,7 +461,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc parallel loop async collapse(3) gang vector default(present) if(use_gpu) + !$acc parallel loop collapse(3) gang vector default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -494,7 +471,7 @@ SUBROUTINE trcadv END DO END DO !$acc end parallel loop - !$acc wait + !$acc wait(queue) ! ... Lateral boundary conditions on zk[xy] #ifdef key_mpp @@ -522,7 +499,7 @@ SUBROUTINE trcadv !! 2. calcul of after field using an upstream advection scheme !! ----------------------------------------------------------- - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji =2,jpim1 DO jj =2,jpjm1 DO jk =1,jpkm1 @@ -535,7 +512,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -561,7 +538,7 @@ SUBROUTINE trcadv if(jt .EQ. 1) then if(ncor .EQ. 1) then - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -573,7 +550,7 @@ SUBROUTINE trcadv !$acc end kernels else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -585,7 +562,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -600,7 +577,7 @@ SUBROUTINE trcadv endif else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -611,7 +588,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -623,8 +600,7 @@ SUBROUTINE trcadv !$acc end kernels endif - - + !$acc wait(queue) !! ... Lateral boundary conditions on zti #ifdef key_mpp @@ -644,7 +620,7 @@ SUBROUTINE trcadv !jk = 1 ! DO jk = 1,jpkm1 - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 junk = zti(1,jj,ji ) @@ -657,7 +633,7 @@ SUBROUTINE trcadv !$acc end kernels !DO ju=1, dimen_jarr2 - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -683,6 +659,7 @@ SUBROUTINE trcadv !$acc end kernels ! endif + !$acc wait(queue) ! ... Lateral boundary conditions on z[xyz] #ifdef key_mpp @@ -709,7 +686,7 @@ SUBROUTINE trcadv !! 2.5 calcul of the final field: !! advection by antidiffusive mass fluxes and an upstream scheme - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) !$acc loop independent DO ji = 2,jpim1 !dir$ vector aligned @@ -719,7 +696,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !dir$ vector aligned !$acc loop independent @@ -729,7 +706,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -739,7 +716,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -749,7 +726,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -759,7 +736,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -769,7 +746,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) !$acc loop independent DO ji = 2,jpim1 DO jj = 2,jpjm1 @@ -783,7 +760,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !$acc loop independent DO jj = 2,jpjm1 @@ -797,7 +774,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -811,6 +788,7 @@ SUBROUTINE trcadv END DO !$acc end kernels + !$acc wait(queue) !... Lateral boundary conditions on zk[xy] #ifdef key_mpp @@ -833,7 +811,7 @@ SUBROUTINE trcadv if(ncor .EQ. 1) then - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji =2,jpim1 DO jj =2,jpjm1 DO jk =1,jpkm1 @@ -844,7 +822,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -859,7 +837,7 @@ SUBROUTINE trcadv else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji =2,jpim1 DO jj =2,jpjm1 DO jk =1,jpkm1 @@ -870,7 +848,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -882,8 +860,8 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - endif + !$acc wait(queue) ENDDO ANTIDIFF_CORR @@ -894,7 +872,7 @@ SUBROUTINE trcadv if(ncor .EQ. 1) then - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) do ji=1,jpi do jj=1,jpj do jk=1,jpk @@ -906,7 +884,7 @@ SUBROUTINE trcadv else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) do ji=1,jpi do jj=1,jpj do jk=1,jpk @@ -932,19 +910,24 @@ SUBROUTINE trcadv END DO TRACER_LOOP + !$acc wait(queue) + !$OMP end taskloop - !$acc update host( diaflx(1:7, 1:Fsize, 1:jptra) ) if(use_gpu) - !$acc update host( tra(1:jpk,1:jpj,1:jpi,1:jptra) ) if(use_gpu) + !$acc update host( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update host( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) + !$acc update host( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update host( zbtr_arr(1:jpk,1:jpj,1:jpi) ) - !$acc update host( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) + !$acc update host( diaflx(1:7, 1:Fsize, 1:jptra) ) + !$acc update host( tra(1:jpk,1:jpj,1:jpi,1:jptra) ) - !$acc exit data delete( trn, advmask ) finalize if(use_gpu) - !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu) + !$acc update host( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) + !$acc update host( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) + !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) + !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) + !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it deallocate(zy ) deallocate(zx ) @@ -956,24 +939,22 @@ SUBROUTINE trcadv deallocate(zkz ) deallocate(zbuf ) - !$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu) - !$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu) - trcadvparttime = MPI_WTIME() - trcadvparttime trcadvtottime = trcadvtottime + trcadvparttime !!!! contains +#ifndef _OPENACC double precision function fsx(pfx1, pfx2, pfu) !dir$ attributes vector :: fsx - !$acc routine seq IMPLICIT NONE double precision, INTENT(IN) :: pfx1, pfx2, pfu double precision :: abspfu abspfu = abs(pfu) fsx = ((pfu + abspfu) * pfx1 + (pfu - abspfu) * pfx2) * 0.5 end function fsx +#endif END SUBROUTINE trcadv From 1c99915cc369c1c8cce7e36412be28cc7f8607f5 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 24 Apr 2024 15:32:51 +0200 Subject: [PATCH 4/8] fix serial kernel --- src/MPI/ogstm_mpi.f90 | 2 ++ src/PHYS/trcadv.f90 | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MPI/ogstm_mpi.f90 b/src/MPI/ogstm_mpi.f90 index b6c0bcd4..f25a754a 100644 --- a/src/MPI/ogstm_mpi.f90 +++ b/src/MPI/ogstm_mpi.f90 @@ -413,6 +413,7 @@ SUBROUTINE mpplnk_my_openacc(ptab,gpu) !$acc kernels default(present) if(use_gpu) IF(nbondj.eq.0.or.nbondj.eq.1) THEN ! All but south boundary, we received from south + !$acc loop independent DO jw=1,SOUTH_count_recv ji = SOUTHpoints_recv(1,jw) jk = SOUTHpoints_recv(2,jw) @@ -423,6 +424,7 @@ SUBROUTINE mpplnk_my_openacc(ptab,gpu) IF(nbondj.eq.-1.or.nbondj.eq.0) THEN ! All but north boundary, we received from north + !$acc loop independent DO jw=1,NORTH_count_recv ji = NORTHpoints_recv(1,jw) jk = NORTHpoints_recv(2,jw) diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 79450638..28e638c9 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -861,7 +861,6 @@ SUBROUTINE trcadv !$acc end kernels endif - !$acc wait(queue) ENDDO ANTIDIFF_CORR From 539824a69ea9bbe969d21bbe8cef345ba71c1699 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 11:35:10 +0200 Subject: [PATCH 5/8] remove duplicated mpplnk_my_openacc routine, just use mpplnk_my --- src/MPI/ogstm_mpi.f90 | 163 +----------------------------------------- src/PHYS/trcadv.f90 | 37 +++------- 2 files changed, 9 insertions(+), 191 deletions(-) diff --git a/src/MPI/ogstm_mpi.f90 b/src/MPI/ogstm_mpi.f90 index f25a754a..1cf8a3e2 100644 --- a/src/MPI/ogstm_mpi.f90 +++ b/src/MPI/ogstm_mpi.f90 @@ -118,168 +118,7 @@ SUBROUTINE mynode !! nono : number for local neighboring processors !! !!---------------------------------------------------------------------- - SUBROUTINE mpplnk_my(ptab) - - double precision ptab(jpk,jpj,jpi) - - -#ifdef key_mpp_mpi - - INTEGER jk,jj,ji - INTEGER reqs1, reqs2, reqr1, reqr2 - INTEGER reqs3, reqs4, reqr3, reqr4 - INTEGER jw, packsize - -!! trcadvparttime = MPI_WTIME() - -!! -!!2. East and west directions exchange -!!------------------------------------ - - - -!! -!!2.2 Migrations -!! -!! -! 3 4 -! | ^ -! | | -! v | -! ________________ -! | | -! 1<-- | | 1 <-- -! 2--> | | 2 --> -! |________________| -! 3 4 -! | ^ -! | | -! v | - - packsize=jpk*jpj - - IF(nbondi.eq.-1) THEN ! We are at the west side of the domain - - CALL mppsend(2,ptab(:,:,jpi-1),packsize,noea,0,reqs1) - CALL mpprecv(1,ptab(:,:, jpi),packsize,reqr1) - - ELSE IF(nbondi.eq.0) THEN - CALL mppsend(1, ptab(:,: ,2),packsize,nowe,0,reqs1) - CALL mppsend(2, ptab(:,:,jpi-1),packsize,noea,0,reqs2) - - CALL mpprecv(1,ptab(:,:,jpi),packsize,reqr1) - CALL mpprecv(2,ptab(:,:, 1),packsize,reqr2) - - ELSE IF(nbondi.eq.1) THEN ! We are at the east side of the domain - - CALL mppsend(1,ptab(:,:,2), packsize, nowe,0, reqs1) - CALL mpprecv(2,ptab(:,:,1), packsize, reqr1) - - - ENDIF - - -!! -!! -!!3. North and south directions -!!----------------------------- -!! -!!3.1 Read Dirichlet lateral conditions -!! - - - IF(nbondj.eq.0.or.nbondj.eq.-1) THEN - DO jw=1,NORTH_count_send - ji = NORTHpoints_send(1,jw) - jk = NORTHpoints_send(2,jw) - tn_send(jw) = ptab(jk,jpj-1,ji) - ENDDO - ENDIF - IF(nbondj.eq.0.or.nbondj.eq.1) THEN - DO jw=1,SOUTH_count_send - ji = SOUTHpoints_send(1,jw) - jk = SOUTHpoints_send(2,jw) - ts_send(jw) = ptab(jk,2,ji) - ENDDO - - - ENDIF! PACK_LOOP4 - - -!! -!!2.2 Migrations -!! -!! - - IF(nbondj.eq.-1) THEN ! We are at the south side of the domain - CALL mppsend(4,tn_send,NORTH_count_send,nono,0,reqs4) - CALL mpprecv(3,tn_recv,NORTH_count_recv,reqr3) - CALL mppwait(reqs4) - CALL mppwait(reqr3) - ELSE IF(nbondj.eq.0) THEN - CALL mppsend(4, tn_send,NORTH_count_send,nono,0,reqs4) - CALL mppsend(3, ts_send,SOUTH_count_send,noso,0,reqs3) - CALL mpprecv(3,tn_recv,NORTH_count_recv,reqr3) - CALL mpprecv(4,ts_recv,SOUTH_count_recv,reqr4) - - CALL mppwait(reqs4) - CALL mppwait(reqs3) - CALL mppwait(reqr3) - CALL mppwait(reqr4) - ELSE IF(nbondj.eq.1) THEN ! We are at the north side of the domain - CALL mppsend(3,ts_send, SOUTH_count_send, noso,0, reqs3) - CALL mpprecv(4,ts_recv, SOUTH_count_recv, reqr4) - CALL mppwait(reqs3) - CALL mppwait(reqr4) - ENDIF - - - -!! -!!2.3 Write Dirichlet lateral conditions -!! - - IF(nbondj.eq.0.or.nbondj.eq.1) THEN ! All but south boundary, we received from south - - DO jw=1,SOUTH_count_recv - ji = SOUTHpoints_recv(1,jw) - jk = SOUTHpoints_recv(2,jw) - ptab(jk,1,ji)= ts_recv(jw) - ENDDO - - ENDIF - - IF(nbondj.eq.-1.or.nbondj.eq.0) THEN ! All but north boundary, we received from north - - DO jw=1,NORTH_count_recv - ji = NORTHpoints_recv(1,jw) - jk = NORTHpoints_recv(2,jw) - ptab(jk,jpj,ji)= tn_recv(jw) - ENDDO - - ENDIF ! PACK_LOOP5 - - -!!! East - West waits - - IF(nbondi.eq.-1) THEN ! We are at the west side of the domain - CALL mppwait(reqs1) - CALL mppwait(reqr1) - ELSE IF(nbondi.eq.0) THEN - CALL mppwait(reqs1) - CALL mppwait(reqs2) - CALL mppwait(reqr1) - CALL mppwait(reqr2) - ELSE IF(nbondi.eq.1) THEN ! We are at the east side of the domain - CALL mppwait(reqs1) - CALL mppwait(reqr1) - ENDIF - -#endif - -END SUBROUTINE - -SUBROUTINE mpplnk_my_openacc(ptab,gpu) +SUBROUTINE mpplnk_my(ptab,gpu) double precision ptab(jpk,jpj,jpi) #ifdef key_mpp_mpi diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 28e638c9..9fa07701 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -478,14 +478,8 @@ SUBROUTINE trcadv ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zkx) - CALL mpplnk_my(zky) -#else - CALL mpplnk_my_openacc(zkx,gpu=use_gpu) - CALL mpplnk_my_openacc(zky,gpu=use_gpu) - -#endif + CALL mpplnk_my(zkx,gpu=use_gpu) + CALL mpplnk_my(zky,gpu=use_gpu) #else @@ -605,11 +599,7 @@ SUBROUTINE trcadv !! ... Lateral boundary conditions on zti #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zti) -#else - CALL mpplnk_my_openacc(zti,gpu=use_gpu) -#endif + CALL mpplnk_my(zti,gpu=use_gpu) #else ! ... T-point, 3D array, full local array zti is initialised CALL lbc( zti(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) @@ -665,15 +655,9 @@ SUBROUTINE trcadv #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zx) - CALL mpplnk_my(zy) - CALL mpplnk_my(zz) -#else - CALL mpplnk_my_openacc(zx,gpu=use_gpu) - CALL mpplnk_my_openacc(zy,gpu=use_gpu) - CALL mpplnk_my_openacc(zz,gpu=use_gpu) -#endif + CALL mpplnk_my(zx,gpu=use_gpu) + CALL mpplnk_my(zy,gpu=use_gpu) + CALL mpplnk_my(zz,gpu=use_gpu) #else ! ... T-point, 3D array, full local array z[xyz] are initialised @@ -793,13 +777,8 @@ SUBROUTINE trcadv !... Lateral boundary conditions on zk[xy] #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zkx) - CALL mpplnk_my(zky) -#else - CALL mpplnk_my_openacc(zkx,gpu=use_gpu) - CALL mpplnk_my_openacc(zky,gpu=use_gpu) -#endif + CALL mpplnk_my(zkx,gpu=use_gpu) + CALL mpplnk_my(zky,gpu=use_gpu) #else ! ... T-point, 3D array, full local array zk[xy] are initialised CALL lbc( zkx(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) From 8542c330acce1ca86a7a269eec13992b469b75be Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 11:35:44 +0200 Subject: [PATCH 6/8] port trchdf on GPU --- src/General/memory.f90 | 8 +++--- src/General/step.f90 | 6 ++-- src/PHYS/trchdf.f90 | 62 +++++++++++++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index e8497bd9..c58c5f87 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -791,8 +791,8 @@ subroutine alloc_tot() DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. - !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt,& - !$acc& e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn) + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,& + !$acc& vmask,avt,e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn,ahtt) #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -806,7 +806,7 @@ subroutine clean_memory() ! myalloc (memory.f90) - !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,vmask,avt) #ifdef key_mpp @@ -1001,7 +1001,7 @@ subroutine clean_memory() deallocate(highfreq_table_dia) deallocate(highfreq_table_dia2d) - !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn) + !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn, ahtt) end subroutine clean_memory diff --git a/src/General/step.f90 b/src/General/step.f90 index e776aa14..3009b5d7 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -266,8 +266,8 @@ SUBROUTINE trcstp ! with IMPLICIT vertical diffusion ! XXX: to be removed - use DIA_mem, only: diaflx - use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,tmask,avt + use DIA_mem, only: diaflx,flx_ridxt + use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,umask,vmask,tmask,avt,ahtt IMPLICIT NONE integer jn,jk,ji,jj @@ -293,7 +293,9 @@ SUBROUTINE trcstp ! tracers: horizontal diffusion IF namelist flags are activated ! ----------------------------- + !$acc update device(umask,vmask,tmask,trb,ahtt,tra,diaflx,flx_ridxt) if(lhdf) IF (lhdf) CALL trchdf + !$acc update host(diaflx,tra) if(lhdf) ! tracers: sink and source (must be parallelized on vertical slab) IF (lsbc) CALL trcsbc ! surface cell processes, default lsbc = False diff --git a/src/PHYS/trchdf.f90 b/src/PHYS/trchdf.f90 index a70bfc9d..6bb1d1aa 100644 --- a/src/PHYS/trchdf.f90 +++ b/src/PHYS/trchdf.f90 @@ -109,10 +109,19 @@ SUBROUTINE trchdf INTEGER :: locsum,jklef,jjlef,jilef,jkrig,jjrig,jirig !INTEGER, allocatable :: jarr_hdf(:,:,:),jarr_hdf_flx(:) double precision, allocatable,dimension(:,:,:) :: zlt, ztu, ztv + integer :: queue + logical :: use_gpu !!---------------------------------------------------------------------- !! statement functions !! =================== + +#ifdef _OPENACC + use_gpu=.true. +#else + use_gpu=.false. +#endif + ! #include "BFM_var_list.h" trcbilaphdfparttime = MPI_WTIME() @@ -132,8 +141,9 @@ SUBROUTINE trchdf allocate(zeev (jpk,jpj,jpi )) zeev = huge(zeev(1,1,1)) allocate(zbtr (jpk,jpj,jpi )) - zbtr = huge(zbtr(1,1,1)) + zbtr = huge(zbtr(1,1,1)) + !$acc enter data create(hdfmask,zeeu,zeev,zbtr) DO ji = 1,jpi DO jj = 1,jpj @@ -168,10 +178,20 @@ SUBROUTINE trchdf END DO END DO + queue=1 + !$acc update device(hdfmask) hdf_initialized=.true. + ENDIF + + allocate(zlt (jpk,jpj,jpi)) + allocate(ztu (jpk,jpj,jpi)) + allocate(ztv (jpk,jpj,jpi)) + !$acc enter data create(zlt,ztu,ztv) + ! Metric arrays calculated out of the initialisation phase(for z- or s-coordinates) ! !! ---------------------------------- + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1, jpi DO jj = 1, jpj DO jk=1,jpk @@ -182,13 +202,11 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop - allocate(zlt (jpk,jpj,jpi)) - allocate(ztu (jpk,jpj,jpi)) - allocate(ztv (jpk,jpj,jpi)) !! tracer slab !! ============= @@ -199,9 +217,11 @@ SUBROUTINE trchdf TRACER_LOOP: DO jn = 1, jptra + !$acc kernels default(present) async(queue) zlt = 0. ztu = 0. ztv = 0. + !$acc end kernels !! 1. Laplacian !! ------------ @@ -215,6 +235,7 @@ SUBROUTINE trchdf ! jk = jarr_hdf(1,jv,1a) ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -228,8 +249,10 @@ SUBROUTINE trchdf END DO END DO ! $OMP END TASK + !$acc end parallel loop ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -242,6 +265,7 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop ! $OMP END TASK ! $OMP TASKWAIT @@ -254,6 +278,7 @@ SUBROUTINE trchdf ! ji = jarr_hdf(3,jv,2) ! jj = jarr_hdf(2,jv,2) ! jk = jarr_hdf(1,jv,2) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 2,jpi-1 DO jj = 2,jpj-1 DO jk = 1,jpk @@ -266,6 +291,7 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop @@ -274,11 +300,12 @@ SUBROUTINE trchdf !! ... Lateral boundary conditions on the laplacian (zlt,zls) + !$acc wait(queue) #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors - - CALL mpplnk_my(zlt) + CALL mpplnk_my(zlt,gpu=use_gpu) #else +#error CALL lbc( zlt(:,:,:), 1, 1, 1, 1, jpk, 1 ) #endif @@ -291,6 +318,7 @@ SUBROUTINE trchdf !!!&omp& dimen_jvhdf3,zta,zbtr,tra,jarr_hdf_flx,diaflx,Fsize) ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -301,9 +329,11 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop ! $OMP END TASK ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -315,6 +345,7 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop ! $OMP END TASK !! ... fourth derivative (divergence) and add to the general tracer trend @@ -326,7 +357,8 @@ SUBROUTINE trchdf ! jk = jarr_hdf(1,jv,2) ! jf = jarr_hdf_flx(jv) - ! $OMP TASKWAIT + ! $OMP TASKWAIT + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 DO jk = 1,jpk @@ -339,9 +371,10 @@ SUBROUTINE trchdf !tra(jk,jj,ji,jn ) = tra(jk,jj,ji,jn ) + zta END DO END DO - END DO - + END DO + !$acc end parallel loop + !$acc parallel loop gang vector default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -351,6 +384,7 @@ SUBROUTINE trchdf diaflx(6,jf, jn) = diaflx(6,jf, jn) - ztv(jk,jj,ji)*rdt ENDDO + !$acc end parallel loop @@ -359,16 +393,18 @@ SUBROUTINE trchdf END DO TRACER_LOOP ! $OMP END TASKLOOP + !$acc wait(queue) ! deallocate(hdfmask) ! deallocate(zeeu) ! deallocate(zeev) ! deallocate(zbtr) - deallocate(zlt) - deallocate(ztu) - deallocate(ztv) - + !$acc exit data delete(zlt,ztu,ztv) + deallocate(zlt) + deallocate(ztu) + deallocate(ztv) + trcbilaphdfparttime = MPI_WTIME() - trcbilaphdfparttime trcbilaphdftottime = trcbilaphdftottime + trcbilaphdfparttime From 277e62372d50d563ead8d8ff5b8ad9fe12e1fe26 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 17:33:52 +0200 Subject: [PATCH 7/8] trcave GPU port --- src/BIO/trclec.f90 | 3 +++ src/General/memory.f90 | 10 +++++++-- src/PHYS/trcave.f90 | 47 +++++++++++++++++++++++++++++++++++------- src/PHYS/trczdf.f90 | 3 ++- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/src/BIO/trclec.f90 b/src/BIO/trclec.f90 index d6af01f4..b99be8a3 100644 --- a/src/BIO/trclec.f90 +++ b/src/BIO/trclec.f90 @@ -141,6 +141,7 @@ SUBROUTINE trclec if (ctr_hf(ji).eq.1) jptra_high = jptra_high + 1 enddo allocate(highfreq_table(jptra_HIGH)) + !$acc enter data create(highfreq_table) highfreq_table = huge(highfreq_table(1)) jptra_high = 0 @@ -165,6 +166,7 @@ SUBROUTINE trclec if (lwp) write(*,*) 'High freq diagnostics number :', jptra_dia_HIGH allocate(highfreq_table_dia(jptra_dia_HIGH)) + !$acc enter data create(highfreq_table_dia) jptra_dia_high = 0 @@ -188,6 +190,7 @@ SUBROUTINE trclec if (lwp) write(*,*) 'High freq diagnostics number 2d:', jptra_dia2d_HIGH allocate(highfreq_table_dia2d(jptra_dia2d_HIGH)) + !$acc enter data create(highfreq_table_dia2d) jptra_dia2d_high = 0 diff --git a/src/General/memory.f90 b/src/General/memory.f90 index c58c5f87..96db7c26 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -792,7 +792,10 @@ subroutine alloc_tot() forcing_phys_initialized = .false. !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,& - !$acc& vmask,avt,e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn,ahtt) + !$acc& vmask,avt,e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn,ahtt,traIO,traIO_HIGH,& + !$acc& snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,sn,tn,& + !$acc& tra_DIA,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,& + !$acc& vatm,emp,qsr,tra_DIA_2d_IO_HIGH) #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -997,11 +1000,14 @@ subroutine clean_memory() ! trclec + !$acc exit data delete(highfreq_table,highfreq_table_dia,highfreq_table_dia2d) deallocate(highfreq_table) deallocate(highfreq_table_dia) deallocate(highfreq_table_dia2d) - !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn, ahtt) + !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn,& + !$acc& ahtt, traio,traIO_HIGH,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,sn,tn,& + !$acc& tra_DIA,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,vatm,emp,qsr,tra_DIA_2d_IO_HIGH) end subroutine clean_memory diff --git a/src/PHYS/trcave.f90 b/src/PHYS/trcave.f90 index 8d8313a5..617a0212 100644 --- a/src/PHYS/trcave.f90 +++ b/src/PHYS/trcave.f90 @@ -10,14 +10,19 @@ SUBROUTINE trcave integer :: jn_high, jn_on_all double precision :: Miss_val =1.e20 double precision :: elapsed_time, inv_incremented_time + integer :: queue ave_partTime = MPI_WTIME() + queue=1 ! FIRST, LOW FREQUENCY elapsed_time = elapsed_time_2 inv_incremented_time = 1./(elapsed_time_2 + rdt) + !$acc update device(traIO,trn,umask,vmask,tmask,traIO_HIGH,highfreq_table,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,sn,tn,wn,avt,e3t,un,vn,tra_DIA_IO,tra_DIA,tra_DIA_2d_IO,tra_DIA_2d,vatmIO,empIO,qsrIO,vatm,emp,qsr,highfreq_table_dia,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH,highfreq_table_dia2d) + + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn=1 ,jptra DO ji=1, jpi @@ -33,6 +38,7 @@ SUBROUTINE trcave END DO END DO + !$acc end parallel loop @@ -40,15 +46,21 @@ SUBROUTINE trcave elapsed_time = elapsed_time_1 inv_incremented_time = 1./(elapsed_time_1 + rdt)! ****************** HIGH FREQUENCY + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn_high=1 ,jptra_high +#ifndef _OPENACC jn_on_all = highfreq_table(jn_high) +#endif DO ji=1, jpi DO jj=1, jpj DO jk=1, jpk +#ifdef _OPENACC + jn_on_all = highfreq_table(jn_high) +#endif IF(tmask(jk,jj,ji) .NE. 0.) THEN traIO_HIGH(jk,jj,ji,jn_high )= & & (traIO_HIGH(jk,jj,ji,jn_high )*elapsed_time+trn(jk,jj,ji,jn_on_all)*rdt)*inv_incremented_time @@ -61,6 +73,7 @@ SUBROUTINE trcave END DO + !$acc end parallel loop ! ***************** PHYS ***************************************************** @@ -73,6 +86,7 @@ SUBROUTINE trcave endif + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji=1, jpi DO jj=1, jpj DO jk=1, jpk @@ -107,7 +121,9 @@ SUBROUTINE trcave END DO END DO END DO + !$acc end parallel loop + !$acc parallel loop gang vector collapse(2) default(present) async(queue) DO jj=1, jpj DO ji=1, jpi IF (tmask(1,jj,ji) .NE. 0.) THEN @@ -121,6 +137,7 @@ SUBROUTINE trcave ENDIF END DO END DO + !$acc end parallel loop ! ***************** END PHYS ************************************************* @@ -135,6 +152,7 @@ SUBROUTINE trcave inv_incremented_time = 1./(elapsed_time_2 + rdt) + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn = 1,jptra_dia DO ji=1, jpi DO jj=1, jpj @@ -148,10 +166,12 @@ SUBROUTINE trcave END DO END DO ENDDO + !$acc end parallel loop ! ********************* DIAGNOSTICS 2D ********** + !$acc parallel loop gang vector collapse(2) default(present) async(queue) DO ji=1, jpi DO jj=1, jpj IF(tmask(1,jj,ji) .NE. 0.) THEN ! Warning ! Tested only for surface @@ -162,6 +182,7 @@ SUBROUTINE trcave ENDIF END DO END DO + !$acc end parallel loop @@ -172,11 +193,17 @@ SUBROUTINE trcave if (jptra_dia_high.gt.0) THEN + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn_high=1, jptra_dia_high +#ifndef _OPENACC jn_on_all = highfreq_table_dia(jn_high ) +#endif DO ji=1, jpi DO jj=1, jpj DO jk=1, jpk +#ifdef _OPENACC + jn_on_all = highfreq_table_dia(jn_high ) +#endif IF(tmask(jk,jj,ji) .NE. 0.) THEN tra_DIA_IO_HIGH(jk,jj,ji,jn_high )= & & (tra_DIA_IO_HIGH(jk,jj,ji,jn_high )*elapsed_time+tra_DIA(jk,jj,ji,jn_on_all)*rdt)*inv_incremented_time @@ -187,6 +214,7 @@ SUBROUTINE trcave END DO END DO END DO + !$acc end parallel loop endif @@ -194,24 +222,29 @@ SUBROUTINE trcave if (jptra_dia2d_high.gt.0) THEN + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji=1, jpi DO jj=1, jpj - IF(tmask(1,jj,ji) .NE. 0.) THEN DO jn_high=1, jptra_dia2d_high - jn_on_all = highfreq_table_dia2d(jn_high) - tra_DIA_2d_IO_HIGH(jn_high,jj,ji)= & - & (tra_DIA_2d_IO_HIGH(jn_high,jj,ji)*elapsed_time+tra_DIA_2d(jn_on_all,jj,ji)*rdt)*inv_incremented_time + IF(tmask(1,jj,ji) .NE. 0.) THEN + jn_on_all = highfreq_table_dia2d(jn_high) + tra_DIA_2d_IO_HIGH(jn_high,jj,ji)= & + & (tra_DIA_2d_IO_HIGH(jn_high,jj,ji)*elapsed_time+tra_DIA_2d(jn_on_all,jj,ji)*rdt)*inv_incremented_time + ELSE + tra_DIA_2d_IO_HIGH(jn_high,jj,ji)=Miss_val + ENDIF END DO - ELSE - tra_DIA_2d_IO_HIGH(:,jj,ji)=Miss_val - ENDIF END DO END DO + !$acc end parallel loop endif endif ! lfbm + !$acc wait(queue) + !$acc update host(traIO,traIO_HIGH,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH) + ave_partTime = MPI_WTIME() - ave_partTime ave_TotTime = ave_TotTime + ave_partTime diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index a1b02ee3..76de5ef0 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -133,7 +133,8 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab - ! NOTE: kernel is too big, should be split + ! NOTE: kernel is too big, should be split by adding a new jv dimension + ! on zwi zws zwd zwy zwt zwz zwx !$acc parallel loop gang vector default(present) async vector_length(32) DO jv = 1, dimen_jvzdf From 6b7c754e53b661ab5b1167304458608e4d37d907 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 13 May 2024 10:48:35 +0200 Subject: [PATCH 8/8] remove debug statement --- src/PHYS/trchdf.f90 | 1 - 1 file changed, 1 deletion(-) diff --git a/src/PHYS/trchdf.f90 b/src/PHYS/trchdf.f90 index 6bb1d1aa..f43b8198 100644 --- a/src/PHYS/trchdf.f90 +++ b/src/PHYS/trchdf.f90 @@ -305,7 +305,6 @@ SUBROUTINE trchdf ! ... Mpp : export boundary values to neighboring processors CALL mpplnk_my(zlt,gpu=use_gpu) #else -#error CALL lbc( zlt(:,:,:), 1, 1, 1, 1, jpk, 1 ) #endif