apes-suite · haraldkl · May 16, 2018 · May 16, 2018 · May 18, 2018 · Jul 5, 2018
diff --git a/README.md b/README.md
@@ -1,6 +1,8 @@
 Polynomials Library
 ===================
 
+*This is the OpenMP implementation branch*
+
 This project is a supporting library for [TreElM](https://bitbucket.org/apesteam/treelm).
 It does not work on its own, but rather needs to be included in
 other projects, which also include TreElM.

diff --git a/source/fpt/ply_chebPoint_module.f90 b/source/fpt/ply_chebPoint_module.f90
diff --git a/source/fpt/ply_legFpt_2D_module.fpp b/source/fpt/ply_legFpt_2D_module.fpp
@@ -83,6 +83,7 @@ contains
 
     allocate(alph(n**2))
 
+
     ! original layout (n = 3):
     !  1  2  3
     !  4  5  6

diff --git a/source/fpt/ply_legFpt_3D_module.fpp b/source/fpt/ply_legFpt_3D_module.fpp
@@ -145,6 +145,7 @@ contains
       &                pntVal    = pntVal   )
     ! <<<<< Z-Direction <<<<< !
 
+
   end subroutine ply_legToPnt_3D_singVar
   ! ------------------------------------------------------------------------ !
 
@@ -266,6 +267,7 @@ contains
       &                pntVal    = alph       )
     ! <<<<< X-Direction <<<<< !
 
+
   end subroutine ply_pntToLeg_3D_singVar
   ! ------------------------------------------------------------------------ !
 

diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
@@ -3,6 +3,7 @@
 ! Copyright (c) 2013-2014, 2017 Peter Vitt <peter.vitt2@uni-siegen.de>
 ! Copyright (c) 2013-2014 Verena Krupp
 ! Copyright (c) 2016 Langhammer Kay <kay.langhammer@student.uni-siegen.de>
+! Copyright (c) 2020 Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
 !
 ! Parts of this file were written by Jens Zudrop and Harald Klimach
 ! for German Research School for Simulation Sciences GmbH.
@@ -344,8 +345,10 @@ subroutine ply_legToPnt_single( fpt, legCoeffs, pntVal, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
+    !$OMP DO
     do iDof = 1, nIndeps*n, n
       call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
         &                  gam    = cheb,                     &
@@ -360,6 +363,9 @@ subroutine ply_legToPnt_single( fpt, legCoeffs, pntVal, nIndeps )
         &                    cheb,                 &
         &                    pntVal(iDof:iDof+n-1) )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_legToPnt_single
   ! ------------------------------------------------------------------------ !
@@ -417,8 +423,10 @@ subroutine ply_legToPnt_lobatto_single( fpt, legCoeffs, pntVal, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
+    !$OMP DO
     do iDof = 1, nIndeps*n, n
       call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
         &                  gam    = cheb,                     &
@@ -432,6 +440,9 @@ subroutine ply_legToPnt_lobatto_single( fpt, legCoeffs, pntVal, nIndeps )
         &                    cheb,                 &
         &                    pntVal(iDof:iDof+n-1) )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_legToPnt_lobatto_single
   ! ------------------------------------------------------------------------ !
@@ -489,9 +500,12 @@ subroutine ply_pntToLeg_single( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, normFactor)
     n = fpt%legToChebParams%n
 
     normFactor = 1.0_rk / real(n,kind=rk)
+
+    !$OMP DO
     do iDof = 1, nIndeps*n, n
       call fftw_execute_r2r( fpt%planPntToCheb,     &
         &                    pntVal(iDof:iDof+n-1), &
@@ -506,6 +520,9 @@ subroutine ply_pntToLeg_single( fpt, pntVal, legCoeffs, nIndeps )
         &                  alph   = cheb,                     &
         &                  params = fpt%ChebToLegParams       )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg_single
   ! ------------------------------------------------------------------------ !
@@ -567,9 +584,12 @@ subroutine ply_pntToLeg_lobatto_single( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, normFactor)
     n = fpt%legToChebParams%n
 
     normFactor = 0.5_rk / real(n-1,kind=rk)
+
+    !$OMP DO
     do iDof = 1, nIndeps*n, n
       call fftw_execute_r2r( fpt%planPntToCheb,     &
         &                    pntVal(iDof:iDof+n-1), &
@@ -584,6 +604,9 @@ subroutine ply_pntToLeg_lobatto_single( fpt, pntVal, legCoeffs, nIndeps )
         &                  alph   = cheb,                     &
         &                  params = fpt%ChebToLegParams       )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg_lobatto_single
   ! ------------------------------------------------------------------------ !

diff --git a/source/fpt/ply_polyBaseExc_module.fpp b/source/fpt/ply_polyBaseExc_module.fpp
@@ -1113,7 +1113,7 @@ contains
   !> Convert strip of coefficients of a modal representation in terms of
   !! Legendre polynomials to modal coefficients in terms of Chebyshev
   !! polynomials.
-  subroutine ply_fpt_single( alph, gam, params )
+  subroutine ply_fpt_single( alph, gam, params)
     ! -------------------------------------------------------------------- !
     !> The parameters of the fast polynomial transformation.
     type(ply_trafo_params_type), intent(inout) :: params

diff --git a/source/ply_LegPolyProjection_module.f90 b/source/ply_LegPolyProjection_module.f90
@@ -158,6 +158,7 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     real(kind=rk), allocatable :: newWorkDat(:)
     integer :: nChildDofs, oneDof
     ! -------------------------------------------------------------------- !
+
     if (subsamp%projectionType.ne.ply_QLegendrePoly_prp) then
       call tem_abort( 'ERROR in ply_QPolyProjection: subsampling is ' &
         & // 'only implemented for Q-Legendre-Polynomials'            )
@@ -281,6 +282,7 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
     real(kind=rk), allocatable :: projCoeffOneDim(:,:,:)
     real(kind=rk) :: dimexp
     ! -------------------------------------------------------------------- !
+
     select case(dofType)
     case(ply_QLegendrePoly_prp)
       allocate(projection%projCoeff(nDofs, nChildDofs, nChilds))
@@ -352,6 +354,7 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
         & // 'for Q-Legendre polynomials'                                   )
     end select
     deallocate(projCoeffOneDim)
+
   end subroutine ply_initQLegProjCoeff
   ! ************************************************************************ !
 
@@ -595,6 +598,7 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
     integer :: oneDof, noChilds, childpos
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
+
     nChilds = 2**ndims
     nElems = tree%nElems
     nElemsToRefine = count(new_refine_tree)
@@ -794,6 +798,7 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
     integer :: childDof_pos, parentDof_pos
     real(kind=rk) :: projCoeff
     ! -------------------------------------------------------------------- !
+
     childData(:) = 0.0_rk
 
     childLoop: do iChild = 1, nChilds

diff --git a/source/ply_fxt_module.f90 b/source/ply_fxt_module.f90
@@ -255,6 +255,7 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1)  )
     end do
     modal_data = nodal_data
+
   end subroutine ply_fxt_n2m_2D
   ! ************************************************************************ !
 

diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
@@ -245,37 +245,97 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
+! Original version (for reference)
+!!    if (nDofs > 1) then
+!!
+!!      do iStrip=1,nIndeps,vlen
+!!
+!!        ! Calculate the upper bound of the current strip
+!!        strip_ub = iStrip-1 + min(vlen, nIndeps-iStrip+1)
+!!
+!!        do iRow = 1, nDofs
+!!
+!!          do iCell = iStrip, strip_ub
+!!            projected(iCell, iRow) = 0.0_rk
+!!          end do
+!!
+!!          do iCol = 1, nDofs
+!!            mval =  matrix(iCol,iRow)
+!!            do iCell = iStrip, strip_ub
+!!              ! on SX-ACE, this can be identified as matrix multiplication
+!!              ! which results in VERY HIGH performance
+!!              projected(iCell, iRow) = projected(iCell, iRow) &
+!!                &                   + mval * original(iCol, iCell)
+!!            end do ! iCell
+!!          end do ! iCol = 1, nCols
+!!        end do ! iRow = 1, nRows
+!!      end do ! iStrip
+!!
+!!    else
+!!
+!!      projected = matrix(nDofs,1) * original
+!!
+!!    end if
+
     if (nDofs > 1) then
 
-      do iStrip=0,nIndeps-1,vlen
+      !$OMP PARALLEL DO DEFAULT(SHARED), &
+      !$OMP PRIVATE(iStrip, strip_ub, iRow, iCell, iCol, mval)
+      do iStrip=1,nIndeps,vlen
 
         ! Calculate the upper bound of the current strip
-        strip_ub = min(iStrip + vlen, nIndeps) - iStrip
+        strip_ub = iStrip-1 + min(vlen, nIndeps-iStrip+1)
 
         do iRow = 1, nDofs
 
-          do iCell = iStrip+1, iStrip+strip_ub
+          do iCell = iStrip, strip_ub
             projected(iCell, iRow) = 0.0_rk
           end do
+
           do iCol = 1, nDofs
             mval =  matrix(iCol,iRow)
-            do iCell = iStrip+1, iStrip+strip_ub
+            do iCell = iStrip, strip_ub
               ! on SX-ACE, this can be identified as matrix multiplication
               ! which results in VERY HIGH performance
               projected(iCell, iRow) = projected(iCell, iRow) &
                 &                   + mval * original(iCol, iCell)
             end do ! iCell
           end do ! iCol = 1, nCols
-
         end do ! iRow = 1, nRows
       end do ! iStrip
+      !$OMP END PARALLEL DO
 
     else
 
       projected = matrix(nDofs,1) * original
 
     end if
 
+
+! Test version of the loop (to be removed); as written, COLLAPSE(2) races on projected.
+!!    if (nDofs > 1) then
+!!
+!!      projected(:, :) = 0.0_rk
+!!
+!!      !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(SHARED), &
+!!      !$OMP PRIVATE(iStrip, iRow, iCell, iCol, mval)
+!!      do iRow = 1, nDofs
+!!        do iCol = 1, nDofs
+!!          mval =  matrix(iCol,iRow)
+!!          do iStrip=1,nIndeps
+!!            projected(iStrip, iRow) = projected(iStrip, iRow) &
+!!              &                   + mval * original(iCol, iStrip)
+!!          end do
+!!        end do
+!!      end do
+!!      !$OMP END PARALLEL DO
+!!
+!!    else
+!!
+!!      projected = matrix(nDofs,1) * original
+!!
+!!    end if
+
   end subroutine ply_l2_projection
   ! ************************************************************************ !
-Original file line number
+Diff line change
@@ Expand Up / @@ -83,6 +83,7 @@ contains @@
         allocate(alph(n**2))
         ! original layout (n = 3):
         !  1  2  3
         !  4  5  6
@@ Expand Down @@