From f6be44fd7232bc81c912415996a40922abe9c59f Mon Sep 17 00:00:00 2001
From: Harald Klimach <harald@klimachs.de>
Date: Wed, 16 May 2018 13:06:25 +0200
Subject: [PATCH 01/21] Started a new branch to work on OpenMP.

--HG--
branch : feature/openmp
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 0f403ee..4e6d623 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 Polynomials Library
 ===================
 
+*This is the OpenMP implementation branch*
+
 This project is a supporting library for [TreElM](https://bitbucket.org/apesteam/treelm).
 It does not work on its own, but rather needs to be included in
 other projects, which also include TreElM.

From 48bfb6eefca7616594789ef9261f9eb699612d4f Mon Sep 17 00:00:00 2001
From: Harald Klimach <harald@klimachs.de>
Date: Fri, 18 May 2018 18:35:39 +0200
Subject: [PATCH 02/21] Added OpenMP directives to the FPT.

Also added a performance utest to check the implementation.
Note: this first implementation with OpenMP breaks for polynomial
degrees of 255 and higher (the algorithm starts to use approximated
blocks then).
This is probably due to the usage of memory allocated in the
initialization for the FPT, that is shared between threads.
There is no problem when using OMP_NUM_THREADS=1 for the
ply_fpt_3D_performance_test.

--HG--
branch : feature/openmp
---
 source/fpt/ply_legFpt_module.f90              | 12 +++
 .../with_fftw/ply_fpt_3D_performance_test.f90 | 98 +++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 utests/with_fftw/ply_fpt_3D_performance_test.f90

diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 42f70dd..8d0678f 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -227,10 +227,12 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_points) then
 
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
@@ -245,9 +247,11 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
           &                    cheb,                 &
           &                    pntVal(iDof:iDof+n-1) )
       end do
+      !$OMP END DO
 
     else
 
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
@@ -261,8 +265,10 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
           &                    cheb,                 &
           &                    pntVal(iDof:iDof+n-1) )
       end do
+      !$OMP END DO
 
     end if ! lobattoPoints
+    !$OMP END PARALLEL
 
   end subroutine ply_legToPnt
   ! ************************************************************************ !
@@ -284,11 +290,13 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, normFactor)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_Points) then
 
       normFactor = 1.0_rk / real(n,kind=rk)
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call fftw_execute_r2r( fpt%planPntToCheb,     &
           &                    pntVal(iDof:iDof+n-1), &
@@ -303,10 +311,12 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
           &                  alph   = cheb,                     &
           &                  params = fpt%ChebToLegParams       )
       end do
+      !$OMP END DO
 
     else
 
       normFactor = 0.5_rk / real(n-1,kind=rk)
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call fftw_execute_r2r( fpt%planPntToCheb,     &
           &                    pntVal(iDof:iDof+n-1), &
@@ -321,8 +331,10 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
           &                  alph   = cheb,                     &
           &                  params = fpt%ChebToLegParams       )
       end do
+      !$OMP END DO
 
     end if ! lobattoPoints
+    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg
   ! ************************************************************************ !
diff --git a/utests/with_fftw/ply_fpt_3D_performance_test.f90 b/utests/with_fftw/ply_fpt_3D_performance_test.f90
new file mode 100644
index 0000000..d6ad78b
--- /dev/null
+++ b/utests/with_fftw/ply_fpt_3D_performance_test.f90
@@ -0,0 +1,98 @@
+!> Unit test to check functionallity of fast polynomial transformations.
+!! \author{Jens Zudrop}
+program ply_fpt_3D_performance_test
+  use mpi, only: mpi_wtime
+  use env_module,               only: rk, fin_env
+  use tem_logging_module,       only: logUnit, tem_logging_init_primary
+  use tem_general_module,       only: tem_general_type, tem_start
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_3D_module,     only: ply_legToPnt_3D, &
+    &                                 ply_pntToLeg_3D
+
+  implicit none
+
+  integer :: iPower
+  integer, parameter :: maxpower = 8
+  real(kind=rk) :: res, newRes
+  type(tem_general_type) :: general
+
+  ! Init the Treelm environment, needed to init the log Unit
+  call tem_start(codeName = 'FPT 3D Performance Test', &
+    &            version  = '1',                       &
+    &            general  = general                    )
+  call tem_logging_init_primary( level = 1,               &
+    &                            rank = general%proc%rank )
+
+  res = 0.0_rk
+  do iPower = 1,maxpower
+    call ply_check_legToPnt_3D(iPower, newRes)
+    if (newRes.gt.res) then
+      res = newRes
+    end if
+  end do
+
+  write(*,*) 'Maximal deviation:', res
+  if (res <  1.e-08) then
+    write(logUnit(1),*) 'PASSED'
+  end if 
+
+  call fin_env()
+
+contains
+
+  subroutine ply_check_legToPnt_3D(power,res)
+    integer, intent(in) :: power
+    real(kind=rk) :: res
+    integer :: maxPolyDegree, iVar, nVars
+    real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
+    real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
+    type(ply_legFpt_type) :: fpt
+    real(kind=rk) :: starttime, stoptime
+
+    ! Define the maximal polynomial degree we want to calculate the
+    ! bases exchange for.
+    maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
+    nVars = 3
+    write(logUnit(10),*) '------------------------------------' // &
+      & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
+    write(logUnit(10),*) '------------------------------------' // &
+      & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**3
+  
+    ! Create the Legendre expansion coefficients
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars)) 
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars)) 
+    do iVar = 1, nVars
+      legCoeffs(:,iVar) = real(iVar, rk)
+    end do
+  
+    ! Init the FPT 
+    call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
+      &                   nIndeps       = (maxpolydegree+1)**2, &
+      &                   fpt           = fpt                   )
+  
+    ! now transform to the Chebyshev nodes
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars )) 
+    legCoeffsIn = legCoeffs
+    starttime = MPI_Wtime()
+    call ply_legToPnt_3D( fpt       = fpt,         &
+      &                   legCoeffs = legCoeffsIn, &
+      &                   pntVal    = pntVal,      &
+      &                   nVars     = nVars        ) 
+    stoptime = MPI_Wtime()
+    write(*,*) 'Time for degree ', maxpolydegree, ' trafo:   ', stoptime - starttime
+
+    ! now transform back to Legendre coefficients
+    allocate(legVal( (maxPolyDegree+1)**3,nVars )) 
+    starttime = MPI_Wtime()
+    call ply_pntToLeg_3D( fpt       = fpt,    &
+      &                   pntVal    = pntVal, &
+      &                   legCoeffs = legVal, &
+      &                   nVars     = nVars   )
+    stoptime = MPI_Wtime()
+    write(*,*) 'Time for degree ', maxpolydegree, ' inverse: ', stoptime - starttime
+  
+    res = maxval(abs(legVal(:,:) - legCoeffs(:,:)))
+
+  end subroutine ply_check_legToPnt_3D
+
+end program ply_fpt_3D_performance_test

From 544896fa445d989ecb7c78ac37ae90c35ccbc36b Mon Sep 17 00:00:00 2001
From: Robin Weihe <robin.weihe@student.uni-siegen.de>
Date: Thu, 5 Jul 2018 15:33:13 +0200
Subject: [PATCH 03/21] reimplemantation of openmp statements

--HG--
branch : feature/openmp
---
 source/fpt/ply_chebPoint_module.f90       | 74 +++++++++++++++++++++++
 source/fpt/ply_legFpt_2D_module.fpp       | 16 ++++-
 source/fpt/ply_legFpt_3D_module.fpp       |  2 +
 source/fpt/ply_polyBaseExc_module.fpp     |  2 +-
 source/ply_LegPolyProjection_module.f90   | 28 +++++++++
 source/ply_fxt_module.f90                 | 48 +++++++++++++--
 source/ply_l2p_module.f90                 |  6 +-
 source/ply_leg_diff_module.fpp            | 26 +++++++-
 source/ply_legser_module.f90              |  7 +++
 source/ply_modg_basis_module.fpp          | 16 +++++
 source/ply_nodes_module.f90               | 14 ++---
 source/ply_poly_project_module.fpp        | 46 ++++++++++++--
 source/ply_poly_transformation_module.f90 | 25 ++++++++
 source/ply_sampling_adaptive_module.f90   |  6 +-
 source/ply_sampling_module.fpp            |  6 ++
 source/ply_sampling_varsys_module.f90     | 14 +++++
 source/ply_space_integration_module.f90   | 48 +++++++++++++++
 source/ply_split_element_module.f90       |  9 ++-
 source/ply_split_legendre_module.f90      |  2 +-
 source/ply_transfer_module.fpp            |  2 +
 20 files changed, 371 insertions(+), 26 deletions(-)

diff --git a/source/fpt/ply_chebPoint_module.f90 b/source/fpt/ply_chebPoint_module.f90
index 0a40640..01dc4f0 100644
--- a/source/fpt/ply_chebPoint_module.f90
+++ b/source/fpt/ply_chebPoint_module.f90
@@ -39,12 +39,17 @@ subroutine ply_chebPoint_1D( nPoints, chebPnt1D )
     ! -------------------------------------------------------------------- !
     integer :: iPoint
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iPoint)
 
     allocate(chebPnt1D(nPoints))
+    !$OMP DO
     do iPoint = 1, nPoints
       chebPnt1D(iPoint) = -1.0_rk &
         & * cos( PI / nPoints * ( (iPoint - 1.0_rk) + 1.0_rk / 2.0_rk ) )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_chebPoint_1D
   ! ************************************************************************ !
@@ -65,11 +70,16 @@ subroutine ply_lobattoChebPoint_1D( nPoints, chebPnt1D )
     ! -------------------------------------------------------------------- !
     integer :: iPoint
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iPoint)
 
     allocate(chebPnt1D(nPoints))
+    !$OMP DO
     do iPoint = 1, nPoints
       chebPnt1D(iPoint) = cos( ( iPoint - 1.0_rk ) * PI / ( nPoints - 1.0_rk ) )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_lobattoChebPoint_1D
   ! ************************************************************************ !
@@ -86,12 +96,14 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
     nquadpoints = num_intp_per_direction**3
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
+    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -104,6 +116,9 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
         end do
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube
   ! ************************************************************************ !
@@ -121,12 +136,14 @@ subroutine create_volume_lobattocheb_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
     nquadpoints = num_intp_per_direction**3
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
+    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -139,6 +156,9 @@ subroutine create_volume_lobattocheb_points_cube( num_intp_per_direction, &
         end do
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube
   ! ************************************************************************ !
@@ -156,12 +176,16 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
+
+
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
+    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         ! here we build all possible combinations of the one-dimensional
@@ -172,6 +196,9 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
         pointNumber = pointNumber + 1
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube_2d
   ! ************************************************************************ !
@@ -191,12 +218,14 @@ subroutine create_volume_lobattocheb_points_cube_2d(num_intp_per_direction, poin
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
+    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         ! here we build all possible combinations of the one-dimensional
@@ -207,6 +236,9 @@ subroutine create_volume_lobattocheb_points_cube_2d(num_intp_per_direction, poin
         pointNumber = pointNumber + 1
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube_2d
   ! ************************************************************************ !
@@ -222,11 +254,13 @@ subroutine create_volume_cheb_points_cube_1d( num_intp_per_direction, points )
     integer :: pointNumber
     real(kind=rk), allocatable :: chebPnt1D(:)
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     allocate(points(num_intp_per_direction,3))
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
+    !$OMP DO
     do i = 1, num_intp_per_direction
       ! here we build all possible combinations of the one-dimensional
       ! points to get the three dimensional values.
@@ -235,6 +269,9 @@ subroutine create_volume_cheb_points_cube_1d( num_intp_per_direction, points )
       points(pointNumber, 3) = 0.0_rk
       pointNumber = pointNumber + 1
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube_1d
   ! ************************************************************************ !
@@ -253,11 +290,13 @@ subroutine create_volume_lobattocheb_points_cube_1d( num_intp_per_direction, &
     integer :: pointNumber
     real(kind=rk), allocatable :: chebPnt1D(:)
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     allocate(points(num_intp_per_direction,3))
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
+    !$OMP DO
     do i = 1, num_intp_per_direction
       ! here we build all possible combinations of the one-dimensional
       ! points to get the three dimensional values.
@@ -266,6 +305,9 @@ subroutine create_volume_lobattocheb_points_cube_1d( num_intp_per_direction, &
       points(pointNumber, 3) = 0.0_rk
       pointNumber = pointNumber + 1
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube_1d
   ! ************************************************************************ !
@@ -291,6 +333,7 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
@@ -301,6 +344,7 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = (-1.0_rk)**align
@@ -309,7 +353,9 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -318,7 +364,9 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
     case(3) ! face in z direction, z coord is fixes
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber , 1) = chebPnt1D(i)
@@ -327,11 +375,14 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube: unknown ' &
         & // 'face direction'                                              )
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine create_surface_cheb_points_cube
   ! ************************************************************************ !
 
@@ -356,6 +407,7 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
@@ -366,6 +418,7 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = (-1.0_rk)**align
@@ -374,7 +427,9 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -383,7 +438,9 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
     case(3) ! face in z direction, z coord is fixes
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -392,11 +449,14 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_lobattocheb_points_cube:' &
         & // ' unknown face direction'                                   )
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine create_surface_lobattocheb_points_cube
   ! ************************************************************************ !
 
@@ -420,6 +480,7 @@ subroutine create_surface_cheb_points_cube_2d( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     nquadpoints = num_intp_per_direction
     allocate(points(nquadpoints,3))
@@ -430,24 +491,30 @@ subroutine create_surface_cheb_points_cube_2d( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
+      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = (-1.0_rk)**align
         points(pointNumber, 2) = chebPnt1D(i)
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
+      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
+      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = chebPnt1D(i)
         points(pointNumber, 2) = (-1.0_rk)**align
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
+      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube_2d:' &
         & // ' unknown face direction'                               )
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine create_surface_cheb_points_cube_2d
   ! ************************************************************************ !
 
@@ -471,6 +538,7 @@ subroutine create_surface_lobattocheb_points_cube_2d( &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     nquadpoints = num_intp_per_direction
     allocate(points(nquadpoints,3))
@@ -481,24 +549,30 @@ subroutine create_surface_lobattocheb_points_cube_2d( &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
+      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = (-1.0_rk)**align
         points(pointNumber, 2) = chebPnt1D(i)
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
+      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
+      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = chebPnt1D(i)
         points(pointNumber, 2) = (-1.0_rk)**align
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
+      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube_2d:' &
         & // ' unknown face direction'                               )
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine create_surface_lobattocheb_points_cube_2d
   ! ************************************************************************ !
 
diff --git a/source/fpt/ply_legFpt_2D_module.fpp b/source/fpt/ply_legFpt_2D_module.fpp
index 6ba78c3..2f62583 100644
--- a/source/fpt/ply_legFpt_2D_module.fpp
+++ b/source/fpt/ply_legFpt_2D_module.fpp
@@ -54,6 +54,7 @@ contains
    real(kind=rk), dimension(:), allocatable :: alph
    real(kind=rk), dimension(:), allocatable :: gam
    ! --------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iAlph, nIndeps)
 
    striplen = fpt%legToChebParams%striplen
    n = fpt%legToChebParams%n
@@ -69,6 +70,7 @@ contains
    !  1  4  7
    !  2  5  8
    !  3  6  9
+   !$OMP DO
    yStripLoop: do iStrip = 1, n, striplen
      ! iAlph is the index of the first element in a line for the transformation
      ! in y-direction.
@@ -88,8 +90,10 @@ contains
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
    end do yStripLoop
+   !$OMP END DO
 
-  ! x-direction
+   ! x-direction
+   !$OMP DO
    xStripLoop: do iStrip = 1, n, striplen
      do iAlph = iStrip, min(iStrip+striplen-1, n)
        alph((iAlph-iStrip)*n+1:(iAlph-iStrip+1)*n) = pntVal(iAlph::n) !ztrafo
@@ -106,6 +110,9 @@ contains
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
    end do xStripLoop
+   !$OMP END DO
+
+   !$OMP END PARALLEL
 
   end subroutine ply_legToPnt_2D_singVar
   ! ************************************************************************ !
@@ -160,6 +167,7 @@ contains
     real(kind=rk), dimension(:), allocatable :: alph
     real(kind=rk), dimension(:), allocatable :: gam
    ! --------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(nIndeps, iStrip, iAlph)
 
     striplen = fpt%chebToLegParams%striplen
     n = fpt%legToChebParams%n
@@ -168,6 +176,7 @@ contains
     allocate(alph(min(striplen, n)*n))
     allocate(gam(min(striplen, n)*n))
 
+    !$OMP DO
     yStripLoop: do iStrip = 1, n, striplen
       do iAlph = iStrip, min(iStrip+striplen-1, n) !y_Trafo
         alph((iAlph-iStrip)*n+1:(iAlph-iStrip+1)*n) = pntVal(iAlph::n)
@@ -185,8 +194,10 @@ contains
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
     end do yStripLoop ! iStrip
+    !$OMP END DO
 
     ! x-direction
+    !$OMP DO
     xStripLoop: do iStrip = 1,n,striplen
       do iAlph = iStrip, min(iStrip+striplen-1, n)
         !ztrafo
@@ -204,6 +215,9 @@ contains
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
     end do xStripLoop
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg_2D_singVar
   ! ************************************************************************ !
diff --git a/source/fpt/ply_legFpt_3D_module.fpp b/source/fpt/ply_legFpt_3D_module.fpp
index d20ad88..4acfeef 100644
--- a/source/fpt/ply_legFpt_3D_module.fpp
+++ b/source/fpt/ply_legFpt_3D_module.fpp
@@ -158,6 +158,7 @@ contains
 
    end do xStripLoop
 
+
   end subroutine ply_legToPnt_3D_singVar
   ! ************************************************************************ !
 
@@ -318,6 +319,7 @@ contains
 
     end do xStripLoop
 
+
   end subroutine ply_pntToLeg_3D_singVar
   ! ************************************************************************ !
 
diff --git a/source/fpt/ply_polyBaseExc_module.fpp b/source/fpt/ply_polyBaseExc_module.fpp
index cea7281..496a912 100644
--- a/source/fpt/ply_polyBaseExc_module.fpp
+++ b/source/fpt/ply_polyBaseExc_module.fpp
@@ -1048,7 +1048,7 @@ contains
   !> Convert strip of coefficients of a modal representation in terms of
   !! Legendre polynomials to modal coefficients in terms of Chebyshev
   !! polynomials.
-  subroutine ply_fpt_single( alph, gam, params )
+  subroutine ply_fpt_single( alph, gam, params)
     ! -------------------------------------------------------------------- !
     !> The parameters of the fast polynomial transformation.
     type(ply_trafo_params_type), intent(inout) :: params
diff --git a/source/ply_LegPolyProjection_module.f90 b/source/ply_LegPolyProjection_module.f90
index 175bdfe..a36db61 100644
--- a/source/ply_LegPolyProjection_module.f90
+++ b/source/ply_LegPolyProjection_module.f90
@@ -139,6 +139,9 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     real(kind=rk), allocatable :: newWorkDat(:)
     integer :: nChildDofs, oneDof
     ! -------------------------------------------------------------------- !
+
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, nDofs, nComponents, nChilds, nChildDofs, workDat)
+
     if (subsamp%projectionType.ne.ply_QLegendrePoly_prp) then
       call tem_abort( 'ERROR in ply_QPolyProjection: subsampling is ' &
         & // 'only implemented for Q-Legendre-Polynomials'            )
@@ -148,6 +151,7 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     allocate(newVarDofs(nVars))
     allocate(newMeshData(nVars))
 
+    !$OMP DO
     varLoop: do iVar=1,nVars
       nDofs = varDofs(iVar)
       nComponents = varcomps(iVar)
@@ -221,6 +225,9 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
       deallocate(projection_oneDof%projCoeff)
 
     end do varLoop
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_QPolyProjection
   ! ************************************************************************ !
@@ -262,6 +269,9 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
     real(kind=rk), allocatable :: projCoeffOneDim(:,:,:)
     real(kind=rk) :: dimexp
     ! -------------------------------------------------------------------- !
+
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iChild, iParentDof, iChildDof, xShift, yShift, zShift)
+
     select case(dofType)
     case(ply_QLegendrePoly_prp)
       allocate(projection%projCoeff(nDofs, nChildDofs, nChilds))
@@ -274,6 +284,7 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
       projCoeffOneDim = ply_QLegOneDimCoeff( nint(nDofs**dimexp),     &
         &                                    nint(nChildDofs**dimexp) )
 
+      !$OMP DO
       ! Loop over the children of this element
       childLoop: do iChild = 1, nChilds
 
@@ -326,12 +337,14 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
           end do childDofLoop
         end do parentDofLoop
       end do childLoop
+      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in ply_initProjCoeff: initialization of '      &
         & // 'projection coefficients for subsampling is implemented only ' &
         & // 'for Q-Legendre polynomials'                                   )
     end select
+    !$OMP END PARALLEL
     deallocate(projCoeffOneDim)
   end subroutine ply_initQLegProjCoeff
   ! ************************************************************************ !
@@ -576,6 +589,9 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
     integer :: oneDof, noChilds, childpos
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
+
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iParentElem, iElem, lowElemIndex, upElemIndex, iChild, lowChildIndex, upChildIndex)
+
     nChilds = 2**ndims
     nElems = tree%nElems
     nElemsToRefine = count(new_refine_tree)
@@ -592,6 +608,7 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
 
     if (subsamp%sampling_lvl > 1) then
 
+      !$OMP DO
       elementLoop: do iParentElem=1,nParentElems
         ! Check if the parent cell was already refined...
         if (refine_tree(iParentElem)) then
@@ -683,9 +700,11 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
         end if
 
       end do elementLoop
+      !$OMP END DO
 
     else
 
+      !$OMP DO
       elemLoop: do iElem=1,nElems
         if (new_refine_tree(iElem)) then
           allocate(childData(nChildDofs*nChilds*nComponents))
@@ -736,8 +755,11 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
           deallocate(childData)
         end if
       end do elemLoop
+      !$OMP END DO
     end if
 
+    !$OMP END PARALLEL
+
   end subroutine ply_subsampleData
   ! ************************************************************************ !
 
@@ -775,8 +797,11 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
     integer :: childDof_pos, parentDof_pos
     real(kind=rk) :: projCoeff
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iChild, iParentDof, iChildDof, iComp, projCoeff, childDof_pos, parentDof_pos)
+
     childData(:) = 0.0_rk
 
+    !$OMP DO
     childLoop: do iChild = 1, nChilds
       parentDofLoop: do iParentDof = 1, nParentDofs
         childDofLoop: do iChildDof = 1, nChildDofs
@@ -796,6 +821,9 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
         end do childDofLoop
       end do parentDofLoop
     end do childLoop
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_projDataToChild
   ! ************************************************************************ !
diff --git a/source/ply_fxt_module.f90 b/source/ply_fxt_module.f90
index e35f65f..616c31c 100644
--- a/source/ply_fxt_module.f90
+++ b/source/ply_fxt_module.f90
@@ -75,7 +75,7 @@ subroutine ply_init_fxt( fxt, header, degree, nDims, nodes, faces )
     leg1D_at_gauss = legendre_1D(gaussp1D, degree)
 
     select case(nDims)
-    case(3) 
+    case(3)
       call fxtf_flptld_init( flpt    = fxt%flpt,   &
          &                   degree  = degree,     &
          &                   nPoints = degree+1,   &
@@ -114,13 +114,13 @@ subroutine ply_init_fxt( fxt, header, degree, nDims, nodes, faces )
         end do
       end do
 
-    case(2) 
+    case(2)
       call fxtf_flptld_init( flpt    = fxt%flpt,   &
         &                    degree  = degree,     &
         &                    nPoints = degree+1,   &
-        &                    prec    = header%prec )  
- 
-      ! Fill up the nodes and the face with gauss legendre points   
+        &                    prec    = header%prec )
+
+      ! Fill up the nodes and the face with gauss legendre points
       allocate( nodes(nPoints**2, 3) )
       do iPoint=1,nPoints
         lb = (iPoint-1)*nPoints + 1
@@ -145,7 +145,7 @@ subroutine ply_init_fxt( fxt, header, degree, nDims, nodes, faces )
         end do
       end do
 
-    case(1) 
+    case(1)
       call fxtf_flptld_init( flpt    = fxt%flpt,   &
         &                    degree  = degree,     &
         &                    nPoints = degree+1,   &
@@ -211,10 +211,12 @@ subroutine ply_fxt_m2n_2D( fxt, modal_data, nodal_data, oversamp_degree )
     ! -------------------------------------------------------------------- !
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, iColumn)
 
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
 
+    !$OMP DO
     do iLine = 1, oversamp_degree+1
       lb = (iLine-1) * (oversamp_degree+1) + 1
       ub = lb + oversamp_degree
@@ -222,15 +224,20 @@ subroutine ply_fxt_m2n_2D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   modal_data = modal_data(lb:ub), &
         &                   nodal_data = nodal_data(lb:ub)  )
     end do
+    !$OMP END DO
 
+    !$OMP DO
     do iColumn = 1, oversamp_degree+1
       lb = iColumn
       call fxtf_flptld_m2n( flpt       = fxt%flpt,                             &
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1), &
         &                   nodal_data = modal_data(lb:msq:oversamp_degree+1)  )
     end do
+    !$OMP END DO
     nodal_data = modal_data
 
+    !$OMP END PARALLEL
+
   end subroutine ply_fxt_m2n_2D
   ! ************************************************************************ !
 
@@ -251,6 +258,8 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
     real(kind=rk), pointer :: tmp_in(:), tmp_out(:)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, tmp_in, tmp_out, iColumn)
+
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
     nTotalDofs =  (oversamp_degree+1)**3
@@ -259,6 +268,7 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
     tmp_in = -42
     tmp_out = -42
 
+    !$OMP DO
     ! The loop for msq stripes for independent x Dir evaluations
     do iLine = 1, msq
       lb = (iLine-1) * (oversamp_degree+1) + 1
@@ -269,7 +279,9 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   nodal_data = tmp_out   )
       nodal_data(lb:ub) = tmp_out
     end do
+    !$OMP END DO
 
+    !$OMP DO
     ! The loop for msq stripes for independent y Dir evaluations
     do iColumn = 1, msq
       lb = int( (iColumn-1 ) / nModesPerDim ) * msq &
@@ -280,15 +292,20 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   modal_data = nodal_data(lb:ub:nModesPerDim), &
         &                   nodal_data = modal_data(lb:ub:nModesPerDim)  )
     end do
+    !$OMP END DO
 
     ! The loop for msq stripes for independent z Dir evaluations
     ub = nTotalDofs
+    !$OMP DO
     do iColumn = 1, msq
       lb = iColumn
       call fxtf_flptld_m2n( flpt       = fxt%flpt,              &
         &                   modal_data = modal_data(lb:ub:msq), &
         &                   nodal_data = nodal_data(lb:ub:msq)  )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_fxt_m2n_3D
   ! ************************************************************************ !
@@ -333,10 +350,12 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
     ! -------------------------------------------------------------------- !
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, iColumn)
 
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
 
+    !$OMP DO
     do iLine = 1, oversamp_degree+1
       lb = (iLine-1) * (oversamp_degree+1) + 1
       ub = lb + oversamp_degree
@@ -344,14 +363,21 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = nodal_data(lb:ub), &
         &                   modal_data = modal_data(lb:ub)  )
     end do
+    !$OMP END DO
 
+
+    !$OMP DO
     do iColumn = 1, oversamp_degree+1
       lb = iColumn
       call fxtf_flptld_n2m( flpt       = fxt%flpt,                             &
         &                   nodal_data = modal_data(lb:msq:oversamp_degree+1), &
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1)  )
     end do
+    !$OMP END DO
     modal_data = nodal_data
+
+    !$OMP END PARALLEL
+
   end subroutine ply_fxt_n2m_2D
   ! ************************************************************************ !
 
@@ -370,10 +396,13 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq, ntotalDofs
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, iColumn, ub, lb)
+
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
     nTotalDofs =  (oversamp_degree+1)**3
 
+    !$OMP DO
     ! The loop for msq stripes for independent x Dir evaluations
     do iLine = 1, msq
       lb = (iLine-1) * (oversamp_degree+1) + 1
@@ -382,7 +411,9 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = nodal_data(lb:ub), &
         &                   modal_data = modal_data(lb:ub)  )
     end do
+    !$OMP END DO
 
+    !$OMP DO
     ! The loop for msq stripes for independent y Dir evaluations
     do iColumn = 1, msq
       lb = int( (iColumn-1) / nModesPerDim ) * msq &
@@ -393,15 +424,20 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = modal_data(lb:ub:nModesPerDim), &
         &                   modal_data = nodal_data(lb:ub:nModesPerDim)  )
     end do
+    !$OMP END DO
 
     ! The loop for msq stripes for independent z Dir evaluations
     ub = nTotalDofs
+    !$OMP DO
     do iColumn = 1, msq
       lb = iColumn
       call fxtf_flptld_n2m( flpt       = fxt%flpt,              &
         &                   nodal_data = nodal_data(lb:ub:msq), &
         &                   modal_data = modal_data(lb:ub:msq)  )
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_fxt_n2m_3D
   ! ************************************************************************ !
diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index 8d25e8c..8e1ce65 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -241,8 +241,10 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iRow, iCell, iCol)
     if (nDofs > 1) then
 
+      !$OMP DO
       do iStrip=0,nIndeps-1,vlen
 
         ! Calculate the upper bound of the current strip
@@ -265,6 +267,7 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
 
         end do ! iRow = 1, nRows
       end do ! iStrip
+      !$OMP END DO
 
     else
 
@@ -272,9 +275,10 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
 
     end if
 
+    !$OMP END PARALLEL
   end subroutine ply_l2_projection
   ! ************************************************************************ !
- 
+
 
   ! ************************************************************************ !
   !> Transformation between modal and nodal values in 1D via L2 projection.
diff --git a/source/ply_leg_diff_module.fpp b/source/ply_leg_diff_module.fpp
index 573e17b..4034bc0 100644
--- a/source/ply_leg_diff_module.fpp
+++ b/source/ply_leg_diff_module.fpp
@@ -46,6 +46,8 @@ contains
     integer :: leg(3), iDeg, iDeg1, iDeg2, iDeg3, DV(3)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, dofPosPrev, iDeg2, iDeg3, iVar,leg, dofPos, iDeg)
+
     if (present(dirVec)) then
       DV = dirvec
     else
@@ -58,6 +60,7 @@ contains
       endif
     endif
 
+    !$OMP DO
     do iDeg = 1, (mpd+1)**2
       iDeg1 = (iDeg-1)/(mpd+1) + 1      !! do IDeg1 = 1, mPd+1
       iDeg2 = iDeg - (iDeg1-1)*(mpd+1)  !! do IDeg2 = 1, mPd=1   !! iDeg2 = mod(iDeg-1,mpd+1)+1
@@ -116,7 +119,9 @@ contains
         end do
       end do
     end do
+    !$OMP END DO
 
+    !$OMP DO
     ! Scale the results due to the Jacobians of the mappings
     do dofpos=1,(mpd+1)**3
       ideg3 = (dofpos-1)/(mpd+1)**2 + 1
@@ -128,6 +133,7 @@ contains
         &                         * (2.0_rk/elemLength)         &
         &                         * (2.0_rk*leg(iDir) - 1.0_rk)
     end do
+    !$OMP END DO
 
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    ! Uncollapsed version of the scaling !
@@ -147,6 +153,8 @@ contains
 !!     end do
 !!   end do
 
+    !$OMP END PARALLEL
+
   end subroutine calcDiff_leg_normal
   ! ************************************************************************ !
 
@@ -178,6 +186,8 @@ contains
     integer :: leg(2), iDeg1, iDeg2, DV(2)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, iDeg2, leg, dofPos, dofPosPrev, iVar)
+
     if (present(dirVec)) then
       DV = dirvec
     else
@@ -188,6 +198,7 @@ contains
       endif
     endif
 
+    !$OMP DO
     do iDeg1 = 1, mPd+1
       iDeg2 =  mPd+1
       leg = (/iDeg1, iDeg2/)
@@ -244,7 +255,9 @@ contains
         end do
       end do
     end do
+    !$OMP END DO
 
+    !$OMP DO
     ! Scale the results due to the Jacobians of the mappings
     do iDeg1 = 1, mPd+1
       do iDeg2 = 1, mPd+1
@@ -256,6 +269,9 @@ contains
           &                         * (2.0_rk*leg(iDir) - 1.0_rk)
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine calcDiff_leg_2d_normal
   ! ************************************************************************ !
@@ -336,7 +352,7 @@ contains
           &                          elemLength    = elemLength,              &
           &                          dirvec        = dirvec(:,iDir),          &
           &                          iDir          = iDir                     )
-      enddo
+      end do
     endif
 
   end subroutine calcDiff_leg_2d
@@ -361,6 +377,8 @@ contains
     integer :: dofPos, dofPosPrev, dofPos2Prev
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(dofPos, iDegX, dofPosPrev, dofPos2Prev)
+
     ! Build the derivative in x direction
     dofPos = 1 + maxPolyDegree
     legCoeffsDiff(dofPos,:) = 0.0_rk
@@ -368,6 +386,7 @@ contains
       dofPosPrev = dofPos
       dofPos = 1 + (maxPolyDegree-1)
       legCoeffsDiff(dofPos,:) = legCoeffs(dofPosPrev,:)
+      !$OMP DO
       do iDegX = maxPolyDegree-1, 1, -1
         dofPos = 1 + (iDegX-1)
         dofPosPrev = 1 + (iDegX)
@@ -375,14 +394,19 @@ contains
         legCoeffsDiff(dofPos,:) = legCoeffsDiff(dofPos2Prev,:) &
           &                         + legCoeffs(dofPosPrev,:)
       end do
+      !$OMP END DO
     end if
 
+    !$OMP DO
     do iDegX = 1, maxPolyDegree+1
       dofPos = 1 + (iDegX-1)
       legCoeffsDiff(dofPos,:) = legCoeffsDiff(dofPos,:)     &
         &                         * (2.0_rk/elemLength)     &
         &                         * (2.0_rk*iDegX - 1.0_rk)
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine calcDiff_leg_1d
   ! ************************************************************************ !
diff --git a/source/ply_legser_module.f90 b/source/ply_legser_module.f90
index 0649a2a..dc3ad15 100644
--- a/source/ply_legser_module.f90
+++ b/source/ply_legser_module.f90
@@ -41,19 +41,23 @@ subroutine legser(A, B, n)
     real(kind=rk) :: ak, al, bb, c, d
     integer :: k, l, ll
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, d, c, l, ll, al, bb, ak)
 
     ak = 0.0_rk
 
     ! Calculation of the first Legendre coefficient
     b(1) = 0.5_rk * a(1)
+    !$OMP DO
     do k=3,n,2
       ak = ak + 2.0_rk
       b(1) = b(1) - a(k)/(ak*ak - 1.0_rk)
     end do
+    !$OMP END DO
     c = 2.0_rk / 3.0_rk
     al = 0.0_rk
 
     ! Start main loop (remaining Legendre coefficients)
+    !$OMP DO
     do l=2,n
       ! Calculation of the Lth coefficient
       ll = l+2
@@ -73,6 +77,9 @@ subroutine legser(A, B, n)
         &   / ( (al+al+3.0_rk)*(al+al+2.0_rk) )
       b(l) = (al+0.5_rk)*bb
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine legser
   ! ************************************************************************ !
diff --git a/source/ply_modg_basis_module.fpp b/source/ply_modg_basis_module.fpp
index 526a7e0..acd25c3 100644
--- a/source/ply_modg_basis_module.fpp
+++ b/source/ply_modg_basis_module.fpp
@@ -159,6 +159,8 @@ contains
     integer :: iFunc, jFunc
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(jFunc, iFunc)
+
     allocate(GaussPoints(nPoints))
     allocate(GaussPoints_left(nPoints))
     allocate(GaussPoints_right(nPoints))
@@ -198,6 +200,7 @@ contains
 
     allocate( integral%anz_anzShift(1:nFunc, 1:nFunc, 2))
 
+    !$OMP DO
     !loop over anzatz functions
     do jFunc = 1, nFunc
       do iFunc = 1, nFunc
@@ -219,6 +222,9 @@ contains
 
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine init_modg_covolumeCoeffs
   ! ************************************************************************ !
@@ -287,6 +293,7 @@ contains
       end do
     end do
 
+
   end subroutine init_modg_multilevelCoeffs
   ! ************************************************************************ !
 
@@ -417,6 +424,8 @@ contains
     real(kind=rk) :: n_q
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iAns, iAnsX, iAnsY, iAnsZ, n_q, ansPos)
+
     ! allocate the output array
     select case(basisType)
       case(Q_space)
@@ -442,6 +451,7 @@ contains
       polyValX(2,:) = coords(:,1)
       polyValY(2,:) = coords(:,2)
       polyValZ(2,:) = coords(:,3)
+      !$OMP DO
       ! ... higher order polynomials are build recursively
       do iAns = 3, maxPolyDegree+1
         n_q = 1.0_rk / real(iAns-1,kind=rk)
@@ -467,11 +477,13 @@ contains
           &                   * polyValZ(iAns-2,:) )  &
           &                 *n_q
       end do
+      !$OMP END DO
     end if
 
     ! Now, build the complete point value.
     select case(basisType)
       case(Q_space)
+        !$OMP DO
         do iAnsX = 1, maxPolyDegree+1
           do iAnsY = 1, maxPolyDegree+1
             do iAnsZ = 1, maxPolyDegree+1
@@ -483,19 +495,23 @@ contains
             end do
           end do
         end do
+        !$OMP END DO
       case(P_space)
         iAnsX = 1
         iAnsY = 1
         iAnsZ = 1
 ?? copy :: getDofsPTens(maxPolyDegree, ansPosMax)
+        !$OMP DO
         do ansPos = 1, ansPosMax
           polyVal(ansPos, :) = polyValX(iAnsX,:)     &
             &                    * polyValY(iAnsY,:) &
             &                    * polyValZ(iAnsZ,:)
 ?? copy :: nextModgCoeffPTens(iAnsX, iAnsY, iAnsZ)
         end do
+        !$OMP END DO
     end select
 
+    !$OMP END PARALLEL
 
   end subroutine evalLegendreTensPoly
   ! ************************************************************************ !
diff --git a/source/ply_nodes_module.f90 b/source/ply_nodes_module.f90
index 691edee..13d431d 100644
--- a/source/ply_nodes_module.f90
+++ b/source/ply_nodes_module.f90
@@ -5,13 +5,13 @@ module ply_nodes_module
 
   use tem_aux_module,    only: tem_abort
 
-  use ply_space_integration_module, only: ply_create_surface_gauss_points_cube,   & 
-   &                                      ply_create_surface_gauss_points_cube_2d,& 
-   &                                      ply_create_surface_gauss_points_cube_1d,& 
-   &                                      ply_create_volume_gauss_points_cube,    & 
-   &                                      ply_create_volume_gauss_points_cube_2d, & 
-   &                                      ply_create_volume_gauss_points_cube_1d 
-  
+  use ply_space_integration_module, only: ply_create_surface_gauss_points_cube,   &
+   &                                      ply_create_surface_gauss_points_cube_2d,&
+   &                                      ply_create_surface_gauss_points_cube_1d,&
+   &                                      ply_create_volume_gauss_points_cube,    &
+   &                                      ply_create_volume_gauss_points_cube_2d, &
+   &                                      ply_create_volume_gauss_points_cube_1d
+
    use ply_chebPoint_module,        only: create_volume_cheb_points_cube,           &
    &                                      create_volume_cheb_points_cube_2d,        &
    &                                      create_volume_cheb_points_cube_1d,        &
diff --git a/source/ply_poly_project_module.fpp b/source/ply_poly_project_module.fpp
index 98b8ddf..65658b4 100644
--- a/source/ply_poly_project_module.fpp
+++ b/source/ply_poly_project_module.fpp
@@ -123,7 +123,7 @@ module ply_poly_project_module
 
     !> projection header consits of general information like which kind
     !! of projection is used
-    
+
     !> In the body datatype, there is for each dimension the main data
     !! for the projection method stored
     type(ply_prj_body_type)   :: body_1d
@@ -359,7 +359,7 @@ contains
     me%body_3d%oversamp_dofs = (oversampling_order)**3
     me%body_2d%oversamp_dofs = (oversampling_order)**2
     me%body_1d%oversamp_dofs = oversampling_order
-    
+
     select case (trim(proj_init%header%kind))
     case('fpt')
       ! Fill fpt datatype
@@ -500,6 +500,8 @@ contains
     !--------------------------------------------------------------------------!
     integer :: iVar
     !--------------------------------------------------------------------------!
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar)
+
 
     select case(trim(me%kind))
     case ('l2p')
@@ -509,23 +511,29 @@ contains
       ! additional summation
       select case(dim)
       case (1)
+        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_1D( trafo = me%body_1D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
+        !$OMP END DO
       case (2)
+        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_2D( trafo = me%body_2D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
+        !$OMP END DO
       case (3)
+        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_3D( trafo = me%body_3D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
+        !$OMP END DO
       end select
 
     case ('fpt')
@@ -542,40 +550,50 @@ contains
           &                   legCoeffs = modal_data,     &
           &                   nVars     = nVars           )
       case (1)
+        !$OMP DO
         do iVar = 1,nVars
           call ply_LegToPnt( fpt       = me%body_1d%fpt,     &
             &                pntVal    = nodal_data(:,iVar), &
             &                legCoeffs = modal_data(:,iVar), &
             &                nIndeps   = 1                   )
         end do
+        !$OMP END DO
       end select
 
     case ('fxt')
       select case (dim)
       case (3)
+        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_3D( fxt = me%body_3d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
             &              oversamp_degree = me%oversamp_degree  )
         end do
+        !$OMP END DO
       case (2)
+        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_2D( fxt = me%body_2d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
             &              oversamp_degree = me%oversamp_degree  )
         end do
+        !$OMP END DO
 
       case (1)
+        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_1D( fxt = me%body_1d%fxt,         &
             &               modal_data = modal_data(:,iVar), &
             &               nodal_data = nodal_data(:,iVar)  )
         end do
+        !$OMP END DO
       end select
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine ply_poly_project_m2n_multivar
   !****************************************************************************!
 
@@ -594,28 +612,36 @@ contains
     !--------------------------------------------------------------------------!
     integer :: iVar
     !--------------------------------------------------------------------------!
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar)
+
 
     select case(trim(me%kind))
     case ('l2p')
       select case (dim)
       case (1)
+        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_1D( trafo = me%body_1D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
+        !$OMP END DO
       case (2)
+        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_2D( trafo = me%body_2D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
+        !$OMP END DO
       case (3)
+        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_3D( trafo = me%body_3D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
+        !$OMP END DO
       end select
 
      case ('fpt')
@@ -632,47 +658,57 @@ contains
           &                   pntVal    = nodal_data,     &
           &                   legCoeffs = modal_data      )
       case (1)
+        !$OMP DO
         do iVar = 1,nVars
           call ply_pntToLeg( fpt       = me%body_1d%fpt,     &
             &                nIndeps   = 1,                  &
             &                pntVal    = nodal_data(:,iVar), &
             &                legCoeffs = modal_data(:,iVar)  )
         end do
+        !$OMP END DO
       end select
 
     case ('fxt')
       select case (dim)
       case (3)
-        do iVar = 1, nVars 
+        !$OMP DO
+        do iVar = 1, nVars
           call ply_fxt_n2m_3D(                                  &
             &         fxt              = me%body_3d%fxt,        &
             &         nodal_data       = nodal_data(:,iVar),    &
             &         modal_data       = modal_data(:,iVar),    &
             &         oversamp_degree  = me%oversamp_degree     )
         end do
+        !$OMP END DO
 
       case (2)
-        do iVar = 1, nVars 
+        !$OMP DO
+        do iVar = 1, nVars
           call ply_fxt_n2m_2D(                                  &
             &         fxt              = me%body_2d%fxt,        &
             &         nodal_data       = nodal_data(:,iVar),    &
             &         modal_data       = modal_data(:,iVar),    &
             &         oversamp_degree  = me%oversamp_degree     )
         end do
+        !$OMP END DO
 
       case (1)
-        do iVar = 1, nVars 
+        !$OMP DO
+        do iVar = 1, nVars
           call ply_fxt_n2m_1D(                               &
             &         fxt              = me%body_1d%fxt,     &
             &         nodal_data       = nodal_data(:,iVar), &
             &         modal_data       = modal_data(:,iVar)  )
         end do
+        !$OMP END DO
       end select
 
     case default
        write(logUnit(1),*) 'ERROR in projection nodal to modal'
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine ply_poly_project_n2m_multivar
   !***************************************************************************!
 
diff --git a/source/ply_poly_transformation_module.f90 b/source/ply_poly_transformation_module.f90
index c727fd5..a6e7a1e 100644
--- a/source/ply_poly_transformation_module.f90
+++ b/source/ply_poly_transformation_module.f90
@@ -101,11 +101,13 @@ subroutine ply_Poly_Transformation( subsamp, dofReduction, mesh, meshData,   &
     integer :: nVars, nDofs, nComponents, nChildDofs
     integer :: iVar
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, nComponents, nDofs, workData, nChildDofs)
 
     nVars = size(varDofs)
     allocate(newVarDofs(nVars))
     allocate(newMeshData(nVars))
 
+    !$OMP DO
     varLoop: do iVar = 1, nVars
       nComponents = varComps(iVar)
       nDofs = vardofs(iVar)
@@ -145,6 +147,10 @@ subroutine ply_Poly_Transformation( subsamp, dofReduction, mesh, meshData,   &
       deallocate(workData)
 
     end do varLoop
+    !$OMP END DO
+
+    !$OMP END PARALLEL
+
 
   end subroutine ply_Poly_Transformation
   ! ************************************************************************ !
@@ -193,6 +199,9 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
     real(kind=rk), allocatable :: transform_matrix(:,:)
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iParentElem, iChild, iElem)
+
+
     nChilds = 2**nDims
 
     max_modes = nint(real(nDofs, kind=rk)**(1.0_rk/real(nDims, kind=rk)))
@@ -218,6 +227,7 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
 
     if (subsamp%sampling_lvl > 1) then
 
+      !$OMP DO
       elementLoop: do iParentElem=1,nParentElems
         ! Check if the parent cell was already refined...
         if (refine_tree(iParentElem)) then
@@ -304,7 +314,9 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
           deallocate(childData)
         end if
       end do elementLoop
+      !$OMP END DO
     else
+      !$OMP DO
       elemLoop: do iElem=1,nElems
         if (new_refine_tree(iElem)) then
           ! Create lower and upper indices for all data of iElem in meshData.
@@ -355,10 +367,13 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
           deallocate(childData)
         end if
       end do elemLoop
+      !$OMP END DO
     end if
 
     deallocate(transform_matrix)
 
+    !$OMP END PARALLEL
+
   end subroutine ply_subsampleData
   ! ************************************************************************ !
 
@@ -402,6 +417,7 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs, &
     real(kind=rk), allocatable :: temp_data(:)
     real(kind=rk), allocatable :: childData_prev(:)
     ! -------------------------------------------------------------------- !
+
     parent_modes = nint(real(nParentDofs,kind=rk)        &
       &                 **(1/real(nDimensions,kind=rk)))
 
@@ -708,6 +724,7 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs, &
     deallocate(childData_prev)
     deallocate(temp_Data)
 
+
   end subroutine ply_projDataToChild
   ! ************************************************************************ !
 
@@ -746,6 +763,8 @@ subroutine ply_transform_matrix(max_modes, v)
     integer :: m, orig
     real(kind=rk) :: shifting, scaling
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(orig, m)
+
 
     ! transformation matrix looks like this:
     ! [1.0  --  --     shift=0.5   ]
@@ -768,6 +787,7 @@ subroutine ply_transform_matrix(max_modes, v)
       v(2,2) = scaling
 
       if (max_modes > 2) then
+        !$OMP DO
         do orig = 3,max_modes
           v(1,orig) = ply_beta(orig-1) * v(1,orig-2)                    &
             &       + ply_alpha(orig-1) * shifting * v(1,orig-1)        &
@@ -785,12 +805,14 @@ subroutine ply_transform_matrix(max_modes, v)
             end if
           end do
         end do
+        !$OMP END DO
       end if
 
       ! Due to the symmetry of the problem (the left subinterval has just
       ! the shifting with a changed sign), we can fill the other half of
       ! the matrix by copying the already computed values accordingly with
       ! a change in sign, as needed (alternatingly).
+      !$OMP DO
       do m = 1 , max_modes
         do orig = 1, m-1
           if (mod((m+orig),2) /= 0) then
@@ -800,8 +822,11 @@ subroutine ply_transform_matrix(max_modes, v)
           end if
         end do
       end do
+      !$OMP END DO
     end if
 
+    !$OMP END PARALLEL
+
   end subroutine ply_transform_matrix
   ! ************************************************************************ !
 
diff --git a/source/ply_sampling_adaptive_module.f90 b/source/ply_sampling_adaptive_module.f90
index a7cdbd4..04bdeb6 100644
--- a/source/ply_sampling_adaptive_module.f90
+++ b/source/ply_sampling_adaptive_module.f90
@@ -115,7 +115,7 @@ module ply_sampling_adaptive_module
   type sampled_method_data_type
     type(realarray_type), allocatable :: component(:)
   end type sampled_method_data_type
-    
+
 
 contains
 
@@ -446,7 +446,7 @@ subroutine ply_sample_adaptive( me, ndims, orig_mesh, orig_bcs, varsys,   &
         elemlevel = tem_levelOf(curmesh%treeID(iElem))
         reached_limit(iElem) = lastrefine                                     &
           &                    .or. ( (me%AbsUpperBoundLevel > 0)             &
-          &                    .and. (elemlevel >= me%AbsUpperBoundLevel - 1) ) 
+          &                    .and. (elemlevel >= me%AbsUpperBoundLevel - 1) )
 
         ! Now get the spectral variation (sum of absolute values of all
         ! higher modes).
@@ -906,6 +906,7 @@ pure function sum_abs_mode(iMode, degree, nDims, dat) result(modesum)
     integer :: nModes
     integer :: jMode, kMode
     ! -------------------------------------------------------------------- !
+
     nModes = (degree+1)
 
     select case(nDims)
@@ -930,6 +931,7 @@ pure function sum_abs_mode(iMode, degree, nDims, dat) result(modesum)
         end do
       end do
     end select
+
   end function sum_abs_mode
 
 end module ply_sampling_adaptive_module
diff --git a/source/ply_sampling_module.fpp b/source/ply_sampling_module.fpp
index aebb914..d9651b3 100644
--- a/source/ply_sampling_module.fpp
+++ b/source/ply_sampling_module.fpp
@@ -652,15 +652,21 @@ contains
     integer :: iElem
     integer :: nComps
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iElem)
+
     nComps = fun%nComponents
     datlen = tree%nElems * nComps
 
     call c_f_pointer(fun%method_data, p)
 
+    !$OMP DO
     do iElem=1,n
       res(1+(iElem-1)*nComps:iElem*nComps) &
         & = p%dat(1+(elempos(iElem)-1)*nComps:elempos(iElem)*nComps)
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine get_sampled_element
   ! ************************************************************************ !
diff --git a/source/ply_sampling_varsys_module.f90 b/source/ply_sampling_varsys_module.f90
index cd9d63f..2473da5 100644
--- a/source/ply_sampling_varsys_module.f90
+++ b/source/ply_sampling_varsys_module.f90
@@ -83,6 +83,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
     integer, allocatable :: elempos(:)
     real(kind=rk), allocatable :: elemdat(:)
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, iComponent, iElem, iTotComp)
 
     nVars = trackInst%varmap%varPos%nVals
 
@@ -114,12 +115,14 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
       nComponents = varsys%method%val(varpos)%nComponents
       nDofs = (var_degree(iVar)+1)**nDims
 
+      !$OMP DO
       do iComponent=1,nComponents
         iTotComp = iScalar+iComponent-1
         call ply_sampling_var_allocate( var     = var(iTotComp), &
           &                             nElems  = nElems,        &
           &                             datalen = nDofs*nElems   )
       end do
+      !$OMP END DO
 
       isScalar: if (nComponents == 1) then
 
@@ -141,6 +144,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
         ! To avoid overly large memory consumption, we do this element by
         ! element.
         allocate(elemDat(nComponents*ndofs))
+        !$OMP DO
         do iElem=1,nElems
           call varSys%method%val(varpos)%get_element( &
             &    varSys  = varSys,                    &
@@ -156,6 +160,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
               & = elemdat(iComponent::nComponents)
           end do
         end do
+        !$OMP END DO
         deallocate(elemDat)
 
       end if isScalar
@@ -165,6 +170,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
       ! polynomial degree in each element.
       ! This is redundant, but convenient.
       lastScalar = iScalar+nComponents-1
+      !$OMP DO
       do iComponent=0,nComponents-1
         var(iScalar+iComponent)%first(1) = 1
         do iElem=1,nElems
@@ -173,11 +179,14 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
           var(iScalar+iComponent)%degree(iElem) = var_degree(iVar)
         end do
       end do
+      !$OMP END DO
 
       iScalar = iScalar + nComponents
 
     end do variables
 
+    !$OMP END PARALLEL
+
   end subroutine ply_sampling_varsys_for_track
   ! ------------------------------------------------------------------------ !
   ! ------------------------------------------------------------------------ !
@@ -298,6 +307,7 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
     integer :: nElems
     integer :: ndofs
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iElem, ndofs, absmean, variation)
 
     if (allocated(var%deviates)) deallocate(var%deviates)
     var%nDeviating = 0
@@ -305,6 +315,7 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
     if (allocated(var%first)) then
       nElems = size(var%first)-1
       allocate(var%deviates(nElems))
+      !$OMP DO
       do iElem=1,nElems
         ndofs = var%first(iElem+1) - var%first(iElem) - 1
         absmean = max( abs(var%dat(var%first(iElem))), min_mean )
@@ -312,8 +323,11 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
         var%deviates(iElem) = (variation > threshold*absmean)
         if (var%deviates(iElem)) var%nDeviating = var%nDeviating + 1
       end do
+      !$OMP END DO
     end if
 
+    !$OMP END PARALLEL
+
   end subroutine ply_sampling_var_compute_elemdev
   ! ------------------------------------------------------------------------ !
   ! ------------------------------------------------------------------------ !
diff --git a/source/ply_space_integration_module.f90 b/source/ply_space_integration_module.f90
index 8d7ae83..fc5bb7f 100644
--- a/source/ply_space_integration_module.f90
+++ b/source/ply_space_integration_module.f90
@@ -115,6 +115,7 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: numQuadPoints
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
 
     numQuadPoints = num_intp_per_direction**3
     allocate(points(numQuadPoints,3))
@@ -129,6 +130,7 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
+    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -144,6 +146,10 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
         end do
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
+
   end subroutine ply_create_volume_gauss_points_cube
   ! *********************************************************************** !
 
@@ -166,6 +172,7 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: numQuadPoints
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     numQuadPoints = num_intp_per_direction**2
     allocate(points(numQuadPoints,3))
@@ -180,6 +187,7 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
+    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
@@ -192,6 +200,10 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
         pointNumber = pointNumber + 1
       end do
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
+
   end subroutine ply_create_volume_gauss_points_cube_2d
   ! *********************************************************************** !
 
@@ -213,6 +225,7 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
     real(kind=rk), allocatable :: gaussp1D(:)
     real(kind=rk), allocatable :: weights1D(:)
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i)
 
     allocate(points(num_intp_per_direction,3))
     allocate(weights(num_intp_per_direction))
@@ -226,6 +239,7 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
+    !$OMP DO
     do i = 1, num_intp_per_direction
       !> here we build all possible combinations of the one-dimensional
       !! quadrature points to get the three dimensional values.
@@ -236,6 +250,10 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
       weights(PointNumber) = weights1D(i)
       pointNumber = pointNumber + 1
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
+
   end subroutine ply_create_volume_gauss_points_cube_1d
   ! *********************************************************************** !
 
@@ -265,6 +283,8 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: nquadPoints
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i)
+
     nQuadPoints = num_intp_per_direction**2
 
     allocate(points(nQuadPoints,3))
@@ -282,6 +302,7 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -293,8 +314,10 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
 
     case(2) ! face in y direction, y coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -306,8 +329,10 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
 
     case(3) ! face in z direction, z coord is fixed
+      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -319,6 +344,7 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
+      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in create_surface_gauss_points_cube:' &
@@ -326,6 +352,8 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
 
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine ply_create_surface_gauss_points_cube
   ! *********************************************************************** !
 
@@ -355,6 +383,8 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: nQuadPoints
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i)
+
     ! The number of quadrature points on the boundary of a 2d volume is the
     ! number of quad points in one direction
     nQuadPoints = num_intp_per_direction
@@ -374,6 +404,7 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
+      !$OMP DO
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
         !! quadrature points for 2d case to get the three dimensional values.
@@ -383,8 +414,10 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
         weights(PointNumber) = weights1D(i)
         pointNumber = pointNumber + 1
       end do
+      !$OMP END DO
 
     case(2) ! face in y direction, y coord is fixed
+      !$OMP DO
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
         !! quadrature points in 2d case to get the three dimensional values.
@@ -394,6 +427,7 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
         weights(PointNumber) = weights1D(i)
         pointNumber = pointNumber + 1
       end do
+      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in create_surface_gauss_points_cube_2d:' &
@@ -401,6 +435,8 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
 
     end select
 
+    !$OMP END PARALLEL
+
   end subroutine ply_create_surface_gauss_points_cube_2d
   ! *********************************************************************** !
 
@@ -467,6 +503,8 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
     real(kind=rk), allocatable :: gaussp1D(:)
     real(kind=rk), allocatable :: weights1D(:)
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j)
+
     numQuadPoints = num_intp_per_direction
 
     allocate(points(numQuadPoints,1))
@@ -481,6 +519,7 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
+    !$OMP DO
     do j = 1, num_intp_per_direction
       !> here we build all possible combinations of the one-dimensional
       !! quadrature points to get the three dimensional values.
@@ -488,6 +527,10 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
       weights(PointNumber) = weights1D(j)
       pointNumber = pointNumber + 1
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
+
   end subroutine ply_create_gauss_points_1d
   ! *********************************************************************** !
 
@@ -515,12 +558,14 @@ subroutine ply_gaussLegPoints( x1, x2, x, w, nIntP )
     real(kind=rk) :: EPS
     integer :: m, i, j
     ! ---------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, p1, p2, p3, pp, z1, z)
 
     EPS= 1.0 / (10.0**(PRECISION(1.0_rk)-2) )
     m = (nIntP+1)/2
     xm = 0.5*(x2+x1)
     xl = 0.5*(x2-x1)
 
+    !$OMP DO
     do i = 1, m
 
       z = cos(PI*((i-1)+0.75_rk)/(nIntP+0.5_rk))
@@ -545,6 +590,9 @@ subroutine ply_gaussLegPoints( x1, x2, x, w, nIntP )
       w(nIntp-i+1) = w(i)
 
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_gaussLegPoints
   ! *********************************************************************** !
diff --git a/source/ply_split_element_module.f90 b/source/ply_split_element_module.f90
index a02c1b8..5fd0468 100644
--- a/source/ply_split_element_module.f90
+++ b/source/ply_split_element_module.f90
@@ -126,7 +126,7 @@ end subroutine ply_split_element_init
   !! indices would most likely be better.
   !! Maybe, using explicit shaped arrays and therby allowing more dimensions
   !! in the input, while keeping the interface to two dimensions for all
-  !! cases (the normal direction and all independent degrees of freedom). 
+  !! cases (the normal direction and all independent degrees of freedom).
   !! For vectorization on x86 it also is necessary to have a stride-1 access
   !! only in reading and writing.
   !! The rotation of data might not be the best option because of this.
@@ -219,6 +219,7 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
     integer :: nParents
     integer :: parentpos, childpos
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDir, parentMode, iParent, childMode, indep)
 
     nParents = size(parent_data,2)
 
@@ -232,10 +233,13 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
     ! The number of independent modes (in normal directions) is given
     ! by the product of the length in all directions, except the last one.
     nIndeps = 1
+    !$OMP DO
     do iDir=1,nDims-1
       nIndeps = nIndeps*inLen(iDir)
     end do
+    !$OMP END DO
 
+    !$OMP DO
     oldmodes: do parentMode=1,inLen(nDims)
       ! Maximal number modes to compute, as this is a triangular matrix
       ! it is limited by the diagonal (parentMode). However, it may be
@@ -266,6 +270,9 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
       end do elemloop
 
     end do oldmodes
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
   end subroutine ply_split_element_singleD
   ! ======================================================================== !
diff --git a/source/ply_split_legendre_module.f90 b/source/ply_split_legendre_module.f90
index 2b7ffda..3af9284 100644
--- a/source/ply_split_legendre_module.f90
+++ b/source/ply_split_legendre_module.f90
@@ -346,7 +346,7 @@ subroutine ply_split_legendre_test(success)
       write(*,*) 'Beta check failed'
       RETURN
     end if
- 
+
     ! Expected properties for alpha_frac:
     ! * It should be the same as dividing the corresponding values of alpha
     ! * If both modes are identic, it should be one
diff --git a/source/ply_transfer_module.fpp b/source/ply_transfer_module.fpp
index 35a2a99..5f87f8a 100644
--- a/source/ply_transfer_module.fpp
+++ b/source/ply_transfer_module.fpp
@@ -123,12 +123,14 @@ contains
 
       ispace_oq: if (inspace == Q_Space) then
 
+        !$OMP DO
         ! Both, output and input are Q Polynomials
         do out_Y=0,minord-1
           out_off = out_Y*(outdegree+1)
           in_off = out_Y*(indegree+1)
           outdat(out_off+1:out_off+minord) = indat(in_off+1:in_off+minord)
         end do
+        !$OMP END DO
 
       else ispace_oq
 

From 8f40ed583de53c43ecd581580ee291554d9ad6d1 Mon Sep 17 00:00:00 2001
From: Robin Weihe <robin.weihe@student.uni-siegen.de>
Date: Tue, 30 Oct 2018 09:38:11 +0100
Subject: [PATCH 04/21] rams%b and params%u were removed from params. They were
 put in their own datatype. The modules where params is used were updated.
 ply_legFpt_module, ply_l2p_module, ply_leg_diff_module and
 ply_oversample_module were OpenMP parallelized.

--HG--
branch : feature/openmp
---
 source/ply_oversample_module.fpp | 48 +++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/source/ply_oversample_module.fpp b/source/ply_oversample_module.fpp
index aa8780b..38aab0b 100644
--- a/source/ply_oversample_module.fpp
+++ b/source/ply_oversample_module.fpp
@@ -155,7 +155,9 @@ contains
     nScalars = size(modalCoeffs,2)
     maxorders = 3*(poly_proj%min_degree + 1)-2
 
+    !$OMP WORKSHARE
     modalCoeffs = 0.0_rk
+    !$OMP END WORKSHARE
 
     if (poly_proj%basisType == Q_Space) then
       posQ: if (present(ensure_positivity)) then
@@ -163,6 +165,7 @@ contains
         varQ: do iVar=1,nScalars
           if (ensure_positivity(iVar)) then
             ordersum = 0.0_rk
+            !$OMP SINGLE
             do dof = 1, mpd1_cube
               iDegZ = (dof-1)/mpd1_square + 1
               iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -178,9 +181,11 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varQ
         do iVar=1,nScalars
+          !$OMP DO
           do dof = 1, mpd1_cube
             iDegZ = (dof-1)/mpd1_square + 1
             iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -193,8 +198,10 @@ contains
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end if
           end do
+          !$OMP END DO
         end do
       else posQ
+        !$OMP DO
         do dof = 1, mpd1_cube
           iDegZ = (dof-1)/mpd1_square + 1
           iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -206,6 +213,7 @@ contains
             modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
           end do
         end do
+        !$OMP END DO
       end if posQ
 
     else !P_Space
@@ -218,6 +226,7 @@ contains
             iDegY = 1
             iDegZ = 1
             ordersum = 0.0_rk
+            !$OMP SINGLE
             do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
               iOrd = iDegX+iDegY+iDegZ-2
@@ -232,12 +241,14 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varP
         do iVar=1,nScalars
           iDegX = 1
           iDegY = 1
           iDegZ = 1
+          !$OMP SINGLE
           do idof = 1, poly_proj%body_3d%min_dofs
             iOrd = iDegX+iDegY+iDegZ-2
             if (iOrd > ord_lim) EXIT
@@ -248,11 +259,13 @@ contains
             modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
           end do
+          !$OMP END SINGLE
         end do
       else posP
         iDegX = 1
         iDegY = 1
         iDegZ = 1
+        !$OMP SINGLE
         do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
           dofOverSamp = iDegX + ( iDegY-1  &
@@ -261,6 +274,7 @@ contains
           modalCoeffs(dofOverSamp,:) = state(dof,:)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
         end do
+        !$OMP END SINGLE
       end if posP
 
     end if
@@ -302,6 +316,7 @@ contains
     nScalars = size(modalCoeffs,2)
 
     if (poly_proj%basisType == Q_Space) then
+      !$OMP DO
       do iVar=1,nScalars
         do dof = 1, mpd1_cube
           iDegZ = (dof-1)/mpd1_square + 1
@@ -313,12 +328,14 @@ contains
           state(dof,iVar) = modalCoeffs(dofOverSamp,iVar)
         end do
       end do
+      !$OMP END DO
 
     else !P_Space
 
       iDegX = 1
       iDegY = 1
       iDegZ = 1
+      !$OMP SINGLE
       do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
         dofOverSamp = iDegX + ( iDegY-1  &
@@ -327,6 +344,7 @@ contains
         state(dof,:) = modalCoeffs(dofOverSamp,:)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
       end do
+      !$OMP END SINGLE
     end if
 
   end subroutine ply_convertFromoversample_3d
@@ -390,7 +408,9 @@ contains
     mpd1_square = mpd1**2
 
     ! Initialize oversampled space correct to 0
+    !$OMP WORKSHARE
     modalCoeffs(:,:) = 0.0_rk
+    !$OMP END WORKSAHRE
 
     if (poly_proj%basisType == Q_Space) then
       posQ: if (present(ensure_positivity)) then
@@ -398,6 +418,7 @@ contains
         varQ: do iVar=1,nPVars
           if (ensure_positivity(iVar)) then
             ordersum = 0.0_rk
+            !$OMP SINGLE
             do dof = 1, mpd1_square
               iDegX = mod(dof-1,mpd1)+1
               iDegY = (dof-1)/mpd1+1
@@ -412,9 +433,11 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varQ
         do iVar=1,nPVars
+          !$OMP DO
           do dof = 1, mpd1_square
             iDegX = mod(dof-1,mpd1)+1
             iDegY = (dof-1)/mpd1+1
@@ -424,14 +447,17 @@ contains
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end if
           end do
+          !$OMP END DO
         end do
       else posQ
+      !$OMP DO
         do dof = 1, mpd1_square
           iDegX = mod(dof-1,mpd1)+1
           iDegY = (dof-1)/mpd1+1
           dofOverSamp = 1 + (iDegX-1) + (iDegY-1)*(oversamp_degree+1)
           modalCoeffs(dofOverSamp,1:nPVars) = state(dof,1:nPVars)
         end do
+        !$OMP END DO
       end if posQ
 
     else !P_Space
@@ -440,6 +466,7 @@ contains
         ord_lim = maxorders
         varP: do iVar=1,nPVars
           if (ensure_positivity(iVar)) then
+            !$OMP SINGLE
             iDegX = 1
             iDegY = 1
             ordersum = 0.0_rk
@@ -457,9 +484,11 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varP
         do iVar=1,nPVars
+          !$OMP SINGLE
           iDegX = 1
           iDegY = 1
           do idof = 1, poly_proj%body_2d%min_dofs
@@ -470,8 +499,10 @@ contains
             modalCoeffs(dofOverSamp,1:nPVars) = state(dof,1:nPVars)
 ?? copy :: nextModgCoeffPTens2D(iDegX, iDegY)
           end do
+          !$OMP END SINGLE
         end do
       else posP
+        !$OMP SINGLE
         iDegX = 1
         iDegY = 1
         do idof = 1, poly_proj%body_2d%min_dofs
@@ -480,6 +511,7 @@ contains
           modalCoeffs(dofOverSamp,1:nPVars) = state(dof,1:nPVars)
 ?? copy :: nextModgCoeffPTens2D(iDegX, iDegY)
         end do
+        !$OMP END SINGLE
       end if posP
 
     end if
@@ -530,15 +562,18 @@ contains
 
     if (poly_proj%basisType == Q_Space) then
 
+      !$OMP DO
       do dof = 1, mpd1_square
         iDegX = mod(dof-1,mpd1)+1
         iDegY = (dof-1)/mpd1+1
         dofOverSamp = 1 + (iDegX-1) + (iDegY-1)*(oversamp_degree+1)
         state(dof,1:nPVars) = modalCoeffs(dofOverSamp,1:nPVars)
       end do
+      !$OMP END DO
 
     else !P_Space
-
+      
+      !$OMP SINGLE
       iDegX = 1
       iDegY = 1
       iDegZ = 0 ! not used in posOfModgCoeffPTens_2D, nextModgCoeffPTens
@@ -548,6 +583,7 @@ contains
         state(dof,1:nPVars) = modalCoeffs(dofOverSamp,1:nPVars)
 ?? copy :: nextModgCoeffPTens2D(iDegX, iDegY)
       end do
+      !$OMP END SINGLE
 
     end if
 
@@ -597,13 +633,16 @@ contains
     nPVars = (poly_proj%maxPolyDegree+1)*nVars
 
     ! Initialize oversampled space correct to 0
+    !$OMP WORKSHARE
     ModalCoeffs(:,:) = 0.0_rk
+    !$OMP END WORKSHARE
 
     if (present(ensure_positivity)) then
       ord_lim = poly_proj%min_degree+1
       do iVar = 1,nVars
         if (ensure_positivity(iVar)) then
           varSum = 0.0_rk
+          !$OMP SINGLE
           do iPoint=2,ord_lim
             varSum = varSum + abs(state(iPoint,iVar))
             if (varSum >= state(1,iVar)) then
@@ -611,19 +650,24 @@ contains
               EXIT
             end if
           end do
+          !$OMP END SINGLE
         end if
       end do
       do iVar=1,nVars
+        !$OMP DO
         do iPoint=1,ord_lim
           ModalCoeffs(iPoint,iVar) = state(iPoint,iVar)
         end do
+        !$OMP END DO
       end do
     else
+      !$OMP DO
       do iVP = 1,nPVars
         iVar = (iVP-1)/(poly_proj%min_degree+1) + 1
         iPoint = iVP - (iVar-1)*(poly_proj%min_degree+1)
         ModalCoeffs(iPoint,iVar) = state(iPoint,iVar)
       end do
+      !$OMP END DO
     end if
 
   end subroutine ply_convert2oversample_1d
@@ -659,11 +703,13 @@ contains
       nPVars = (poly_proj%maxPolyDegree+1)*size(state,2)
     end if
 
+    !$OMP DO
     do iVP = 1,nPVars
       iVar = (iVP-1)/(poly_proj%min_degree+1) + 1
       iPoint = iVP - (iVar-1)*(poly_proj%min_degree+1)
       state(iPoint,iVar) = modalCoeffs(iPoint,iVar)
     end do
+    !$OMP END DO
 
   end subroutine ply_convertFromoversample_1d
   ! ************************************************************************ !

From 21f5a440f74c82df5eadbd6b2013eeb96775667f Mon Sep 17 00:00:00 2001
From: Robin Weihe <robin.weihe@student.uni-siegen.de>
Date: Tue, 30 Oct 2018 15:13:26 +0100
Subject: [PATCH 05/21] params%b and params%u were removed from params for
 parallel implementation. They were put in their own Datatype. The modules
 using params were updated. Open MP implementation in ply_legfpt_module,
 ply_l2p_module,ply_leg_diff_module and ply_oversample_module

--HG--
branch : feature/openmp
---
 source/fpt/ply_chebPoint_module.f90           | 73 --------------
 source/fpt/ply_legFpt_2D_module.fpp           | 44 ++++-----
 source/fpt/ply_legFpt_3D_module.fpp           | 35 ++++---
 source/fpt/ply_legFpt_module.f90              | 51 +++++++---
 source/fpt/ply_polyBaseExc_module.fpp         | 98 ++++++++++++-------
 source/ply_LegPolyProjection_module.f90       | 23 +----
 source/ply_fxt_module.f90                     | 36 +------
 source/ply_l2p_module.f90                     |  2 +-
 source/ply_leg_diff_module.fpp                | 16 ++-
 source/ply_legser_module.f90                  |  6 --
 source/ply_modg_basis_module.fpp              | 16 +--
 source/ply_oversample_module.fpp              | 46 +++++++++
 source/ply_poly_project_module.fpp            | 67 +++++--------
 source/ply_poly_transformation_module.f90     | 19 ----
 source/ply_sampling_module.fpp                |  4 -
 source/ply_sampling_varsys_module.f90         | 15 +--
 source/ply_space_integration_module.f90       | 35 -------
 source/ply_split_element_module.f90           |  7 +-
 source/ply_transfer_module.fpp                |  2 -
 .../ply_fpt_2D_lobattoNodes_test.fpp          | 10 +-
 .../ply_fpt_2D_singVar_lobattoNodes_test.fpp  | 10 +-
 utests/with_fftw/ply_fpt_2D_singVar_test.fpp  | 10 +-
 utests/with_fftw/ply_fpt_2D_test.fpp          |  9 +-
 .../ply_fpt_3D_lobattoNodes_test.fpp          |  9 +-
 .../with_fftw/ply_fpt_3D_performance_test.f90 | 33 ++++---
 utests/with_fftw/ply_fpt_3D_test.fpp          |  9 +-
 ...fpt_ifpt_2D_multiVar_lobattoNodes_test.f90 | 39 ++++----
 .../ply_fpt_ifpt_2D_multiVar_test.f90         | 35 ++++---
 ..._fpt_ifpt_2D_singVar_lobattoNodes_test.f90 | 15 ++-
 .../ply_fpt_ifpt_2D_singVar_test.f90          | 35 ++++---
 ...fpt_ifpt_3D_multiVar_lobattoNodes_test.f90 | 35 ++++---
 .../ply_fpt_ifpt_3D_multiVar_test.f90         | 33 ++++---
 ..._fpt_ifpt_3D_singVar_lobattoNodes_test.f90 | 35 ++++---
 .../ply_fpt_ifpt_3D_singVar_test.f90          | 35 ++++---
 utests/with_fftw/ply_fpt_ifpt_test.f90        | 24 +++--
 .../with_fftw/ply_fpt_lobattoNodes_test.f90   | 32 +++---
 utests/with_fftw/ply_fpt_test.f90             |  8 +-
 .../ply_ifpt_3D_singVar_lobattoNodes_test.fpp | 10 +-
 utests/with_fftw/ply_ifpt_3D_singVar_test.fpp | 10 +-
 .../with_fftw/ply_ifpt_lobattoNodes_test.f90  | 32 +++---
 utests/with_fftw/ply_ifpt_test.f90            | 33 ++++---
 41 files changed, 525 insertions(+), 571 deletions(-)

diff --git a/source/fpt/ply_chebPoint_module.f90 b/source/fpt/ply_chebPoint_module.f90
index 01dc4f0..c9a4b1c 100644
--- a/source/fpt/ply_chebPoint_module.f90
+++ b/source/fpt/ply_chebPoint_module.f90
@@ -39,17 +39,11 @@ subroutine ply_chebPoint_1D( nPoints, chebPnt1D )
     ! -------------------------------------------------------------------- !
     integer :: iPoint
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iPoint)
-
     allocate(chebPnt1D(nPoints))
-    !$OMP DO
     do iPoint = 1, nPoints
       chebPnt1D(iPoint) = -1.0_rk &
         & * cos( PI / nPoints * ( (iPoint - 1.0_rk) + 1.0_rk / 2.0_rk ) )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_chebPoint_1D
   ! ************************************************************************ !
@@ -70,16 +64,11 @@ subroutine ply_lobattoChebPoint_1D( nPoints, chebPnt1D )
     ! -------------------------------------------------------------------- !
     integer :: iPoint
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iPoint)
 
     allocate(chebPnt1D(nPoints))
-    !$OMP DO
     do iPoint = 1, nPoints
       chebPnt1D(iPoint) = cos( ( iPoint - 1.0_rk ) * PI / ( nPoints - 1.0_rk ) )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_lobattoChebPoint_1D
   ! ************************************************************************ !
@@ -96,14 +85,12 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
     nquadpoints = num_intp_per_direction**3
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -116,9 +103,6 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
         end do
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube
   ! ************************************************************************ !
@@ -136,14 +120,12 @@ subroutine create_volume_lobattocheb_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
     nquadpoints = num_intp_per_direction**3
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -156,9 +138,6 @@ subroutine create_volume_lobattocheb_points_cube( num_intp_per_direction, &
         end do
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube
   ! ************************************************************************ !
@@ -176,8 +155,6 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
-
 
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
@@ -185,7 +162,6 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         ! here we build all possible combinations of the one-dimensional
@@ -196,9 +172,6 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
         pointNumber = pointNumber + 1
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube_2d
   ! ************************************************************************ !
@@ -218,14 +191,12 @@ subroutine create_volume_lobattocheb_points_cube_2d(num_intp_per_direction, poin
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         ! here we build all possible combinations of the one-dimensional
@@ -236,9 +207,6 @@ subroutine create_volume_lobattocheb_points_cube_2d(num_intp_per_direction, poin
         pointNumber = pointNumber + 1
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube_2d
   ! ************************************************************************ !
@@ -254,13 +222,11 @@ subroutine create_volume_cheb_points_cube_1d( num_intp_per_direction, points )
     integer :: pointNumber
     real(kind=rk), allocatable :: chebPnt1D(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     allocate(points(num_intp_per_direction,3))
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do i = 1, num_intp_per_direction
       ! here we build all possible combinations of the one-dimensional
       ! points to get the three dimensional values.
@@ -269,9 +235,6 @@ subroutine create_volume_cheb_points_cube_1d( num_intp_per_direction, points )
       points(pointNumber, 3) = 0.0_rk
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube_1d
   ! ************************************************************************ !
@@ -290,13 +253,11 @@ subroutine create_volume_lobattocheb_points_cube_1d( num_intp_per_direction, &
     integer :: pointNumber
     real(kind=rk), allocatable :: chebPnt1D(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     allocate(points(num_intp_per_direction,3))
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do i = 1, num_intp_per_direction
       ! here we build all possible combinations of the one-dimensional
       ! points to get the three dimensional values.
@@ -305,9 +266,6 @@ subroutine create_volume_lobattocheb_points_cube_1d( num_intp_per_direction, &
       points(pointNumber, 3) = 0.0_rk
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube_1d
   ! ************************************************************************ !
@@ -333,7 +291,6 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
@@ -344,7 +301,6 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = (-1.0_rk)**align
@@ -353,9 +309,7 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -364,9 +318,7 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(3) ! face in z direction, z coord is fixes
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber , 1) = chebPnt1D(i)
@@ -375,14 +327,11 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube: unknown ' &
         & // 'face direction'                                              )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_cheb_points_cube
   ! ************************************************************************ !
 
@@ -407,7 +356,6 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     nquadpoints = num_intp_per_direction**2
     allocate(points(nquadpoints,3))
@@ -418,7 +366,6 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = (-1.0_rk)**align
@@ -427,9 +374,7 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -438,9 +383,7 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(3) ! face in z direction, z coord is fixes
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -449,14 +392,11 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_lobattocheb_points_cube:' &
         & // ' unknown face direction'                                   )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_lobattocheb_points_cube
   ! ************************************************************************ !
 
@@ -480,7 +420,6 @@ subroutine create_surface_cheb_points_cube_2d( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     nquadpoints = num_intp_per_direction
     allocate(points(nquadpoints,3))
@@ -491,29 +430,24 @@ subroutine create_surface_cheb_points_cube_2d( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = (-1.0_rk)**align
         points(pointNumber, 2) = chebPnt1D(i)
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = chebPnt1D(i)
         points(pointNumber, 2) = (-1.0_rk)**align
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube_2d:' &
         & // ' unknown face direction'                               )
     end select
 
-    !$OMP END PARALLEL
 
   end subroutine create_surface_cheb_points_cube_2d
   ! ************************************************************************ !
@@ -538,7 +472,6 @@ subroutine create_surface_lobattocheb_points_cube_2d( &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     nquadpoints = num_intp_per_direction
     allocate(points(nquadpoints,3))
@@ -549,30 +482,24 @@ subroutine create_surface_lobattocheb_points_cube_2d( &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = (-1.0_rk)**align
         points(pointNumber, 2) = chebPnt1D(i)
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = chebPnt1D(i)
         points(pointNumber, 2) = (-1.0_rk)**align
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube_2d:' &
         & // ' unknown face direction'                               )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_lobattocheb_points_cube_2d
   ! ************************************************************************ !
 
diff --git a/source/fpt/ply_legFpt_2D_module.fpp b/source/fpt/ply_legFpt_2D_module.fpp
index 2f62583..81343c8 100644
--- a/source/fpt/ply_legFpt_2D_module.fpp
+++ b/source/fpt/ply_legFpt_2D_module.fpp
@@ -12,6 +12,7 @@ module ply_legFpt_2D_module
   use ply_legFpt_module,  only: ply_legFpt_type, &
     &                           ply_legToPnt, &
     &                           ply_pntToLeg, &
+    &                           ply_legFpt_bu_type, &
     &                           assignment(=)
 
   implicit none
@@ -37,10 +38,11 @@ contains
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_legToPnt_2D_singVar( fpt, legCoeffs, pntVal )
+  subroutine ply_legToPnt_2D_singVar( fpt, legCoeffs, pntVal, bu )
    ! --------------------------------------------------------------------- !
    !> The FPT parameters.
    type(ply_legFpt_type), intent(inout) :: fpt
+   type(ply_legFpt_bu_type), intent(inout) :: bu
    !> The Legendre coefficients to convert to point values (Chebyshev nodes).
    !! \attention Although this array serves as input only, it is modified
    !! inside of this routine by the underlying FPT algorithm. So, when
@@ -54,8 +56,6 @@ contains
    real(kind=rk), dimension(:), allocatable :: alph
    real(kind=rk), dimension(:), allocatable :: gam
    ! --------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iAlph, nIndeps)
-
    striplen = fpt%legToChebParams%striplen
    n = fpt%legToChebParams%n
 
@@ -70,7 +70,7 @@ contains
    !  1  4  7
    !  2  5  8
    !  3  6  9
-   !$OMP DO
+
    yStripLoop: do iStrip = 1, n, striplen
      ! iAlph is the index of the first element in a line for the transformation
      ! in y-direction.
@@ -84,16 +84,15 @@ contains
      call ply_legToPnt( fpt       = fpt,     &
        &                nIndeps   = nIndeps, &
        &                legCoeffs = alph,    &
-       &                pntVal    = gam      )
+       &                pntVal    = gam,     &
+       &                bu        = bu       )
 
      ! Write gam to pntVal array
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
    end do yStripLoop
-   !$OMP END DO
 
    ! x-direction
-   !$OMP DO
    xStripLoop: do iStrip = 1, n, striplen
      do iAlph = iStrip, min(iStrip+striplen-1, n)
        alph((iAlph-iStrip)*n+1:(iAlph-iStrip+1)*n) = pntVal(iAlph::n) !ztrafo
@@ -105,14 +104,13 @@ contains
      call ply_legToPnt( fpt       = fpt,     &
        &                nIndeps   = nIndeps, &
        &                legCoeffs = alph,    &
-       &                pntVal    = gam      )
+       &                pntVal    = gam,     &
+       &                bu        = bu       )
 
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
    end do xStripLoop
-   !$OMP END DO
 
-   !$OMP END PARALLEL
 
   end subroutine ply_legToPnt_2D_singVar
   ! ************************************************************************ !
@@ -121,10 +119,11 @@ contains
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_legToPnt_2D_multVar( fpt, legCoeffs, pntVal, nVars )
+  subroutine ply_legToPnt_2D_multVar( fpt, legCoeffs, pntVal, nVars, bu )
    ! --------------------------------------------------------------------- !
    !> The FPT parameters.
    type(ply_legFpt_type), intent(inout) :: fpt
+   type(ply_legFpt_bu_type), intent(inout) :: bu
    !> The Legendre coefficients to convert to point values (Chebyshev nodes).
    !! \attention Although this array serves as input only, it is modified
    !! inside of this routine by the underlying FPT algorithm. So, when
@@ -140,7 +139,7 @@ contains
    ! --------------------------------------------------------------------- !
 
    do iVar = 1, nVars
-     call ply_legToPnt_2D(fpt, legCoeffs(:,iVar), pntVal(:,iVar))
+     call ply_legToPnt_2D(fpt, legCoeffs(:,iVar), pntVal(:,iVar), bu)
    end do
 
   end subroutine ply_legToPnt_2D_multVar
@@ -150,10 +149,11 @@ contains
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_pntToLeg_2D_singVar( fpt, pntVal, legCoeffs )
+  subroutine ply_pntToLeg_2D_singVar( fpt, pntVal, legCoeffs, bu )
    ! --------------------------------------------------------------------- !
     !> Parameters of the Fast Polynomial transformation.
     type(ply_legFpt_type), intent(inout) :: fpt
+    type(ply_legFpt_bu_type), intent(inout) :: bu
     !> The point values to transform to 2D modal Legendre expansion.
     !! \attention Although this array serves as input only, it is modified
     !! inside of this routine by the underlying DCT algorithm. So, when
@@ -167,7 +167,6 @@ contains
     real(kind=rk), dimension(:), allocatable :: alph
     real(kind=rk), dimension(:), allocatable :: gam
    ! --------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(nIndeps, iStrip, iAlph)
 
     striplen = fpt%chebToLegParams%striplen
     n = fpt%legToChebParams%n
@@ -176,7 +175,6 @@ contains
     allocate(alph(min(striplen, n)*n))
     allocate(gam(min(striplen, n)*n))
 
-    !$OMP DO
     yStripLoop: do iStrip = 1, n, striplen
       do iAlph = iStrip, min(iStrip+striplen-1, n) !y_Trafo
         alph((iAlph-iStrip)*n+1:(iAlph-iStrip+1)*n) = pntVal(iAlph::n)
@@ -188,16 +186,16 @@ contains
       call ply_pntToLeg( fpt       = fpt,     &
         &                nIndeps   = nIndeps, &
         &                legCoeffs = gam,     &
-        &                pntVal    = alph     )
+        &                pntVal    = alph,    &
+        &                bu        = bu       )
 
       ! temp -> pntVal (stride-1 writing)
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
     end do yStripLoop ! iStrip
-    !$OMP END DO
+
 
     ! x-direction
-    !$OMP DO
     xStripLoop: do iStrip = 1,n,striplen
       do iAlph = iStrip, min(iStrip+striplen-1, n)
         !ztrafo
@@ -210,14 +208,13 @@ contains
       call ply_pntToLeg( fpt       = fpt,     &
         &                nIndeps   = nIndeps, &
         &                legCoeffs = gam,     &
-        &                pntVal    = alph     )
+        &                pntVal    = alph,    &
+        &                bu        = bu       )
 
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
     end do xStripLoop
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg_2D_singVar
   ! ************************************************************************ !
@@ -226,10 +223,11 @@ contains
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_pntToLeg_2D_multVar( fpt, pntVal, legCoeffs, nVars )
+  subroutine ply_pntToLeg_2D_multVar( fpt, pntVal, legCoeffs, nVars, bu )
    ! --------------------------------------------------------------------- !
    !> Parameters of the Fast Polynomial transformation.
    type(ply_legFpt_type), intent(inout) :: fpt
+   type(ply_legFpt_bu_type), intent(inout) :: bu
    !> The point values to transform to 2D modal Legendre expansion.
    !! \attention Although this array serves as input only, it is modified
    !! inside of this routine by the underlying DCT algorithm. So, when
@@ -245,7 +243,7 @@ contains
    ! --------------------------------------------------------------------- !
 
    do iVar = 1, nVars
-     call ply_pntToLeg_2D(fpt, pntVal(:,iVar), legCoeffs(:,iVar))
+     call ply_pntToLeg_2D(fpt, pntVal(:,iVar), legCoeffs(:,iVar), bu)
    end do
 
   end subroutine ply_pntToLeg_2D_multVar
diff --git a/source/fpt/ply_legFpt_3D_module.fpp b/source/fpt/ply_legFpt_3D_module.fpp
index 4acfeef..79daf4a 100644
--- a/source/fpt/ply_legFpt_3D_module.fpp
+++ b/source/fpt/ply_legFpt_3D_module.fpp
@@ -8,6 +8,7 @@ module ply_legFpt_3D_module
   use env_module,         only: rk
   use ply_legFpt_module,  only: ply_legFpt_type, &
     &                           ply_legToPnt, &
+    &                           ply_legFpt_bu_type, &
     &                           ply_PntToLeg
 
   implicit none
@@ -31,10 +32,11 @@ contains
 
 
   ! ************************************************************************ !
-  subroutine ply_legToPnt_3D_singvar( fpt, legCoeffs, pntVal )
+  subroutine ply_legToPnt_3D_singvar( fpt, legCoeffs, pntVal, bu )
    ! --------------------------------------------------------------------- !
    !> The FPT parameters.
    type(ply_legFpt_type), intent(inout) :: fpt
+   type(ply_legFpt_bu_type), intent(inout) :: bu
    !> The Legendre coefficients to convert to point values (Chebyshev nodes).
    !! \attention Although this array serves as input only, it is modified
    !! inside of this routine by the underlying FPT algorithm. So, when
@@ -115,7 +117,8 @@ contains
      call ply_legToPnt( fpt       = fpt,     &
        &                nIndeps   = nIndeps, &
        &                legCoeffs = alph,    &
-       &                pntVal    = gam      )
+       &                pntVal    = gam,     &
+       &                bu        = bu       )
 
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
@@ -134,7 +137,8 @@ contains
      call ply_legToPnt( fpt       = fpt,     &
        &                nIndeps   = nIndeps, &
        &                legCoeffs = alph,    &
-       &                pntVal    = gam      )
+       &                pntVal    = gam,     &
+       &                bu        = bu       )
 
      legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
@@ -152,7 +156,8 @@ contains
      call ply_legToPnt( fpt       = fpt,     &
        &                nIndeps   = nIndeps, &
        &                legCoeffs = alph,    &
-       &                pntVal    = gam      )
+       &                pntVal    = gam,     &
+       &                bu        = bu       )
 
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
@@ -167,7 +172,7 @@ contains
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
   !!VK: no multivar fashion of this routine is used anymore
-  subroutine ply_legToPnt_3D_multVar( fpt, legCoeffs, pntVal, nVars )
+  subroutine ply_legToPnt_3D_multVar( fpt, legCoeffs, pntVal, nVars, bu )
    ! --------------------------------------------------------------------- !
    !> The Legendre coefficients to convert to point values (Chebyshev nodes).
    !! \attention Although this array serves as input only, it is modified
@@ -176,6 +181,7 @@ contains
    !! be modified.
    real(kind=rk), intent(inout) :: legCoeffs(:,:)
    type(ply_legFpt_type), intent(inout) :: fpt
+   type(ply_legFpt_bu_type), intent(inout) :: bu
    real(kind=rk), intent(inout) :: pntVal(:,:)
    integer, intent(in) :: nVars
    ! --------------------------------------------------------------------- !
@@ -183,7 +189,7 @@ contains
    ! --------------------------------------------------------------------- !
 
    do iVar = 1, nVars
-    call ply_legToPnt_3D( fpt, legCoeffs(:,iVar), pntVal(:,iVar) )
+    call ply_legToPnt_3D( fpt, legCoeffs(:,iVar), pntVal(:,iVar), bu )
    end do
 
   end subroutine ply_legToPnt_3D_multVar
@@ -193,9 +199,10 @@ contains
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_pntToLeg_3D_multVar( fpt, pntVal, legCoeffs, nVars )
+  subroutine ply_pntToLeg_3D_multVar( fpt, pntVal, legCoeffs, nVars, bu )
    ! --------------------------------------------------------------------- !
    type(ply_legFpt_type), intent(inout) :: fpt
+   type(ply_legFpt_bu_type), intent(inout) :: bu
    !> The point values to transform to 3D modal Legendre expansion.
    !! \attention Although this array serves as input only, it is modified
    !! inside of this routine by the underlying DCT algorithm. So, when
@@ -209,7 +216,7 @@ contains
    ! --------------------------------------------------------------------- !
 
    do iVar = 1, nVars
-     call ply_pntToLeg_3D( fpt, pntVal(:,iVar), legCoeffs(:,iVar) )
+     call ply_pntToLeg_3D( fpt, pntVal(:,iVar), legCoeffs(:,iVar), bu )
    end do
 
   end subroutine ply_pntToLeg_3D_multVar
@@ -219,9 +226,10 @@ contains
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_pntToLeg_3D_singVar( fpt, pntVal, legCoeffs )
+  subroutine ply_pntToLeg_3D_singVar( fpt, pntVal, legCoeffs, bu )
    ! --------------------------------------------------------------------- !
     type(ply_legFpt_type), intent(inout) :: fpt
+    type(ply_legFpt_bu_type), intent(inout) :: bu
     !> The point values to transform to 3D modal Legendre expansion.
     !! \attention Although this array serves as input only, it is modified
     !! inside of this routine by the underlying DCT algorithm. So, when
@@ -268,7 +276,8 @@ contains
       call ply_pntToLeg( fpt       = fpt,     &
         &                nIndeps   = nIndeps, &
         &                legCoeffs = gam,     &
-        &                pntVal    = alph     )
+        &                pntVal    = alph,    &
+        &                bu        = bu       )
 
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
@@ -290,7 +299,8 @@ contains
       call ply_pntToLeg( fpt       = fpt,     &
         &                nIndeps   = nIndeps, &
         &                legCoeffs = gam,     &
-        &                pntVal    = alph     )
+        &                pntVal    = alph,    &
+        &                bu        = bu       )
 
         ! todo: fft on temp
         ! temp -> pntVal (stride-1 writing)
@@ -310,7 +320,8 @@ contains
       call ply_pntToLeg( fpt       = fpt,     &
         &                nIndeps   = nIndeps, &
         &                legCoeffs = gam,     &
-        &                pntVal    = alph     )
+        &                pntVal    = alph,    &
+        &                bu        = bu       )
 
       ! todo: fft on temp
       ! temp -> pntVal (stride-1 writing)
diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 8d0678f..8593a19 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -11,6 +11,7 @@ module ply_legFpt_module
     &                               ply_fpt_single,        &
     &                               ply_legToCheb_param,   &
     &                               ply_chebToLeg_param,   &
+    &                               ply_bu_type,           &
     &                               assignment(=)
   use fftw_wrap
 
@@ -47,11 +48,18 @@ module ply_legFpt_module
     logical :: use_lobatto_points
   end type ply_legFpt_type
 
+  type ply_legFpt_bu_type
+    type(ply_bu_type) :: legToCheb_bu
+    type(ply_bu_type) :: chebToLeg_bu
+  end type ply_legFpt_bu_type
+
   interface assignment(=)
     module procedure Copy_fpt
+    module procedure Copy_fpt_bu
   end interface
 
   public :: ply_legFpt_type, ply_init_legFpt, ply_legToPnt, ply_pntToLeg
+  public :: ply_legFpt_bu_type
   public :: assignment(=)
 
 
@@ -77,18 +85,31 @@ end subroutine Copy_fpt
   ! ************************************************************************ !
 
 
+  ! ************************************************************************ !
+  subroutine Copy_fpt_bu(left, right)
+    ! -------------------------------------------------------------------- !
+    type(ply_legFpt_bu_type), intent(out) :: left
+    type(ply_legFpt_bu_type), intent(in) :: right
+    ! -------------------------------------------------------------------- !
+     left%legToCheb_bu = right%legToCheb_bu
+     left%chebToLeg_bu = right%chebToLeg_bu
+  end subroutine Copy_fpt_bu
+  ! ************************************************************************ !
+
+
   ! ************************************************************************ !
   !> Subroutine to initialize the fast polynomial transformation
   !! for Legendre expansion.
   subroutine ply_init_legFpt( maxPolyDegree, nIndeps, fpt, blocksize, &
     &                         approx_terms, striplen, lobattoPoints,  &
-    &                         subblockingWidth, fft_flags             )
+    &                         subblockingWidth, fft_flags, bu         )
     ! -------------------------------------------------------------------- !
     integer, intent(in) :: maxPolyDegree
 
     !> Number of independent values that can be computed simultaneously.
     integer, intent(in) :: nIndeps
     type(ply_legFpt_type), intent(inout) :: fpt
+    type(ply_legFpt_bu_type), intent(inout) :: bu
 
     !> Smallest block that is approximated by approx_terms coefficients.
     !!
@@ -158,7 +179,8 @@ subroutine ply_init_legFpt( maxPolyDegree, nIndeps, fpt, blocksize, &
       &                blocksize        = blocksize,           &
       &                approx_terms     = approx_terms,        &
       &                striplen         = maxstriplen,         &
-      &                subblockingWidth = subblockingWidth     )
+      &                subblockingWidth = subblockingWidth,    &
+      &                bu               = bu%legToCheb_bu      )
 
     ! Init the fast Chebyshev to Legendre transformation.
     call ply_fpt_init( n                = maxPolyDegree+1,     &
@@ -167,7 +189,8 @@ subroutine ply_init_legFpt( maxPolyDegree, nIndeps, fpt, blocksize, &
       &                blocksize        = blocksize,           &
       &                approx_terms     = approx_terms,        &
       &                striplen         = maxstriplen,         &
-      &                subblockingWidth = subblockingWidth     )
+      &                subblockingWidth = subblockingWidth,    &
+      &                bu               = bu%chebToLeg_bu      )
 
     ! Create the buffers for the intermediate arrays
     n = fpt%legToChebParams%n
@@ -215,10 +238,11 @@ end subroutine ply_init_legFpt
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
+  subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps, bu )
     ! -------------------------------------------------------------------- !
     real(kind=rk), intent(inout) :: legCoeffs(:)
     type(ply_legFpt_type), intent(inout) :: fpt
+     type(ply_legFpt_bu_type), intent(inout) :: bu
     real(kind=rk), intent(inout) :: pntVal(:)
     integer, intent(in) :: nIndeps
     ! -------------------------------------------------------------------- !
@@ -227,7 +251,7 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, bu)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_points) then
@@ -236,7 +260,8 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
-          &                  params = fpt%legToChebParams       )
+          &                  params = fpt%legToChebParams,      &
+          &                  bu     = bu%legToCheb_bu           )
 
         ! Normalize the coefficients of the Chebyshev polynomials due
         ! to the unnormalized version of DCT in the FFTW.
@@ -255,7 +280,8 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
-          &                  params = fpt%legToChebParams       )
+          &                  params = fpt%legToChebParams,      &
+          &                  bu     = bu%legToCheb_bu           )
 
         ! Normalize the coefficients of the Chebyshev polynomials due
         ! to the unnormalized version of DCT in the FFTW.
@@ -277,9 +303,10 @@ end subroutine ply_legToPnt
   ! ************************************************************************ !
   !> Subroutine to transform Legendre expansion to point values
   !! at Chebyshev nodes.
-  subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
+  subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps, bu )
     ! -------------------------------------------------------------------- !
     type(ply_legFpt_type), intent(inout) :: fpt
+     type(ply_legFpt_bu_type), intent(inout) :: bu
     real(kind=rk), intent(inout) :: pntVal(:)
     real(kind=rk), intent(inout) :: legCoeffs(:)
     integer, intent(in) :: nIndeps
@@ -290,7 +317,7 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, normFactor)
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, normFactor, bu)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_Points) then
@@ -309,7 +336,8 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
 
         call ply_fpt_single( gam    = legCoeffs(iDof:iDof+n-1), &
           &                  alph   = cheb,                     &
-          &                  params = fpt%ChebToLegParams       )
+          &                  params = fpt%ChebToLegParams,      &
+          &                  bu     = bu%chebToLeg_bu           )
       end do
       !$OMP END DO
 
@@ -329,7 +357,8 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
 
         call ply_fpt_single( gam    = legCoeffs(iDof:iDof+n-1), &
           &                  alph   = cheb,                     &
-          &                  params = fpt%ChebToLegParams       )
+          &                  params = fpt%ChebToLegParams,      &
+          &                  bu     = bu%chebToLeg_bu           )
       end do
       !$OMP END DO
 
diff --git a/source/fpt/ply_polyBaseExc_module.fpp b/source/fpt/ply_polyBaseExc_module.fpp
index 496a912..c967880 100644
--- a/source/fpt/ply_polyBaseExc_module.fpp
+++ b/source/fpt/ply_polyBaseExc_module.fpp
@@ -81,9 +81,15 @@ module ply_polyBaseExc_module
     type(ply_matrixExpCoeffOddEven_type), allocatable :: col(:)
   end type ply_subvector_type
 
-  type ply_trafo_params_type
+  type ply_bu_type
     !> Lagrange polynomials evaluated at the Chebyshev points on [0,+1].
-    type(ply_sub_vec), allocatable :: u(:,:)
+     type(ply_sub_vec), allocatable :: u(:,:)
+
+    !> Conversion data structure used for fpt.
+    type(ply_subvector_type), allocatable :: b(:)
+  end type ply_bu_type
+
+  type ply_trafo_params_type
 
     !> The array to store the diagonals of the matrix in.
     !!
@@ -138,12 +144,11 @@ module ply_polyBaseExc_module
     !> The transformation type
     integer :: trafo
 
-    !> Conversion data structure used for fpt.
-    type(ply_subvector_type), allocatable :: b(:)
   end type ply_trafo_params_type
 
   interface assignment(=)
     module procedure Copy_trafo_params
+    module procedure Copy_bu
   end interface
 
   integer, parameter :: ply_legToCheb_param = 1
@@ -153,7 +158,7 @@ module ply_polyBaseExc_module
   public :: ply_fpt_exec_striped
   public :: ply_fpt_exec
   public :: ply_fpt_single
-  public :: ply_trafo_params_type
+  public :: ply_trafo_params_type, ply_bu_type
   public :: ply_legToCheb_param, ply_chebToLeg_param
   public :: ply_lambda
   public :: assignment(=)
@@ -198,22 +203,38 @@ contains
     left%diag = right%diag
     allocate( left%adapter(right%s, (right%s+mod(right%s,2)/2), right%nBlocks) )
     left%adapter = right%adapter
-    if(allocated(left%u))then
-      deallocate(left%u)
-    end if
-    allocate(left%u(1:right%h,0:right%k-1))
-    left%u = right%u
+!    if(allocated(left%u))then
+!      deallocate(left%u)
+!    end if
+!    allocate(left%u(1:right%h,0:right%k-1))
+!    left%u = right%u
 
   end subroutine Copy_trafo_params
   ! ************************************************************************ !
 
 
+  ! ************************************************************************ !
+  subroutine Copy_bu( left, right )
+    ! -------------------------------------------------------------------- !
+    type(ply_bu_type), intent(out) :: left
+    type(ply_bu_type), intent(in)  :: right
+    ! --------------------------------------------------------------------- !
+   if(allocated(left%u))then
+     deallocate(left%u)
+    end if
+    left%u = right%u
+  end subroutine Copy_bu
+  ! ************************************************************************ !
+
+
+
   ! ************************************************************************ !
   subroutine ply_fpt_init( n, params, trafo, blocksize, approx_terms, &
-    &                      striplen, subblockingWidth                 )
+    &                      striplen, subblockingWidth, bu              )
     ! -------------------------------------------------------------------- !
     integer, intent(in) :: n
     type(ply_trafo_params_type), intent(inout) :: params
+    type(ply_bu_type), intent(inout) :: bu
     integer, intent(in) :: trafo
 
     !> Smallest block that is approximated by approx_terms coefficients.
@@ -571,8 +592,8 @@ contains
 
     end if
 
-    allocate(params%u(0:h,0:k-1))
-    params%u(0:h,0:k-1) = u(0:h,0:k-1)
+    allocate(bu%u(0:h,0:k-1))
+    bu%u(0:h,0:k-1) = u(0:h,0:k-1)
     allocate(params%sub(0:h))
     params%sub(0:h) = sub(0:h)
     params%n = n
@@ -581,12 +602,12 @@ contains
     params%h = h
 
     ! Allocate the coefficients array for conversion
-    allocate(params%b(0:params%h))
+    allocate(bu%b(0:params%h))
     do l=0, h
       nRows = (params%nBlocks - 1) / (2**l) - 1
-      allocate(params%b(l)%col(2:nRows+1))
+      allocate(bu%b(l)%col(2:nRows+1))
       do j=2,nRows+1
-        allocate(params%b(l)%col(j)%coeff(0:k-1,0:1))
+        allocate(bu%b(l)%col(j)%coeff(0:k-1,0:1))
       end do
     end do
 
@@ -912,7 +933,7 @@ contains
   !> Convert strip of coefficients of a modal representation in terms of
   !! Legendre polynomials to modal coefficients in terms of Chebyshev
   !! polynomials.
-  subroutine ply_fpt_exec( alph, gam, params, nIndeps )
+  subroutine ply_fpt_exec( alph, gam, params, nIndeps, bu )
     ! -------------------------------------------------------------------- !
     !> Number of values that can be computed independently.
     integer, intent(in) :: nIndeps
@@ -930,6 +951,7 @@ contains
 
     !> The parameters of the fast polynomial transformation.
     type(ply_trafo_params_type), intent(inout) :: params
+    type(ply_bu_type), intent(inout) :: bu
     ! -------------------------------------------------------------------- !
     integer :: j, r, i, l, k, h, n, s, m
     integer :: iFun, indep
@@ -965,13 +987,13 @@ contains
         row_rem = mod(n-remainder, rowsize) + remainder + iFun
         blockColLoop: do j = 2, nRows+1, 1+mod(nRows,2)
           do r = 0, k-1
-            params%b(l)%col(j)%coeff(r,0) = 0.0_rk
-            params%b(l)%col(j)%coeff(r,1) = 0.0_rk
+            bu%b(l)%col(j)%coeff(r,0) = 0.0_rk
+            bu%b(l)%col(j)%coeff(r,1) = 0.0_rk
             do m = 0, rowsize-1
               odd = mod(row_rem + m + (j-1)*rowsize,2)
-              params%b(l)%col(j)%coeff(r,odd) &
-                &  = params%b(l)%col(j)%coeff(r,odd) &
-                &    + params%u(l,r)%dat(m) &
+              bu%b(l)%col(j)%coeff(r,odd) &
+                &  = bu%b(l)%col(j)%coeff(r,odd) &
+                &    + bu%u(l,r)%dat(m) &
                 &      * alph(row_rem + m + (j-1)*rowsize + 1)
             end do
           end do
@@ -988,7 +1010,7 @@ contains
                 gam(iVal) = gam(iVal)      &
                   &       + params%sub(l)%subRow(i)%subCol(j)%rowDat(m)&
                   &               %coeff(r) &
-                  &         * params%b(l)%col(j)%coeff(r,odd)
+                  &         * bu%b(l)%col(j)%coeff(r,odd)
               end do ! r
             end do ! m
           end do ! j
@@ -1048,10 +1070,11 @@ contains
   !> Convert strip of coefficients of a modal representation in terms of
   !! Legendre polynomials to modal coefficients in terms of Chebyshev
   !! polynomials.
-  subroutine ply_fpt_single( alph, gam, params)
+  subroutine ply_fpt_single( alph, gam, params, bu )
     ! -------------------------------------------------------------------- !
     !> The parameters of the fast polynomial transformation.
     type(ply_trafo_params_type), intent(inout) :: params
+    type(ply_bu_type), intent(inout) :: bu
 
     !> Modal coefficients of the Legendre expansion.
     !! Size has to be: params%n
@@ -1093,13 +1116,13 @@ contains
       row_rem = mod(n-remainder, rowsize) + remainder
       blockColLoop: do j = 2, nRows+1, 1+mod(nRows,2)
         do r = 0, k-1
-          params%b(l)%col(j)%coeff(r,0) = 0.0_rk
-          params%b(l)%col(j)%coeff(r,1) = 0.0_rk
+          bu%b(l)%col(j)%coeff(r,0) = 0.0_rk
+          bu%b(l)%col(j)%coeff(r,1) = 0.0_rk
           do m = 0, rowsize-1
             odd = mod(row_rem + m + (j-1)*rowsize,2)
-            params%b(l)%col(j)%coeff(r,odd) &
-              &  = params%b(l)%col(j)%coeff(r,odd) &
-              &    + params%u(l,r)%dat(m) &
+            bu%b(l)%col(j)%coeff(r,odd) &
+              &  = bu%b(l)%col(j)%coeff(r,odd) &
+              &    + bu%u(l,r)%dat(m) &
               &      * alph(row_rem + m + (j-1)*rowsize + 1)
           end do
         end do
@@ -1116,7 +1139,7 @@ contains
               gam(iVal) = gam(iVal)      &
                 &       + params%sub(l)%subRow(i)%subCol(j)%rowDat(m)&
                 &               %coeff(r) &
-                &         * params%b(l)%col(j)%coeff(r,odd)
+                &         * bu%b(l)%col(j)%coeff(r,odd)
             end do ! r
           end do ! m
         end do ! j
@@ -1173,13 +1196,14 @@ contains
   ! ************************************************************************ !
   !> Convert coefficients of a modal representation in terms of Legendre
   !! polynomials to modal coefficients in terms of Chebyshev polynomials.
-  subroutine ply_fpt_exec_striped( nIndeps, alph, gam, params )
+  subroutine ply_fpt_exec_striped( nIndeps, alph, gam, params, bu )
     ! -------------------------------------------------------------------- !
     !> Number of values that can be computed independently.
     integer, intent(in) :: nIndeps
 
     !> The parameters of the fast polynomial transformation.
     type(ply_trafo_params_type), intent(inout) :: params
+    type(ply_bu_type), intent(inout) :: bu
 
     !> Modal coefficients of the Legendre expansion.
     !! Size has to be: (1:params%n*indeps,nVars)
@@ -1236,13 +1260,13 @@ contains
 
           blockColLoop: do j = 2, nRows+1, 1+mod(nRows,2)
             do r = 0, k-1
-              params%b(l)%col(j)%coeff(r,0) = 0.0_rk
-              params%b(l)%col(j)%coeff(r,1) = 0.0_rk
+              bu%b(l)%col(j)%coeff(r,0) = 0.0_rk
+              bu%b(l)%col(j)%coeff(r,1) = 0.0_rk
               do m = 0, rowsize-1
                 odd = mod(row_rem + m + (j-1)*rowsize,2)
-                params%b(l)%col(j)%coeff(r,odd) &
-                  &  = params%b(l)%col(j)%coeff(r,odd) &
-                  &    + params%u(l,r)%dat(m) &
+                bu%b(l)%col(j)%coeff(r,odd) &
+                  &  = bu%b(l)%col(j)%coeff(r,odd) &
+                  &    + bu%u(l,r)%dat(m) &
                   &      * alph(row_rem + m + (j-1)*rowsize + 1)
               end do
             end do
@@ -1259,7 +1283,7 @@ contains
                   gam(iVal) = gam(iVal)      &
                     &       + params%sub(l)%subRow(i)%subCol(j)%rowDat(m)&
                     &               %coeff(r) &
-                    &         * params%b(l)%col(j)%coeff(r,odd)
+                    &         * bu%b(l)%col(j)%coeff(r,odd)
                 end do ! r
               end do ! m
             end do ! j
diff --git a/source/ply_LegPolyProjection_module.f90 b/source/ply_LegPolyProjection_module.f90
index a36db61..e293d5b 100644
--- a/source/ply_LegPolyProjection_module.f90
+++ b/source/ply_LegPolyProjection_module.f90
@@ -140,8 +140,6 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     integer :: nChildDofs, oneDof
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, nDofs, nComponents, nChilds, nChildDofs, workDat)
-
     if (subsamp%projectionType.ne.ply_QLegendrePoly_prp) then
       call tem_abort( 'ERROR in ply_QPolyProjection: subsampling is ' &
         & // 'only implemented for Q-Legendre-Polynomials'            )
@@ -151,7 +149,6 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     allocate(newVarDofs(nVars))
     allocate(newMeshData(nVars))
 
-    !$OMP DO
     varLoop: do iVar=1,nVars
       nDofs = varDofs(iVar)
       nComponents = varcomps(iVar)
@@ -225,9 +222,7 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
       deallocate(projection_oneDof%projCoeff)
 
     end do varLoop
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_QPolyProjection
   ! ************************************************************************ !
@@ -270,8 +265,6 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
     real(kind=rk) :: dimexp
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iChild, iParentDof, iChildDof, xShift, yShift, zShift)
-
     select case(dofType)
     case(ply_QLegendrePoly_prp)
       allocate(projection%projCoeff(nDofs, nChildDofs, nChilds))
@@ -284,7 +277,6 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
       projCoeffOneDim = ply_QLegOneDimCoeff( nint(nDofs**dimexp),     &
         &                                    nint(nChildDofs**dimexp) )
 
-      !$OMP DO
       ! Loop over the children of this element
       childLoop: do iChild = 1, nChilds
 
@@ -337,14 +329,14 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
           end do childDofLoop
         end do parentDofLoop
       end do childLoop
-      !$OMP END DO
+
 
     case default
       call tem_abort( 'ERROR in ply_initProjCoeff: initialization of '      &
         & // 'projection coefficients for subsampling is implemented only ' &
         & // 'for Q-Legendre polynomials'                                   )
     end select
-    !$OMP END PARALLEL
+
     deallocate(projCoeffOneDim)
   end subroutine ply_initQLegProjCoeff
   ! ************************************************************************ !
@@ -590,8 +582,6 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iParentElem, iElem, lowElemIndex, upElemIndex, iChild, lowChildIndex, upChildIndex)
-
     nChilds = 2**ndims
     nElems = tree%nElems
     nElemsToRefine = count(new_refine_tree)
@@ -608,7 +598,6 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
 
     if (subsamp%sampling_lvl > 1) then
 
-      !$OMP DO
       elementLoop: do iParentElem=1,nParentElems
         ! Check if the parent cell was already refined...
         if (refine_tree(iParentElem)) then
@@ -700,11 +689,9 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
         end if
 
       end do elementLoop
-      !$OMP END DO
 
     else
 
-      !$OMP DO
       elemLoop: do iElem=1,nElems
         if (new_refine_tree(iElem)) then
           allocate(childData(nChildDofs*nChilds*nComponents))
@@ -755,10 +742,8 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
           deallocate(childData)
         end if
       end do elemLoop
-      !$OMP END DO
     end if
 
-    !$OMP END PARALLEL
 
   end subroutine ply_subsampleData
   ! ************************************************************************ !
@@ -797,11 +782,9 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
     integer :: childDof_pos, parentDof_pos
     real(kind=rk) :: projCoeff
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iChild, iParentDof, iChildDof, iComp, projCoeff, childDof_pos, parentDof_pos)
 
     childData(:) = 0.0_rk
 
-    !$OMP DO
     childLoop: do iChild = 1, nChilds
       parentDofLoop: do iParentDof = 1, nParentDofs
         childDofLoop: do iChildDof = 1, nChildDofs
@@ -821,9 +804,7 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
         end do childDofLoop
       end do parentDofLoop
     end do childLoop
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_projDataToChild
   ! ************************************************************************ !
diff --git a/source/ply_fxt_module.f90 b/source/ply_fxt_module.f90
index 616c31c..b01a393 100644
--- a/source/ply_fxt_module.f90
+++ b/source/ply_fxt_module.f90
@@ -211,12 +211,10 @@ subroutine ply_fxt_m2n_2D( fxt, modal_data, nodal_data, oversamp_degree )
     ! -------------------------------------------------------------------- !
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, iColumn)
 
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
 
-    !$OMP DO
     do iLine = 1, oversamp_degree+1
       lb = (iLine-1) * (oversamp_degree+1) + 1
       ub = lb + oversamp_degree
@@ -224,19 +222,15 @@ subroutine ply_fxt_m2n_2D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   modal_data = modal_data(lb:ub), &
         &                   nodal_data = nodal_data(lb:ub)  )
     end do
-    !$OMP END DO
 
-    !$OMP DO
     do iColumn = 1, oversamp_degree+1
       lb = iColumn
       call fxtf_flptld_m2n( flpt       = fxt%flpt,                             &
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1), &
         &                   nodal_data = modal_data(lb:msq:oversamp_degree+1)  )
     end do
-    !$OMP END DO
     nodal_data = modal_data
 
-    !$OMP END PARALLEL
 
   end subroutine ply_fxt_m2n_2D
   ! ************************************************************************ !
@@ -258,8 +252,6 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
     real(kind=rk), pointer :: tmp_in(:), tmp_out(:)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, tmp_in, tmp_out, iColumn)
-
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
     nTotalDofs =  (oversamp_degree+1)**3
@@ -268,7 +260,6 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
     tmp_in = -42
     tmp_out = -42
 
-    !$OMP DO
     ! The loop for msq stripes for independent x Dir evaluations
     do iLine = 1, msq
       lb = (iLine-1) * (oversamp_degree+1) + 1
@@ -279,9 +270,8 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   nodal_data = tmp_out   )
       nodal_data(lb:ub) = tmp_out
     end do
-    !$OMP END DO
 
-    !$OMP DO
+
     ! The loop for msq stripes for independent y Dir evaluations
     do iColumn = 1, msq
       lb = int( (iColumn-1 ) / nModesPerDim ) * msq &
@@ -292,20 +282,17 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   modal_data = nodal_data(lb:ub:nModesPerDim), &
         &                   nodal_data = modal_data(lb:ub:nModesPerDim)  )
     end do
-    !$OMP END DO
+
 
     ! The loop for msq stripes for independent z Dir evaluations
     ub = nTotalDofs
-    !$OMP DO
     do iColumn = 1, msq
       lb = iColumn
       call fxtf_flptld_m2n( flpt       = fxt%flpt,              &
         &                   modal_data = modal_data(lb:ub:msq), &
         &                   nodal_data = nodal_data(lb:ub:msq)  )
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_fxt_m2n_3D
   ! ************************************************************************ !
@@ -350,12 +337,9 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
     ! -------------------------------------------------------------------- !
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, iColumn)
-
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
 
-    !$OMP DO
     do iLine = 1, oversamp_degree+1
       lb = (iLine-1) * (oversamp_degree+1) + 1
       ub = lb + oversamp_degree
@@ -363,20 +347,16 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = nodal_data(lb:ub), &
         &                   modal_data = modal_data(lb:ub)  )
     end do
-    !$OMP END DO
 
 
-    !$OMP DO
     do iColumn = 1, oversamp_degree+1
       lb = iColumn
       call fxtf_flptld_n2m( flpt       = fxt%flpt,                             &
         &                   nodal_data = modal_data(lb:msq:oversamp_degree+1), &
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1)  )
     end do
-    !$OMP END DO
     modal_data = nodal_data
 
-    !$OMP END PARALLEL
 
   end subroutine ply_fxt_n2m_2D
   ! ************************************************************************ !
@@ -396,13 +376,10 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq, ntotalDofs
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, iColumn, ub, lb)
-
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
     nTotalDofs =  (oversamp_degree+1)**3
 
-    !$OMP DO
     ! The loop for msq stripes for independent x Dir evaluations
     do iLine = 1, msq
       lb = (iLine-1) * (oversamp_degree+1) + 1
@@ -411,9 +388,8 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = nodal_data(lb:ub), &
         &                   modal_data = modal_data(lb:ub)  )
     end do
-    !$OMP END DO
 
-    !$OMP DO
+
     ! The loop for msq stripes for independent y Dir evaluations
     do iColumn = 1, msq
       lb = int( (iColumn-1) / nModesPerDim ) * msq &
@@ -424,20 +400,16 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = modal_data(lb:ub:nModesPerDim), &
         &                   modal_data = nodal_data(lb:ub:nModesPerDim)  )
     end do
-    !$OMP END DO
+
 
     ! The loop for msq stripes for independent z Dir evaluations
     ub = nTotalDofs
-    !$OMP DO
     do iColumn = 1, msq
       lb = iColumn
       call fxtf_flptld_n2m( flpt       = fxt%flpt,              &
         &                   nodal_data = nodal_data(lb:ub:msq), &
         &                   modal_data = modal_data(lb:ub:msq)  )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_fxt_n2m_3D
   ! ************************************************************************ !
diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index 8e1ce65..f9547ef 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -241,7 +241,7 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iRow, iCell, iCol)
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iRow, iCell, iCol, strip_ub, mval)
     if (nDofs > 1) then
 
       !$OMP DO
diff --git a/source/ply_leg_diff_module.fpp b/source/ply_leg_diff_module.fpp
index 4034bc0..5fa6e07 100644
--- a/source/ply_leg_diff_module.fpp
+++ b/source/ply_leg_diff_module.fpp
@@ -45,8 +45,7 @@ contains
     integer :: dofPos, dofPosPrev, dofPos2Prev
     integer :: leg(3), iDeg, iDeg1, iDeg2, iDeg3, DV(3)
     ! -------------------------------------------------------------------- !
-
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, dofPosPrev, iDeg2, iDeg3, iVar,leg, dofPos, iDeg)
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, iDeg2, iDeg3, iDeg, iVar, leg, dofPos)
 
     if (present(dirVec)) then
       DV = dirvec
@@ -121,8 +120,8 @@ contains
     end do
     !$OMP END DO
 
-    !$OMP DO
     ! Scale the results due to the Jacobians of the mappings
+    !$OMP DO
     do dofpos=1,(mpd+1)**3
       ideg3 = (dofpos-1)/(mpd+1)**2 + 1
       iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -135,6 +134,7 @@ contains
     end do
     !$OMP END DO
 
+
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    ! Uncollapsed version of the scaling !
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -152,7 +152,6 @@ contains
 !!       end do
 !!     end do
 !!   end do
-
     !$OMP END PARALLEL
 
   end subroutine calcDiff_leg_normal
@@ -185,7 +184,6 @@ contains
     integer :: dofPos, dofPosPrev, dofPos2Prev
     integer :: leg(2), iDeg1, iDeg2, DV(2)
     ! -------------------------------------------------------------------- !
-
     !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, iDeg2, leg, dofPos, dofPosPrev, iVar)
 
     if (present(dirVec)) then
@@ -257,8 +255,9 @@ contains
     end do
     !$OMP END DO
 
-    !$OMP DO
+
     ! Scale the results due to the Jacobians of the mappings
+    !$OMP DO
     do iDeg1 = 1, mPd+1
       do iDeg2 = 1, mPd+1
         leg = (/iDeg1, iDeg2/)
@@ -270,9 +269,9 @@ contains
       end do
     end do
     !$OMP END DO
-
     !$OMP END PARALLEL
 
+
   end subroutine calcDiff_leg_2d_normal
   ! ************************************************************************ !
 
@@ -376,7 +375,6 @@ contains
     integer :: iDegX
     integer :: dofPos, dofPosPrev, dofPos2Prev
     ! -------------------------------------------------------------------- !
-
     !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(dofPos, iDegX, dofPosPrev, dofPos2Prev)
 
     ! Build the derivative in x direction
@@ -405,9 +403,9 @@ contains
         &                         * (2.0_rk*iDegX - 1.0_rk)
     end do
     !$OMP END DO
-
     !$OMP END PARALLEL
 
+
   end subroutine calcDiff_leg_1d
   ! ************************************************************************ !
 
diff --git a/source/ply_legser_module.f90 b/source/ply_legser_module.f90
index dc3ad15..b75fd3c 100644
--- a/source/ply_legser_module.f90
+++ b/source/ply_legser_module.f90
@@ -41,23 +41,19 @@ subroutine legser(A, B, n)
     real(kind=rk) :: ak, al, bb, c, d
     integer :: k, l, ll
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, d, c, l, ll, al, bb, ak)
 
     ak = 0.0_rk
 
     ! Calculation of the first Legendre coefficient
     b(1) = 0.5_rk * a(1)
-    !$OMP DO
     do k=3,n,2
       ak = ak + 2.0_rk
       b(1) = b(1) - a(k)/(ak*ak - 1.0_rk)
     end do
-    !$OMP END DO
     c = 2.0_rk / 3.0_rk
     al = 0.0_rk
 
     ! Start main loop (remaining Legendre coefficients)
-    !$OMP DO
     do l=2,n
       ! Calculation of the Lth coefficient
       ll = l+2
@@ -77,9 +73,7 @@ subroutine legser(A, B, n)
         &   / ( (al+al+3.0_rk)*(al+al+2.0_rk) )
       b(l) = (al+0.5_rk)*bb
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine legser
   ! ************************************************************************ !
diff --git a/source/ply_modg_basis_module.fpp b/source/ply_modg_basis_module.fpp
index acd25c3..c6be9ef 100644
--- a/source/ply_modg_basis_module.fpp
+++ b/source/ply_modg_basis_module.fpp
@@ -159,8 +159,6 @@ contains
     integer :: iFunc, jFunc
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(jFunc, iFunc)
-
     allocate(GaussPoints(nPoints))
     allocate(GaussPoints_left(nPoints))
     allocate(GaussPoints_right(nPoints))
@@ -200,7 +198,6 @@ contains
 
     allocate( integral%anz_anzShift(1:nFunc, 1:nFunc, 2))
 
-    !$OMP DO
     !loop over anzatz functions
     do jFunc = 1, nFunc
       do iFunc = 1, nFunc
@@ -222,9 +219,7 @@ contains
 
       end do
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine init_modg_covolumeCoeffs
   ! ************************************************************************ !
@@ -424,8 +419,6 @@ contains
     real(kind=rk) :: n_q
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iAns, iAnsX, iAnsY, iAnsZ, n_q, ansPos)
-
     ! allocate the output array
     select case(basisType)
       case(Q_space)
@@ -451,7 +444,7 @@ contains
       polyValX(2,:) = coords(:,1)
       polyValY(2,:) = coords(:,2)
       polyValZ(2,:) = coords(:,3)
-      !$OMP DO
+
       ! ... higher order polynomials are build recursively
       do iAns = 3, maxPolyDegree+1
         n_q = 1.0_rk / real(iAns-1,kind=rk)
@@ -477,13 +470,11 @@ contains
           &                   * polyValZ(iAns-2,:) )  &
           &                 *n_q
       end do
-      !$OMP END DO
     end if
 
     ! Now, build the complete point value.
     select case(basisType)
       case(Q_space)
-        !$OMP DO
         do iAnsX = 1, maxPolyDegree+1
           do iAnsY = 1, maxPolyDegree+1
             do iAnsZ = 1, maxPolyDegree+1
@@ -495,7 +486,7 @@ contains
             end do
           end do
         end do
-        !$OMP END DO
+
       case(P_space)
         iAnsX = 1
         iAnsY = 1
@@ -508,10 +499,9 @@ contains
             &                    * polyValZ(iAnsZ,:)
 ?? copy :: nextModgCoeffPTens(iAnsX, iAnsY, iAnsZ)
         end do
-        !$OMP END DO
+
     end select
 
-    !$OMP END PARALLEL
 
   end subroutine evalLegendreTensPoly
   ! ************************************************************************ !
diff --git a/source/ply_oversample_module.fpp b/source/ply_oversample_module.fpp
index aa8780b..ea26745 100644
--- a/source/ply_oversample_module.fpp
+++ b/source/ply_oversample_module.fpp
@@ -155,7 +155,9 @@ contains
     nScalars = size(modalCoeffs,2)
     maxorders = 3*(poly_proj%min_degree + 1)-2
 
+    !$OMP WORKSHARE
     modalCoeffs = 0.0_rk
+    !$OMP END WORKSHARE
 
     if (poly_proj%basisType == Q_Space) then
       posQ: if (present(ensure_positivity)) then
@@ -163,6 +165,7 @@ contains
         varQ: do iVar=1,nScalars
           if (ensure_positivity(iVar)) then
             ordersum = 0.0_rk
+            !$OMP SINGLE
             do dof = 1, mpd1_cube
               iDegZ = (dof-1)/mpd1_square + 1
               iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -178,9 +181,11 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varQ
         do iVar=1,nScalars
+          !$OMP DO
           do dof = 1, mpd1_cube
             iDegZ = (dof-1)/mpd1_square + 1
             iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -193,8 +198,10 @@ contains
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end if
           end do
+          !$OMP END DO
         end do
       else posQ
+        !$OMP DO
         do dof = 1, mpd1_cube
           iDegZ = (dof-1)/mpd1_square + 1
           iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -206,6 +213,7 @@ contains
             modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
           end do
         end do
+        !$OMP END DO
       end if posQ
 
     else !P_Space
@@ -218,6 +226,7 @@ contains
             iDegY = 1
             iDegZ = 1
             ordersum = 0.0_rk
+            !$OMP SINGLE
             do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
               iOrd = iDegX+iDegY+iDegZ-2
@@ -232,12 +241,14 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varP
         do iVar=1,nScalars
           iDegX = 1
           iDegY = 1
           iDegZ = 1
+          !$OMP SINGLE
           do idof = 1, poly_proj%body_3d%min_dofs
             iOrd = iDegX+iDegY+iDegZ-2
             if (iOrd > ord_lim) EXIT
@@ -248,11 +259,13 @@ contains
             modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
           end do
+          !$OMP END SINGLE
         end do
       else posP
         iDegX = 1
         iDegY = 1
         iDegZ = 1
+        !$OMP SINGLE
         do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
           dofOverSamp = iDegX + ( iDegY-1  &
@@ -261,6 +274,7 @@ contains
           modalCoeffs(dofOverSamp,:) = state(dof,:)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
         end do
+        !$OMP END SINGLE
       end if posP
 
     end if
@@ -302,6 +316,7 @@ contains
     nScalars = size(modalCoeffs,2)
 
     if (poly_proj%basisType == Q_Space) then
+      !$OMP DO
       do iVar=1,nScalars
         do dof = 1, mpd1_cube
           iDegZ = (dof-1)/mpd1_square + 1
@@ -313,12 +328,14 @@ contains
           state(dof,iVar) = modalCoeffs(dofOverSamp,iVar)
         end do
       end do
+      !$OMP END DO
 
     else !P_Space
 
       iDegX = 1
       iDegY = 1
       iDegZ = 1
+      !$OMP SINGLE
       do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
         dofOverSamp = iDegX + ( iDegY-1  &
@@ -327,6 +344,7 @@ contains
         state(dof,:) = modalCoeffs(dofOverSamp,:)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
       end do
+      !$OMP END SINGLE
     end if
 
   end subroutine ply_convertFromoversample_3d
@@ -382,7 +400,9 @@ contains
     else
       nPVars = size(state,2)
     end if
+    !$OMP WORKSHARE
     maxorders = 2*(poly_proj%min_degree + 1)-1
+    !$OMP END WORKSHARE
 
     ! Information for the oversampling loop
     oversamp_degree = poly_proj%oversamp_degree
@@ -398,6 +418,7 @@ contains
         varQ: do iVar=1,nPVars
           if (ensure_positivity(iVar)) then
             ordersum = 0.0_rk
+            !$OMP SINGLE
             do dof = 1, mpd1_square
               iDegX = mod(dof-1,mpd1)+1
               iDegY = (dof-1)/mpd1+1
@@ -412,9 +433,11 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varQ
         do iVar=1,nPVars
+          !$OMP DO
           do dof = 1, mpd1_square
             iDegX = mod(dof-1,mpd1)+1
             iDegY = (dof-1)/mpd1+1
@@ -424,14 +447,17 @@ contains
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end if
           end do
+          !$OMP END DO
         end do
       else posQ
+        !$OMP DO
         do dof = 1, mpd1_square
           iDegX = mod(dof-1,mpd1)+1
           iDegY = (dof-1)/mpd1+1
           dofOverSamp = 1 + (iDegX-1) + (iDegY-1)*(oversamp_degree+1)
           modalCoeffs(dofOverSamp,1:nPVars) = state(dof,1:nPVars)
         end do
+        !$OMP END DO
       end if posQ
 
     else !P_Space
@@ -440,6 +466,7 @@ contains
         ord_lim = maxorders
         varP: do iVar=1,nPVars
           if (ensure_positivity(iVar)) then
+            !$OMP SINGLE
             iDegX = 1
             iDegY = 1
             ordersum = 0.0_rk
@@ -457,9 +484,11 @@ contains
                  EXIT
               end if
             end do
+            !$OMP END SINGLE
           end if
         end do varP
         do iVar=1,nPVars
+          !$OMP SINGLE
           iDegX = 1
           iDegY = 1
           do idof = 1, poly_proj%body_2d%min_dofs
@@ -470,8 +499,10 @@ contains
             modalCoeffs(dofOverSamp,1:nPVars) = state(dof,1:nPVars)
 ?? copy :: nextModgCoeffPTens2D(iDegX, iDegY)
           end do
+          !$OMP END SINGLE
         end do
       else posP
+        !$OMP SINGLE
         iDegX = 1
         iDegY = 1
         do idof = 1, poly_proj%body_2d%min_dofs
@@ -480,6 +511,7 @@ contains
           modalCoeffs(dofOverSamp,1:nPVars) = state(dof,1:nPVars)
 ?? copy :: nextModgCoeffPTens2D(iDegX, iDegY)
         end do
+        !$OMP END SINGLE
       end if posP
 
     end if
@@ -530,15 +562,18 @@ contains
 
     if (poly_proj%basisType == Q_Space) then
 
+      !$OMP DO
       do dof = 1, mpd1_square
         iDegX = mod(dof-1,mpd1)+1
         iDegY = (dof-1)/mpd1+1
         dofOverSamp = 1 + (iDegX-1) + (iDegY-1)*(oversamp_degree+1)
         state(dof,1:nPVars) = modalCoeffs(dofOverSamp,1:nPVars)
       end do
+      !$OMP END DO
 
     else !P_Space
 
+      !$OMP SINGLE
       iDegX = 1
       iDegY = 1
       iDegZ = 0 ! not used in posOfModgCoeffPTens_2D, nextModgCoeffPTens
@@ -548,6 +583,7 @@ contains
         state(dof,1:nPVars) = modalCoeffs(dofOverSamp,1:nPVars)
 ?? copy :: nextModgCoeffPTens2D(iDegX, iDegY)
       end do
+      !$OMP END SINGLE
 
     end if
 
@@ -597,13 +633,16 @@ contains
     nPVars = (poly_proj%maxPolyDegree+1)*nVars
 
     ! Initialize oversampled space correct to 0
+    !$OMP WORKSHARE
     ModalCoeffs(:,:) = 0.0_rk
+    !$OMP END WORKSHARE
 
     if (present(ensure_positivity)) then
       ord_lim = poly_proj%min_degree+1
       do iVar = 1,nVars
         if (ensure_positivity(iVar)) then
           varSum = 0.0_rk
+          !$OMP SINGLE
           do iPoint=2,ord_lim
             varSum = varSum + abs(state(iPoint,iVar))
             if (varSum >= state(1,iVar)) then
@@ -611,19 +650,24 @@ contains
               EXIT
             end if
           end do
+          !$OMP END SINGLE
         end if
       end do
       do iVar=1,nVars
+        !$OMP DO
         do iPoint=1,ord_lim
           ModalCoeffs(iPoint,iVar) = state(iPoint,iVar)
         end do
+        !$OMP END DO
       end do
     else
+      !$OMP DO
       do iVP = 1,nPVars
         iVar = (iVP-1)/(poly_proj%min_degree+1) + 1
         iPoint = iVP - (iVar-1)*(poly_proj%min_degree+1)
         ModalCoeffs(iPoint,iVar) = state(iPoint,iVar)
       end do
+      !$OMP END DO
     end if
 
   end subroutine ply_convert2oversample_1d
@@ -659,11 +703,13 @@ contains
       nPVars = (poly_proj%maxPolyDegree+1)*size(state,2)
     end if
 
+    !$OMP DO
     do iVP = 1,nPVars
       iVar = (iVP-1)/(poly_proj%min_degree+1) + 1
       iPoint = iVP - (iVar-1)*(poly_proj%min_degree+1)
       state(iPoint,iVar) = modalCoeffs(iPoint,iVar)
     end do
+    !$OMP END DO
 
   end subroutine ply_convertFromoversample_1d
   ! ************************************************************************ !
diff --git a/source/ply_poly_project_module.fpp b/source/ply_poly_project_module.fpp
index 65658b4..fde5519 100644
--- a/source/ply_poly_project_module.fpp
+++ b/source/ply_poly_project_module.fpp
@@ -17,6 +17,7 @@ module ply_poly_project_module
     &                                    ply_prj_init_type
 
   use ply_LegFpt_module,           only: ply_legFpt_type, &
+    &                                    ply_legFpt_bu_type, &
     &                                    ply_init_legFpt, &
     &                                    ply_legToPnt,    &
     &                                    ply_PntToLeg,    &
@@ -58,6 +59,7 @@ module ply_poly_project_module
     !! of nonlinear equations. It is used if fpt is choses as projection
     !! method in the lua file
     type(ply_legFpt_type)  :: fpt
+    type(ply_legFpt_bu_type) :: bu
     !> The Legendre Polynomial type for the Fast Orthogonal Function
     !! Transform via fxtpack. It is used if 'fxt' is chosen as projection
     !! method in the lua file
@@ -195,6 +197,7 @@ contains
     !------------------------------------------------------------------------!
 
     left%fpt = right%fpt
+    left%bu = right%bu
     left%l2p = right%l2p
     left%fxt = right%fxt
     left%nodes = right%nodes
@@ -376,7 +379,8 @@ contains
         & blocksize        = proj_init%header%fpt_header%blocksize,       &
         & approx_terms     = proj_init%header%fpt_header%approx_terms,    &
         & striplen         = proj_init%header%fpt_header%striplen,        &
-        & subblockingWidth = proj_init%header%fpt_header%subblockingWidth )
+        & subblockingWidth = proj_init%header%fpt_header%subblockingWidth,&
+        & bu               = me%body_1d%bu                                )
       !> Initialization/Create  of the volume quadrature  nodes and the
       !! quadrature points on the face
       call init_cheb_nodes_1d(                                          &
@@ -394,7 +398,8 @@ contains
           & blocksize        = proj_init%header%fpt_header%blocksize,       &
           & approx_terms     = proj_init%header%fpt_header%approx_terms,    &
           & striplen         = proj_init%header%fpt_header%striplen,        &
-          & subblockingWidth = proj_init%header%fpt_header%subblockingWidth )
+          & subblockingWidth = proj_init%header%fpt_header%subblockingWidth,&
+          & bu               = me%body_2d%bu                                )
         call init_cheb_nodes_2d(                                          &
           & me                = proj_init%header%fpt_header%nodes_header, &
           & nodes             = me%body_2d%nodes,                         &
@@ -411,7 +416,8 @@ contains
           & blocksize        = proj_init%header%fpt_header%blocksize,       &
           & approx_terms     = proj_init%header%fpt_header%approx_terms,    &
           & striplen         = proj_init%header%fpt_header%striplen,        &
-          & subblockingWidth = proj_init%header%fpt_header%subblockingWidth )
+          & subblockingWidth = proj_init%header%fpt_header%subblockingWidth,&
+          & bu               = me%body_3d%bu                                )
         call init_cheb_nodes(                                             &
           & me                = proj_init%header%fpt_header%nodes_header, &
           & nodes             = me%body_3d%nodes,                         &
@@ -500,8 +506,6 @@ contains
     !--------------------------------------------------------------------------!
     integer :: iVar
     !--------------------------------------------------------------------------!
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar)
-
 
     select case(trim(me%kind))
     case ('l2p')
@@ -511,29 +515,26 @@ contains
       ! additional summation
       select case(dim)
       case (1)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_1D( trafo = me%body_1D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
-        !$OMP END DO
+
       case (2)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_2D( trafo = me%body_2D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
-        !$OMP END DO
+
       case (3)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_3D( trafo = me%body_3D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
-        !$OMP END DO
+
       end select
 
     case ('fpt')
@@ -543,56 +544,50 @@ contains
         call ply_LegToPnt_3D( fpt       = me%body_3d%fpt, &
           &                   pntVal    = nodal_data,     &
           &                   legCoeffs = modal_data,     &
-          &                   nVars     = nVars           )
+          &                   nVars     = nVars,          &
+          &                   bu        = me%body_3d%bu   )
       case (2)
         call ply_LegToPnt_2D( fpt       = me%body_2d%fpt, &
           &                   pntVal    = nodal_data,     &
           &                   legCoeffs = modal_data,     &
-          &                   nVars     = nVars           )
+          &                   nVars     = nVars,          &
+          &                   bu        = me%body_2d%bu   )
       case (1)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_LegToPnt( fpt       = me%body_1d%fpt,     &
             &                pntVal    = nodal_data(:,iVar), &
             &                legCoeffs = modal_data(:,iVar), &
-            &                nIndeps   = 1                   )
+            &                nIndeps   = 1,                  &
+            &                bu        = me%body_1d%bu       )
         end do
-        !$OMP END DO
       end select
 
     case ('fxt')
       select case (dim)
       case (3)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_3D( fxt = me%body_3d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
             &              oversamp_degree = me%oversamp_degree  )
         end do
-        !$OMP END DO
       case (2)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_2D( fxt = me%body_2d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
             &              oversamp_degree = me%oversamp_degree  )
         end do
-        !$OMP END DO
 
       case (1)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_1D( fxt = me%body_1d%fxt,         &
             &               modal_data = modal_data(:,iVar), &
             &               nodal_data = nodal_data(:,iVar)  )
         end do
-        !$OMP END DO
       end select
     end select
 
-    !$OMP END PARALLEL
 
   end subroutine ply_poly_project_m2n_multivar
   !****************************************************************************!
@@ -612,36 +607,28 @@ contains
     !--------------------------------------------------------------------------!
     integer :: iVar
     !--------------------------------------------------------------------------!
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar)
-
 
     select case(trim(me%kind))
     case ('l2p')
       select case (dim)
       case (1)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_1D( trafo = me%body_1D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
-        !$OMP END DO
       case (2)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_2D( trafo = me%body_2D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
-        !$OMP END DO
       case (3)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_3D( trafo = me%body_3D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
-        !$OMP END DO
       end select
 
      case ('fpt')
@@ -651,27 +638,27 @@ contains
         call ply_pntToLeg_3D( fpt       = me%body_3d%fpt, &
           &                   nVars     = nVars,          &
           &                   pntVal    = nodal_data,     &
-          &                   legCoeffs = modal_data      )
+          &                   legCoeffs = modal_data,     &
+          &                   bu        = me%body_3d%bu   )
       case (2)
         call ply_pntToLeg_2D( fpt       = me%body_2d%fpt, &
           &                   nVars     = nVars,          &
           &                   pntVal    = nodal_data,     &
-          &                   legCoeffs = modal_data      )
+          &                   legCoeffs = modal_data,     &
+          &                   bu        = me%body_2d%bu   )
       case (1)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_pntToLeg( fpt       = me%body_1d%fpt,     &
             &                nIndeps   = 1,                  &
             &                pntVal    = nodal_data(:,iVar), &
-            &                legCoeffs = modal_data(:,iVar)  )
+            &                legCoeffs = modal_data(:,iVar), &
+            &                bu        = me%body_1d%bu       )
         end do
-        !$OMP END DO
       end select
 
     case ('fxt')
       select case (dim)
       case (3)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_fxt_n2m_3D(                                  &
             &         fxt              = me%body_3d%fxt,        &
@@ -679,10 +666,8 @@ contains
             &         modal_data       = modal_data(:,iVar),    &
             &         oversamp_degree  = me%oversamp_degree     )
         end do
-        !$OMP END DO
 
       case (2)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_fxt_n2m_2D(                                  &
             &         fxt              = me%body_2d%fxt,        &
@@ -690,24 +675,20 @@ contains
             &         modal_data       = modal_data(:,iVar),    &
             &         oversamp_degree  = me%oversamp_degree     )
         end do
-        !$OMP END DO
 
       case (1)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_fxt_n2m_1D(                               &
             &         fxt              = me%body_1d%fxt,     &
             &         nodal_data       = nodal_data(:,iVar), &
             &         modal_data       = modal_data(:,iVar)  )
         end do
-        !$OMP END DO
       end select
 
     case default
        write(logUnit(1),*) 'ERROR in projection nodal to modal'
     end select
 
-    !$OMP END PARALLEL
 
   end subroutine ply_poly_project_n2m_multivar
   !***************************************************************************!
diff --git a/source/ply_poly_transformation_module.f90 b/source/ply_poly_transformation_module.f90
index a6e7a1e..f457789 100644
--- a/source/ply_poly_transformation_module.f90
+++ b/source/ply_poly_transformation_module.f90
@@ -101,13 +101,11 @@ subroutine ply_Poly_Transformation( subsamp, dofReduction, mesh, meshData,   &
     integer :: nVars, nDofs, nComponents, nChildDofs
     integer :: iVar
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, nComponents, nDofs, workData, nChildDofs)
 
     nVars = size(varDofs)
     allocate(newVarDofs(nVars))
     allocate(newMeshData(nVars))
 
-    !$OMP DO
     varLoop: do iVar = 1, nVars
       nComponents = varComps(iVar)
       nDofs = vardofs(iVar)
@@ -147,9 +145,6 @@ subroutine ply_Poly_Transformation( subsamp, dofReduction, mesh, meshData,   &
       deallocate(workData)
 
     end do varLoop
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
 
   end subroutine ply_Poly_Transformation
@@ -199,8 +194,6 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
     real(kind=rk), allocatable :: transform_matrix(:,:)
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iParentElem, iChild, iElem)
-
 
     nChilds = 2**nDims
 
@@ -227,7 +220,6 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
 
     if (subsamp%sampling_lvl > 1) then
 
-      !$OMP DO
       elementLoop: do iParentElem=1,nParentElems
         ! Check if the parent cell was already refined...
         if (refine_tree(iParentElem)) then
@@ -314,9 +306,7 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
           deallocate(childData)
         end if
       end do elementLoop
-      !$OMP END DO
     else
-      !$OMP DO
       elemLoop: do iElem=1,nElems
         if (new_refine_tree(iElem)) then
           ! Create lower and upper indices for all data of iElem in meshData.
@@ -367,12 +357,10 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
           deallocate(childData)
         end if
       end do elemLoop
-      !$OMP END DO
     end if
 
     deallocate(transform_matrix)
 
-    !$OMP END PARALLEL
 
   end subroutine ply_subsampleData
   ! ************************************************************************ !
@@ -763,8 +751,6 @@ subroutine ply_transform_matrix(max_modes, v)
     integer :: m, orig
     real(kind=rk) :: shifting, scaling
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(orig, m)
-
 
     ! transformation matrix looks like this:
     ! [1.0  --  --     shift=0.5   ]
@@ -787,7 +773,6 @@ subroutine ply_transform_matrix(max_modes, v)
       v(2,2) = scaling
 
       if (max_modes > 2) then
-        !$OMP DO
         do orig = 3,max_modes
           v(1,orig) = ply_beta(orig-1) * v(1,orig-2)                    &
             &       + ply_alpha(orig-1) * shifting * v(1,orig-1)        &
@@ -805,14 +790,12 @@ subroutine ply_transform_matrix(max_modes, v)
             end if
           end do
         end do
-        !$OMP END DO
       end if
 
       ! Due to the symmetry of the problem (the left subinterval has just
       ! the shifting with a changed sign), we can fill the other half of
       ! the matrix by copying the already computed values accordingly with
       ! a change in sign, as needed (alternatingly).
-      !$OMP DO
       do m = 1 , max_modes
         do orig = 1, m-1
           if (mod((m+orig),2) /= 0) then
@@ -822,10 +805,8 @@ subroutine ply_transform_matrix(max_modes, v)
           end if
         end do
       end do
-      !$OMP END DO
     end if
 
-    !$OMP END PARALLEL
 
   end subroutine ply_transform_matrix
   ! ************************************************************************ !
diff --git a/source/ply_sampling_module.fpp b/source/ply_sampling_module.fpp
index d9651b3..f72f2b7 100644
--- a/source/ply_sampling_module.fpp
+++ b/source/ply_sampling_module.fpp
@@ -652,21 +652,17 @@ contains
     integer :: iElem
     integer :: nComps
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iElem)
 
     nComps = fun%nComponents
     datlen = tree%nElems * nComps
 
     call c_f_pointer(fun%method_data, p)
 
-    !$OMP DO
     do iElem=1,n
       res(1+(iElem-1)*nComps:iElem*nComps) &
         & = p%dat(1+(elempos(iElem)-1)*nComps:elempos(iElem)*nComps)
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine get_sampled_element
   ! ************************************************************************ !
diff --git a/source/ply_sampling_varsys_module.f90 b/source/ply_sampling_varsys_module.f90
index 2473da5..aed8a40 100644
--- a/source/ply_sampling_varsys_module.f90
+++ b/source/ply_sampling_varsys_module.f90
@@ -83,7 +83,6 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
     integer, allocatable :: elempos(:)
     real(kind=rk), allocatable :: elemdat(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, iComponent, iElem, iTotComp)
 
     nVars = trackInst%varmap%varPos%nVals
 
@@ -115,14 +114,12 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
       nComponents = varsys%method%val(varpos)%nComponents
       nDofs = (var_degree(iVar)+1)**nDims
 
-      !$OMP DO
       do iComponent=1,nComponents
         iTotComp = iScalar+iComponent-1
         call ply_sampling_var_allocate( var     = var(iTotComp), &
           &                             nElems  = nElems,        &
           &                             datalen = nDofs*nElems   )
       end do
-      !$OMP END DO
 
       isScalar: if (nComponents == 1) then
 
@@ -144,7 +141,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
         ! To avoid overly large memory consumption, we do this element by
         ! element.
         allocate(elemDat(nComponents*ndofs))
-        !$OMP DO
+
         do iElem=1,nElems
           call varSys%method%val(varpos)%get_element( &
             &    varSys  = varSys,                    &
@@ -160,7 +157,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
               & = elemdat(iComponent::nComponents)
           end do
         end do
-        !$OMP END DO
+
         deallocate(elemDat)
 
       end if isScalar
@@ -170,7 +167,6 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
       ! polynomial degree in each element.
       ! This is redundant, but convenient.
       lastScalar = iScalar+nComponents-1
-      !$OMP DO
       do iComponent=0,nComponents-1
         var(iScalar+iComponent)%first(1) = 1
         do iElem=1,nElems
@@ -179,13 +175,11 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
           var(iScalar+iComponent)%degree(iElem) = var_degree(iVar)
         end do
       end do
-      !$OMP END DO
 
       iScalar = iScalar + nComponents
 
     end do variables
 
-    !$OMP END PARALLEL
 
   end subroutine ply_sampling_varsys_for_track
   ! ------------------------------------------------------------------------ !
@@ -307,7 +301,6 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
     integer :: nElems
     integer :: ndofs
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iElem, ndofs, absmean, variation)
 
     if (allocated(var%deviates)) deallocate(var%deviates)
     var%nDeviating = 0
@@ -315,7 +308,7 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
     if (allocated(var%first)) then
       nElems = size(var%first)-1
       allocate(var%deviates(nElems))
-      !$OMP DO
+
       do iElem=1,nElems
         ndofs = var%first(iElem+1) - var%first(iElem) - 1
         absmean = max( abs(var%dat(var%first(iElem))), min_mean )
@@ -323,10 +316,8 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
         var%deviates(iElem) = (variation > threshold*absmean)
         if (var%deviates(iElem)) var%nDeviating = var%nDeviating + 1
       end do
-      !$OMP END DO
     end if
 
-    !$OMP END PARALLEL
 
   end subroutine ply_sampling_var_compute_elemdev
   ! ------------------------------------------------------------------------ !
diff --git a/source/ply_space_integration_module.f90 b/source/ply_space_integration_module.f90
index fc5bb7f..fa6f1e0 100644
--- a/source/ply_space_integration_module.f90
+++ b/source/ply_space_integration_module.f90
@@ -115,7 +115,6 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: numQuadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
 
     numQuadPoints = num_intp_per_direction**3
     allocate(points(numQuadPoints,3))
@@ -130,7 +129,6 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -146,9 +144,6 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
         end do
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_create_volume_gauss_points_cube
   ! *********************************************************************** !
@@ -172,7 +167,6 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: numQuadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     numQuadPoints = num_intp_per_direction**2
     allocate(points(numQuadPoints,3))
@@ -187,7 +181,6 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
@@ -200,9 +193,7 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
         pointNumber = pointNumber + 1
       end do
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_create_volume_gauss_points_cube_2d
   ! *********************************************************************** !
@@ -225,7 +216,6 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
     real(kind=rk), allocatable :: gaussp1D(:)
     real(kind=rk), allocatable :: weights1D(:)
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i)
 
     allocate(points(num_intp_per_direction,3))
     allocate(weights(num_intp_per_direction))
@@ -239,7 +229,6 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do i = 1, num_intp_per_direction
       !> here we build all possible combinations of the one-dimensional
       !! quadrature points to get the three dimensional values.
@@ -250,9 +239,7 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
       weights(PointNumber) = weights1D(i)
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_create_volume_gauss_points_cube_1d
   ! *********************************************************************** !
@@ -283,7 +270,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: nquadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i)
 
     nQuadPoints = num_intp_per_direction**2
 
@@ -302,7 +288,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -314,10 +299,8 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
 
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -329,10 +312,8 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
 
     case(3) ! face in z direction, z coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -344,7 +325,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in create_surface_gauss_points_cube:' &
@@ -352,7 +332,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
 
     end select
 
-    !$OMP END PARALLEL
 
   end subroutine ply_create_surface_gauss_points_cube
   ! *********************************************************************** !
@@ -383,7 +362,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: nQuadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i)
 
     ! The number of quadrature points on the boundary of a 2d volume is the
     ! number of quad points in one direction
@@ -404,7 +382,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
         !! quadrature points for 2d case to get the three dimensional values.
@@ -414,10 +391,8 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
         weights(PointNumber) = weights1D(i)
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
 
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
         !! quadrature points in 2d case to get the three dimensional values.
@@ -427,7 +402,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
         weights(PointNumber) = weights1D(i)
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in create_surface_gauss_points_cube_2d:' &
@@ -435,7 +409,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
 
     end select
 
-    !$OMP END PARALLEL
 
   end subroutine ply_create_surface_gauss_points_cube_2d
   ! *********************************************************************** !
@@ -503,7 +476,6 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
     real(kind=rk), allocatable :: gaussp1D(:)
     real(kind=rk), allocatable :: weights1D(:)
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j)
 
     numQuadPoints = num_intp_per_direction
 
@@ -519,7 +491,6 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       !> here we build all possible combinations of the one-dimensional
       !! quadrature points to get the three dimensional values.
@@ -527,9 +498,7 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
       weights(PointNumber) = weights1D(j)
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_create_gauss_points_1d
   ! *********************************************************************** !
@@ -558,14 +527,12 @@ subroutine ply_gaussLegPoints( x1, x2, x, w, nIntP )
     real(kind=rk) :: EPS
     integer :: m, i, j
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, p1, p2, p3, pp, z1, z)
 
     EPS= 1.0 / (10.0**(PRECISION(1.0_rk)-2) )
     m = (nIntP+1)/2
     xm = 0.5*(x2+x1)
     xl = 0.5*(x2-x1)
 
-    !$OMP DO
     do i = 1, m
 
       z = cos(PI*((i-1)+0.75_rk)/(nIntP+0.5_rk))
@@ -590,9 +557,7 @@ subroutine ply_gaussLegPoints( x1, x2, x, w, nIntP )
       w(nIntp-i+1) = w(i)
 
     end do
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_gaussLegPoints
   ! *********************************************************************** !
diff --git a/source/ply_split_element_module.f90 b/source/ply_split_element_module.f90
index 5fd0468..5af6213 100644
--- a/source/ply_split_element_module.f90
+++ b/source/ply_split_element_module.f90
@@ -219,7 +219,6 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
     integer :: nParents
     integer :: parentpos, childpos
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDir, parentMode, iParent, childMode, indep)
 
     nParents = size(parent_data,2)
 
@@ -233,13 +232,11 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
     ! The number of independent modes (in normal directions) is given
     ! by the product of the length in all directions, except the last one.
     nIndeps = 1
-    !$OMP DO
     do iDir=1,nDims-1
       nIndeps = nIndeps*inLen(iDir)
     end do
-    !$OMP END DO
 
-    !$OMP DO
+
     oldmodes: do parentMode=1,inLen(nDims)
       ! Maximal number modes to compute, as this is a triangular matrix
       ! it is limited by the diagonal (parentMode). However, it may be
@@ -270,9 +267,7 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
       end do elemloop
 
     end do oldmodes
-    !$OMP END DO
 
-    !$OMP END PARALLEL
 
   end subroutine ply_split_element_singleD
   ! ======================================================================== !
diff --git a/source/ply_transfer_module.fpp b/source/ply_transfer_module.fpp
index 5f87f8a..35a2a99 100644
--- a/source/ply_transfer_module.fpp
+++ b/source/ply_transfer_module.fpp
@@ -123,14 +123,12 @@ contains
 
       ispace_oq: if (inspace == Q_Space) then
 
-        !$OMP DO
         ! Both, output and input are Q Polynomials
         do out_Y=0,minord-1
           out_off = out_Y*(outdegree+1)
           in_off = out_Y*(indegree+1)
           outdat(out_off+1:out_off+minord) = indat(in_off+1:in_off+minord)
         end do
-        !$OMP END DO
 
       else ispace_oq
 
diff --git a/utests/with_fftw/ply_fpt_2D_lobattoNodes_test.fpp b/utests/with_fftw/ply_fpt_2D_lobattoNodes_test.fpp
index 9f94bbe..3f35a6f 100644
--- a/utests/with_fftw/ply_fpt_2D_lobattoNodes_test.fpp
+++ b/utests/with_fftw/ply_fpt_2D_lobattoNodes_test.fpp
@@ -7,7 +7,8 @@ program ply_fpt_2D_lobattoNodes_test
   use tem_param_module,         only: PI
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_space
@@ -52,6 +53,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -122,7 +124,8 @@ contains
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   nIndeps       = maxPolyDegree+1, &
       &                   fpt           = fpt,             &
-      &                   lobattoPoints = .true.           )
+      &                   lobattoPoints = .true.,          &
+      &                   bu            = bu               )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**2, nVars ))
@@ -130,7 +133,8 @@ contains
     call ply_legToPnt_2D( fpt       = fpt,       &
       &                   legCoeffs = legCoeffs, &
       &                   pntVal    = pntVal,    &
-      &                   nVars     = nVars      )
+      &                   nVars     = nVars,     &
+      &                   bu        = bu         )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_fpt_2D_singVar_lobattoNodes_test.fpp b/utests/with_fftw/ply_fpt_2D_singVar_lobattoNodes_test.fpp
index a424ff8..a99696b 100644
--- a/utests/with_fftw/ply_fpt_2D_singVar_lobattoNodes_test.fpp
+++ b/utests/with_fftw/ply_fpt_2D_singVar_lobattoNodes_test.fpp
@@ -6,7 +6,8 @@ program ply_fpt_2D_singVar_lobattoNodes_test
   use tem_param_module,         only: PI
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_space
@@ -49,6 +50,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -114,12 +116,14 @@ contains
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   nIndeps       = maxpolydegree+1, &
       &                   fpt           = fpt,             &
-      &                   lobattoPoints = .true.           )
+      &                   lobattoPoints = .true.,          &
+      &                   bu            = bu               )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**2 ))
     write(logUnit(10),*) 'Calculating FPT ...'
-    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal )
+    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal, &
+        &                 bu = bu )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_fpt_2D_singVar_test.fpp b/utests/with_fftw/ply_fpt_2D_singVar_test.fpp
index 1a5b550..811b053 100644
--- a/utests/with_fftw/ply_fpt_2D_singVar_test.fpp
+++ b/utests/with_fftw/ply_fpt_2D_singVar_test.fpp
@@ -6,7 +6,8 @@ program ply_fpt_2D_singVar_test
   use tem_param_module,         only: PI
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_Space
@@ -49,6 +50,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -114,12 +116,14 @@ contains
     ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   nIndeps       = maxPolyDegree+1, &
-      &                   fpt           = fpt              )
+      &                   fpt           = fpt,             &
+      &                   bu            = bu               )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**2 ))
     write(logUnit(1),*) 'Calculating FPT ...'
-    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal )
+    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal, &
+        &                 bu  = bu  )
     write(logUnit(1),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_fpt_2D_test.fpp b/utests/with_fftw/ply_fpt_2D_test.fpp
index 2c88472..84f6338 100644
--- a/utests/with_fftw/ply_fpt_2D_test.fpp
+++ b/utests/with_fftw/ply_fpt_2D_test.fpp
@@ -6,7 +6,8 @@ program ply_fpt_2D_test
   use tem_param_module,         only: PI
   use tem_general_module,       only: tem_general_type, tem_start
   use tem_logging_module,       only: logUnit
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_space
@@ -49,6 +50,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -119,13 +121,14 @@ contains
     ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   nIndeps       = maxPolyDegree+1, &
-      &                   fpt           = fpt              )
+      &                   fpt           = fpt,             &
+      &                   bu            = bu               )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**2, nVars ))
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal, &
-      &                   nVars = nVars )
+      &                   nVars = nVars, bu = bu                             )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_fpt_3D_lobattoNodes_test.fpp b/utests/with_fftw/ply_fpt_3D_lobattoNodes_test.fpp
index 33acdc9..7bd95bd 100644
--- a/utests/with_fftw/ply_fpt_3D_lobattoNodes_test.fpp
+++ b/utests/with_fftw/ply_fpt_3D_lobattoNodes_test.fpp
@@ -6,7 +6,8 @@ program ply_fpt_3D_lobattoNodes_test
   use tem_param_module,         only: PI
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                  ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_space
@@ -49,6 +50,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -124,13 +126,14 @@ contains
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxpolydegree+1)**2, &
       &                   fpt           = fpt,                  &
-      &                   lobattoPoints = .true.                )
+      &                   lobattoPoints = .true.,               &
+      &                   bu            = bu                    )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_3D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal, &
-      &                   nVars = nVars )
+      &                   nVars = nVars, bu = bu                             )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_fpt_3D_performance_test.f90 b/utests/with_fftw/ply_fpt_3D_performance_test.f90
index d6ad78b..74687fe 100644
--- a/utests/with_fftw/ply_fpt_3D_performance_test.f90
+++ b/utests/with_fftw/ply_fpt_3D_performance_test.f90
@@ -5,7 +5,8 @@ program ply_fpt_3D_performance_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit, tem_logging_init_primary
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D, &
     &                                 ply_pntToLeg_3D
 
@@ -34,7 +35,7 @@ program ply_fpt_3D_performance_test
   write(*,*) 'Maximal deviation:', res
   if (res <  1.e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -47,6 +48,7 @@ subroutine ply_check_legToPnt_3D(power,res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     real(kind=rk) :: starttime, stoptime
 
     ! Define the maximal polynomial degree we want to calculate the
@@ -57,40 +59,43 @@ subroutine ply_check_legToPnt_3D(power,res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**3
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**3,nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar, rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxpolydegree+1)**2, &
-      &                   fpt           = fpt                   )
-  
+      &                   fpt           = fpt,                  &
+      &                   bu            = bu                    )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**3, nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     legCoeffsIn = legCoeffs
     starttime = MPI_Wtime()
     call ply_legToPnt_3D( fpt       = fpt,         &
       &                   legCoeffs = legCoeffsIn, &
       &                   pntVal    = pntVal,      &
-      &                   nVars     = nVars        ) 
+      &                   nVars     = nVars,       &
+      &                   bu        = bu           )
     stoptime = MPI_Wtime()
     write(*,*) 'Time for degree ', maxpolydegree, ' trafo:   ', stoptime - starttime
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**3,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**3,nVars ))
     starttime = MPI_Wtime()
     call ply_pntToLeg_3D( fpt       = fpt,    &
       &                   pntVal    = pntVal, &
       &                   legCoeffs = legVal, &
-      &                   nVars     = nVars   )
+      &                   nVars     = nVars,  &
+      &                   bu        = bu      )
     stoptime = MPI_Wtime()
     write(*,*) 'Time for degree ', maxpolydegree, ' inverse: ', stoptime - starttime
-  
+
     res = maxval(abs(legVal(:,:) - legCoeffs(:,:)))
 
   end subroutine ply_check_legToPnt_3D
diff --git a/utests/with_fftw/ply_fpt_3D_test.fpp b/utests/with_fftw/ply_fpt_3D_test.fpp
index 0858c9c..4fdcbb6 100644
--- a/utests/with_fftw/ply_fpt_3D_test.fpp
+++ b/utests/with_fftw/ply_fpt_3D_test.fpp
@@ -6,7 +6,8 @@ program ply_fpt_3D_test
   use tem_param_module,         only: PI
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT,  &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_space
@@ -49,6 +50,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -124,13 +126,14 @@ contains
     ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxpolydegree+1)**2, &
-      &                   fpt           = fpt                   )
+      &                   fpt           = fpt,                  &
+      &                   bu            = bu                    )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_3D( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal, &
-      &                   nVars = nVars )
+      &                   nVars = nVars, bu = bu )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_lobattoNodes_test.f90 b/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_lobattoNodes_test.f90
index be0920d..c3aa6df 100644
--- a/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_lobattoNodes_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_lobattoNodes_test.f90
@@ -2,9 +2,10 @@
 !! \author{Jens Zudrop}
 program ply_fpt_ifpt_2D_multiVar_lobattoNodes_test
   use env_module,               only: rk, fin_env
-  use tem_logging_module,       only: logUnit 
+  use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D,    &
     &                                 ply_pntToLeg_2D
 
@@ -18,7 +19,7 @@ program ply_fpt_ifpt_2D_multiVar_lobattoNodes_test
   call tem_start(codeName = 'Ateles unit test', &
     &            version  = 'utest',            &
     &            general  = general             )
-  
+
   res = 0.0_rk
   do iPower = 1,6
     call ply_check_legToPnt_2D(iPower, newRes)
@@ -29,8 +30,8 @@ program ply_fpt_ifpt_2D_multiVar_lobattoNodes_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
-  
+  end if
+
   call fin_env()
 
 contains
@@ -43,6 +44,7 @@ subroutine ply_check_legToPnt_2D(power, res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -52,35 +54,36 @@ subroutine ply_check_legToPnt_2D(power, res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**2
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**2, nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**2, nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**2, nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**2, nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar,rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   fpt           = fpt,             &
       &                   nIndeps       = maxPolyDegree+1, &
-      &                   lobattoPoints = .true.           )
-  
+      &                   lobattoPoints = .true.,          &
+      &                   bu            = bu               )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**2,nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**2,nVars ))
     legCoeffsIn = legCoeffs ! Duplicate input vector to make sure that it is not modified in the trafo
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffsIn, &
-      &                   pntVal = pntVal, nVars = nVars )
+      &                   pntVal = pntVal, nVars = nVars, bu = bu )
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**2,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**2,nVars ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     call ply_pntToLeg_2D( fpt = fpt, pntVal = pntVal,       &
-      &                   legCoeffs = legVal, nVars = nVars )
+      &                   legCoeffs = legVal, nVars = nVars, bu = bu )
     write(logUnit(10),*) 'Finished'
-  
+
     !!do iDof = 1, (maxPolyDegree+1)**2
     !!  write(*,*) 'Leg coeff ', iDof, ' has error: ', legVal(iDof) - legCoeffs(iDof)
     !!end do
@@ -93,7 +96,7 @@ subroutine ply_check_legToPnt_2D(power, res)
       maxErr = maxloc(abs(legVal(:,iVar) - legCoeffs(:,iVar)), 1)
       write(*,*) 'Ref. sol ', legCoeffs(maxErr,iVar), ' alg delivers: ', legVal(maxErr,iVar)
     end do
-   
+
     res = maxval(abs(legVal(:,:) - legCoeffs(:,:)))
 
   end subroutine
diff --git a/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_test.f90 b/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_test.f90
index 38478c2..18b5a72 100644
--- a/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_2D_multiVar_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_2D_multiVar_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+  &                                   ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D, &
     &                                 ply_pntToLeg_2D
 
@@ -18,7 +19,7 @@ program ply_fpt_ifpt_2D_multiVar_test
   call tem_start(codeName = 'Ateles unit test', &
     &            version  = 'utest',            &
     &            general  = general             )
- 
+
   res = 0.0_rk
   do iPower = 1,6
     call ply_check_legToPnt_2D(iPower, newRes)
@@ -29,7 +30,7 @@ program ply_fpt_ifpt_2D_multiVar_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -43,6 +44,7 @@ subroutine ply_check_legToPnt_2D(power, res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -52,34 +54,35 @@ subroutine ply_check_legToPnt_2D(power, res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**2
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**2, nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**2, nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**2, nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**2, nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar,rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   fpt           = fpt,             &
-      &                   nIndeps       = maxPolyDegree+1  )
-  
+      &                   nIndeps       = maxPolyDegree+1, &
+      &                   bu            = bu               )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**2,nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**2,nVars ))
     legCoeffsIn = legCoeffs ! Duplicate input vector to make sure that it is not modified in the trafo
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffsIn, &
-      &                   pntVal = pntVal, nVars = nVars      )
+      &                   pntVal = pntVal, nVars = nVars, bu = bu   )
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**2,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**2,nVars ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     call ply_pntToLeg_2D( fpt = fpt, pntVal = pntVal,       &
-      &                   legCoeffs = legVal, nVars = nVars )
+      &                   legCoeffs = legVal, nVars = nVars, bu = bu )
     write(logUnit(10),*) 'Finished'
-  
+
     !!do iDof = 1, (maxPolyDegree+1)**2
     !!  write(*,*) 'Leg coeff ', iDof, ' has error: ', legVal(iDof) - legCoeffs(iDof)
     !!end do
@@ -92,7 +95,7 @@ subroutine ply_check_legToPnt_2D(power, res)
       maxErr = maxloc(abs(legVal(:,iVar) - legCoeffs(:,iVar)), 1)
       write(*,*) 'Ref. sol ', legCoeffs(maxErr,iVar), ' alg delivers: ', legVal(maxErr,iVar)
     end do
-   
+
     res = maxval(abs(legVal(:,:) - legCoeffs(:,:)))
 
   end subroutine
diff --git a/utests/with_fftw/ply_fpt_ifpt_2D_singVar_lobattoNodes_test.f90 b/utests/with_fftw/ply_fpt_ifpt_2D_singVar_lobattoNodes_test.f90
index 6f4bcdf..23adb70 100644
--- a/utests/with_fftw/ply_fpt_ifpt_2D_singVar_lobattoNodes_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_2D_singVar_lobattoNodes_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_2D_singVar_lobattoNodes_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D,    &
     &                                 ply_pntToLeg_2D
 
@@ -29,7 +30,7 @@ program ply_fpt_ifpt_2D_singVar_lobattoNodes_test
 
   if (res < 1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -44,6 +45,7 @@ subroutine ply_check_legToPnt_2D(power, res)
     real(kind=rk), allocatable :: legCoeffs(:), legCoeffsIn(:)
     real(kind=rk), allocatable :: pntVal(:), legVal(:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -62,19 +64,22 @@ subroutine ply_check_legToPnt_2D(power, res)
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,   &
       &                   fpt           = fpt,             &
       &                   nIndeps       = maxPolyDegree+1, &
-      &                   lobattoPoints = .true.           )
+      &                   lobattoPoints = .true.,          &
+      &                   bu            = bu               )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal( (maxPolyDegree+1)**2))
     legCoeffsIn = legCoeffs ! Duplicate input vector to make sure that it is not modified in the trafo
     write(logUnit(10),*) 'Calculating FPT ...'
-    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffsIn, pntVal = pntVal )
+    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffsIn, pntVal = pntVal, &
+        &                 bu = bu )
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
     allocate(legVal( (maxPolyDegree+1)**2 ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
-    call ply_pntToLeg_2D( fpt = fpt, pntVal = pntVal, legCoeffs = legVal )
+    call ply_pntToLeg_2D( fpt = fpt, pntVal = pntVal, legCoeffs = legVal, &
+          &               bu = bu )
     write(logUnit(10),*) 'Finished'
 
     !!do iDof = 1, (maxPolyDegree+1)**2
diff --git a/utests/with_fftw/ply_fpt_ifpt_2D_singVar_test.f90 b/utests/with_fftw/ply_fpt_ifpt_2D_singVar_test.f90
index be0ec3a..b7f2990 100644
--- a/utests/with_fftw/ply_fpt_ifpt_2D_singVar_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_2D_singVar_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_2D_singVar_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+  &                                   ply_legFpt_bu_type
   use ply_legFpt_2D_module,     only: ply_legToPnt_2D, ply_pntToLeg_2D
 
   implicit none
@@ -28,7 +29,7 @@ program ply_fpt_ifpt_2D_singVar_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -41,7 +42,8 @@ subroutine ply_check_legToPnt_2D(power, res)
     real(kind=rk), allocatable :: legCoeffs(:), legCoeffsIn(:)
     real(kind=rk), allocatable :: pntVal(:), legVal(:)
     type(ply_legFpt_type) :: fpt
-  
+    type(ply_legFpt_bu_type) :: bu
+
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
     maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
@@ -49,30 +51,31 @@ subroutine ply_check_legToPnt_2D(power, res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**2
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**2)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**2)) 
+    allocate(legCoeffs((maxPolyDegree+1)**2))
+    allocate(legCoeffsIn((maxPolyDegree+1)**2))
     legCoeffs(:) = real(1,rk)
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,  &
       &                   fpt           = fpt,            &
-      &                   nIndeps       = maxPolyDegree+1 )
-  
+      &                   nIndeps       = maxPolyDegree+1,&
+      &                   bu            = bu              )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**2)) 
+    allocate(pntVal( (maxPolyDegree+1)**2))
     legCoeffsIn = legCoeffs ! Duplicate input vector to make sure that it is not modified in the trafo
     write(logUnit(10),*) 'Calculating FPT ...'
-    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffsIn, pntVal = pntVal ) 
+    call ply_legToPnt_2D( fpt = fpt, legCoeffs = legCoeffsIn, pntVal = pntVal, bu = bu )
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**2 )) 
+    allocate(legVal( (maxPolyDegree+1)**2 ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
-    call ply_pntToLeg_2D( fpt = fpt, pntVal = pntVal, legCoeffs = legVal )
+    call ply_pntToLeg_2D( fpt = fpt, pntVal = pntVal, legCoeffs = legVal, bu = bu )
     write(logUnit(10),*) 'Finished'
-  
+
     !!do iDof = 1, (maxPolyDegree+1)**2
     !!  write(*,*) 'Leg coeff ', iDof, ' has error: ', legVal(iDof) - legCoeffs(iDof)
     !!end do
@@ -83,7 +86,7 @@ subroutine ply_check_legToPnt_2D(power, res)
              & ' has largest error of: ' ,maxval(abs(legVal(:) - legCoeffs(:)))
     maxErr = maxloc(abs(legVal(:) - legCoeffs(:)), 1)
     write(logUnit(10),*) 'Ref. sol ', legCoeffs(maxErr), ' alg delivers: ', legVal(maxErr)
-   
+
     res = maxval(abs(legVal(:) - legCoeffs(:)))
 
   end subroutine
diff --git a/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_lobattoNodes_test.f90 b/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_lobattoNodes_test.f90
index 9137aaa..bc043e8 100644
--- a/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_lobattoNodes_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_lobattoNodes_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_3D_multiVar_lobattoNodes_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D, &
     &                                 ply_pntToLeg_3D
 
@@ -29,7 +30,7 @@ program ply_fpt_ifpt_3D_multiVar_lobattoNodes_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -42,7 +43,8 @@ subroutine ply_check_legToPnt_3D(power,res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
-  
+    type(ply_legFpt_bu_type) :: bu
+
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
     maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
@@ -51,39 +53,40 @@ subroutine ply_check_legToPnt_3D(power,res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**3
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**3,nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar, rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxPolyDegree+1)**2, &
       &                   fpt           = fpt,                  &
-      &                   lobattoPoints = .true.                )
-  
+      &                   lobattoPoints = .true.,               &
+      &                   bu            = bu                    )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**3, nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     legCoeffsIn = legCoeffs
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_3D( fpt = fpt, legCoeffs = legCoeffsIn, &
-      &                   pntVal = pntVal, nVars = nVars      )
+      &                   pntVal = pntVal, nVars = nVars, bu = bu )
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**3,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**3,nVars ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     call ply_pntToLeg_3D( fpt = fpt, pntVal = pntVal,       &
-      &                   legCoeffs = legVal, nVars = nVars )
+      &                   legCoeffs = legVal, nVars = nVars, bu = bu )
     write(logUnit(10),*) 'Finished'
-  
+
 
     ! Write out the coefficient with the largest absolute error
     do iVar = 1, nVars
-      do iDof = 1, size(legVal(:,iVar)) 
+      do iDof = 1, size(legVal(:,iVar))
         write(logUnit(10),*) legVal(iDof,iVar), legCoeffs(iDof,iVar)
       end do
       write(logUnit(10),*) 'For var ', iVar, &
diff --git a/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_test.f90 b/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_test.f90
index ee39504..ae817c3 100644
--- a/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_3D_multiVar_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_3D_multiVar_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D, &
     &                                 ply_pntToLeg_3D
 
@@ -29,7 +30,7 @@ program ply_fpt_ifpt_3D_multiVar_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -42,6 +43,7 @@ subroutine ply_check_legToPnt_3D(power,res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -51,38 +53,41 @@ subroutine ply_check_legToPnt_3D(power,res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**3
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**3,nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar, rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxpolydegree+1)**2, &
-      &                   fpt           = fpt                   )
-  
+      &                   fpt           = fpt,                  &
+      &                   bu            = bu                    )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**3, nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     legCoeffsIn = legCoeffs
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt_3D( fpt       = fpt,         &
       &                   legCoeffs = legCoeffsIn, &
       &                   pntVal    = pntVal,      &
-      &                   nVars     = nVars        ) 
+      &                   nVars     = nVars,       &
+      &                   bu        = bu           )
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**3,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**3,nVars ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     call ply_pntToLeg_3D( fpt       = fpt,    &
       &                   pntVal    = pntVal, &
       &                   legCoeffs = legVal, &
-      &                   nVars     = nVars   )
+      &                   nVars     = nVars,  &
+      &                   bu        = bu      )
     write(logUnit(10),*) 'Finished'
-  
+
 
     ! Write out the coefficient with the largest absolute error
     do iVar = 1, nVars
diff --git a/utests/with_fftw/ply_fpt_ifpt_3D_singVar_lobattoNodes_test.f90 b/utests/with_fftw/ply_fpt_ifpt_3D_singVar_lobattoNodes_test.f90
index 78cd332..eeec7a2 100644
--- a/utests/with_fftw/ply_fpt_ifpt_3D_singVar_lobattoNodes_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_3D_singVar_lobattoNodes_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_3D_singVar_lobattoNodes_test
   use env_module,               only: rk, fin_env
   use tem_general_module,       only: tem_general_type, tem_start
   use tem_logging_module,       only: logUnit
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+  &                                   ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D, &
     &                                 ply_pntToLeg_3D
 
@@ -18,7 +19,7 @@ program ply_fpt_ifpt_3D_singVar_lobattoNodes_test
   call tem_start(codeName = 'Ateles unit test', &
     &            version  = 'utest',            &
     &            general  = general             )
-  
+
   res = 0.0_rk
   do iPower = 1,3
     call ply_check_legToPnt_3D(iPower, newRes)
@@ -29,7 +30,7 @@ program ply_fpt_ifpt_3D_singVar_lobattoNodes_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -44,6 +45,7 @@ subroutine ply_check_legToPnt_3D(power,res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -53,43 +55,44 @@ subroutine ply_check_legToPnt_3D(power,res)
       & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' // &
       & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**3
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**3,nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar, rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxPolyDegree+1)**2, &
       &                   fpt           = fpt,                  &
-      &                   lobattoPoints = .true.                )
-  
+      &                   lobattoPoints = .true.,               &
+      &                   bu            = bu                    )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**3, nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     legCoeffsIn = legCoeffs
     write(logUnit(10),*) 'Calculating FPT ...'
     do iVar=1,nVars
       call ply_legToPnt_3D( fpt = fpt, legCoeffs = legCoeffsIn(:,iVar), &
-        &                   pntVal = pntVal(:,iVar) )
+        &                   pntVal = pntVal(:,iVar), bu = bu )
     end do
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**3,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**3,nVars ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     do iVar=1,nVars
       call ply_pntToLeg_3D( fpt = fpt, pntVal = pntVal(:,iVar), &
-        &                   legCoeffs = legVal(:,iVar)          )
+        &                   legCoeffs = legVal(:,iVar), bu = bu )
     end do
     write(logUnit(10),*) 'Finished'
-  
+
 
     ! Write out the coefficient with the largest absolute error
     do iVar = 1, nVars
-      do iDof = 1, size(legVal(:,iVar)) 
+      do iDof = 1, size(legVal(:,iVar))
         write(logUnit(10),*) legVal(iDof,iVar), legCoeffs(iDof,iVar)
       end do
       write(logUnit(10),*) 'For var ', iVar, &
diff --git a/utests/with_fftw/ply_fpt_ifpt_3D_singVar_test.f90 b/utests/with_fftw/ply_fpt_ifpt_3D_singVar_test.f90
index 4306662..5a72a18 100644
--- a/utests/with_fftw/ply_fpt_ifpt_3D_singVar_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_3D_singVar_test.f90
@@ -4,7 +4,8 @@ program ply_fpt_ifpt_3D_singVar_test
   use env_module,               only: rk, fin_env
   use tem_logging_module,       only: logUnit
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_legToPnt_3D, ply_pntToLeg_3D
 
   implicit none
@@ -17,7 +18,7 @@ program ply_fpt_ifpt_3D_singVar_test
   call tem_start(codeName = 'Ateles unit test', &
     &            version  = 'utest',            &
     &            general  = general             )
- 
+
   res = 0.0_rk
   do iPower = 1,3
     call ply_check_legToPnt_3D(iPower, newRes)
@@ -28,7 +29,7 @@ program ply_fpt_ifpt_3D_singVar_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -41,6 +42,7 @@ subroutine ply_check_legToPnt_3D(power,res)
     real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
     real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -50,40 +52,43 @@ subroutine ply_check_legToPnt_3D(power,res)
       & // ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
     write(logUnit(10),*) '------------------------------------' &
       & // ' Number of Legendre coefficients (total): ', (maxPolyDegree+1)**3
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs((maxPolyDegree+1)**3,nVars)) 
-    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars)) 
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars))
     do iVar = 1, nVars
       legCoeffs(:,iVar) = real(iVar, rk)
     end do
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxPolyDegree+1)**2, &
-      &                   fpt           = fpt                   )
-  
+      &                   fpt           = fpt,                  &
+      &                   bu            = bu                    )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal( (maxPolyDegree+1)**3, nVars )) 
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
     legCoeffsIn = legCoeffs
     write(logUnit(10),*) 'Calculating FPT ...'
     do iVar=1,nVars
       call ply_legToPnt_3D( fpt       = fpt,                 &
         &                   legCoeffs = legCoeffsIn(:,iVar), &
-        &                   pntVal    = pntVal(:,iVar)       ) 
+        &                   pntVal    = pntVal(:,iVar),      &
+        &                   bu        = bu                   )
     end do
     write(logUnit(10),*) 'Finished'
 
     ! now transform back to Legendre coefficients
-    allocate(legVal( (maxPolyDegree+1)**3,nVars )) 
+    allocate(legVal( (maxPolyDegree+1)**3,nVars ))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     do iVar=1,nVars
       call ply_pntToLeg_3D( fpt       = fpt,            &
         &                   pntVal    = pntVal(:,iVar), &
-        &                   legCoeffs = legVal(:,iVar)  ) 
+        &                   legCoeffs = legVal(:,iVar), &
+        &                   bu        = bu              )
     end do
     write(logUnit(10),*) 'Finished'
-  
+
 
     ! Write out the coefficient with the largest absolute error
     do iVar = 1, nVars
diff --git a/utests/with_fftw/ply_fpt_ifpt_test.f90 b/utests/with_fftw/ply_fpt_ifpt_test.f90
index b6ae7fe..2aabefb 100644
--- a/utests/with_fftw/ply_fpt_ifpt_test.f90
+++ b/utests/with_fftw/ply_fpt_ifpt_test.f90
@@ -5,7 +5,7 @@ program ply_fpt_ifpt_test
   use tem_logging_module,       only: logUnit
   use tem_aux_module,           only: tem_abort
   use ply_legFpt_module,        only: ply_init_legFpt, ply_legFpt_type, &
-    &                                 ply_legToPnt, ply_pntToLeg
+    &                                 ply_legToPnt, ply_pntToLeg, ply_legFpt_bu_type
   use ply_modg_basis_module,    only: legendre_1D
   use tem_general_module,       only: tem_general_type, tem_start
 
@@ -30,7 +30,7 @@ program ply_fpt_ifpt_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
 
   call fin_env()
 
@@ -42,33 +42,37 @@ subroutine check_fwd_bwd(power, res)
     integer :: maxPolyDegree
     real(kind=rk), allocatable :: legCoeffs(:), pntVal(:), legVal(:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
     maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
     write(logUnit(10),*) '------- Number of Legendre coefficients: ', maxPolyDegree+1
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs(1:maxPolyDegree+1)) 
-    allocate(legVal(1:maxPolyDegree+1)) 
+    allocate(legCoeffs(1:maxPolyDegree+1))
+    allocate(legVal(1:maxPolyDegree+1))
     legCoeffs(:) = 1.0_rk
     legVal = legCoeffs
 
-    ! Init the FPT 
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree, &
       &                   nIndeps       = 1,             &
-      &                   fpt           = fpt            )
+      &                   fpt           = fpt,           &
+      &                   bu            = bu             )
 
     ! now transform to the Chebyshev nodes
-    allocate(pntVal(1:maxPolyDegree+1)) 
+    allocate(pntVal(1:maxPolyDegree+1))
     write(logUnit(10),*) 'Calculating FPT ...'
-    call ply_legToPnt( fpt = fpt, legCoeffs = legVal, pntVal = pntVal, nIndeps = 1 )
+    call ply_legToPnt( fpt = fpt, legCoeffs = legVal, pntVal = pntVal,  &
+                       nIndeps = 1, bu = bu )
     write(logUnit(10),*) 'Finished'
 
     ! now transform to the Legendre coefficients
     write(logUnit(10),*) 'Calculating inverse FPT ...'
-    call ply_pntToLeg( fpt = fpt, pntVal = pntVal, legCoeffs = legVal, nIndeps = 1 )
+    call ply_pntToLeg( fpt = fpt, pntVal = pntVal, legCoeffs = legVal,  &
+        &              nIndeps = 1, bu = bu )
     write(logUnit(10),*) 'Finished'
 
     ! Write out the polynomial coefficient with the largest absolute error
diff --git a/utests/with_fftw/ply_fpt_lobattoNodes_test.f90 b/utests/with_fftw/ply_fpt_lobattoNodes_test.f90
index 61ea95c..d1fef55 100644
--- a/utests/with_fftw/ply_fpt_lobattoNodes_test.f90
+++ b/utests/with_fftw/ply_fpt_lobattoNodes_test.f90
@@ -7,7 +7,7 @@ program ply_fpt_lobattoNodes_test
   use tem_logging_module,       only: logUnit
   use tem_aux_module,           only: tem_abort
   use ply_legFpt_module,        only: ply_init_legFpt, ply_legFpt_type, &
-    &                                 ply_legToPnt
+    &                                 ply_legToPnt, ply_legFpt_bu_type
   use ply_modg_basis_module,    only: legendre_1D
   use tem_general_module,       only: tem_general_type, tem_start
 
@@ -32,7 +32,7 @@ program ply_fpt_lobattoNodes_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
   call fin_env()
 
 contains
@@ -46,23 +46,24 @@ subroutine ply_check_legToPnt(power, res)
     real(kind=rk), allocatable :: chebPnt(:)
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     type(ply_legFpt_type) :: fpt
-  
+    type(ply_legFpt_bu_type) :: bu
+
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
     maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
     write(logUnit(10),*) '------- Number of Legendre coefficients: ', maxPolyDegree+1
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs(1:maxPolyDegree+1)) 
+    allocate(legCoeffs(1:maxPolyDegree+1))
     legCoeffs(:) = 1.0_rk
-  
+
     ! Create the Chebyshev nodes on the interval [-1,+1]
     allocate(chebPnt(maxPolyDegree+1))
     do iPoint = 1, maxPolyDegree+1
       chebPnt(iPoint) = cos((iPoint-1.0_rk)*PI/maxPolyDegree);
       !write(*,*) 'Lobatto-Chebyshev-Point', iPoint, chebPnt(iPoint)
     end do
-  
+
     ! define the reference results for the point values (Chebyshev nodes)
     allocate( legValChebPnt(maxPolyDegree+1,maxPolyDegree+1) )
     legValChebPnt(:,:) = legendre_1D(chebPnt, maxPolyDegree)
@@ -73,23 +74,24 @@ subroutine ply_check_legToPnt(power, res)
       refVal(:) = refVal(:) + legValChebPnt(iPoly,:) * legCoeffs(iPoly)
     end do
     write(logUnit(10),*) 'Finished'
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree, &
       &                   nIndeps       = 1,             &
       &                   fpt           = fpt,           &
-      &                   lobattoPoints = .true.         )
-  
+      &                   lobattoPoints = .true.,        &
+      &                   bu            = bu             )
+
     ! now transform to the Chebyshev nodes
-    allocate(pntVal(1:maxPolyDegree+1)) 
+    allocate(pntVal(1:maxPolyDegree+1))
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt( fpt = fpt, legCoeffs = legCoeffs, pntVal = pntVal, &
-      &                nIndeps = 1                                        )
+      &                nIndeps = 1, bu = bu                               )
     write(logUnit(10),*) 'Finished'
-  
+
     !!do iPoint = 1, maxPolyDegree+1
     !!  write(*,*) 'Point: ', chebPnt(iPoint), &
-    !!           & ' FPT: ', pntVal(iPoint), & 
+    !!           & ' FPT: ', pntVal(iPoint), &
     !!           & ' Ref.: ', refVal(iPoint), &
     !!           & ' error: ', pntVal(iPoint)-refVal(iPoint)
     !!end do
diff --git a/utests/with_fftw/ply_fpt_test.f90 b/utests/with_fftw/ply_fpt_test.f90
index 12985b9..e6d1f51 100644
--- a/utests/with_fftw/ply_fpt_test.f90
+++ b/utests/with_fftw/ply_fpt_test.f90
@@ -6,7 +6,7 @@ program ply_fpt_test
   use tem_logging_module,       only: logUnit
   use ply_legFpt_module,        only: ply_init_legFpt, &
     &                                 ply_legFpt_type, &
-    &                                 ply_legToPnt
+    &                                 ply_legToPnt, ply_legFpt_bu_type
   use ply_modg_basis_module,    only: legendre_1D
   use tem_general_module,       only: tem_general_type, tem_start
 
@@ -58,6 +58,7 @@ subroutine ply_check_legToPnt(blocksize, maxpolydegree, res)
     real(kind=rk), allocatable :: chebPnt(:)
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
 
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
@@ -89,13 +90,14 @@ subroutine ply_check_legToPnt(blocksize, maxpolydegree, res)
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree, &
       &                   nIndeps       = 1,             &
       &                   fpt           = fpt,           &
-      &                   blocksize     = blocksize      )
+      &                   blocksize     = blocksize,     &
+      &                   bu            = bu             )
 
     ! now transform to the Chebyshev nodes
     allocate(pntVal(1:maxPolyDegree+1))
     write(logUnit(10),*) 'Calculating FPT ...'
     call ply_legToPnt( fpt = fpt, legCoeffs = legCoeffs, &
-      &                pntVal = pntVal, nIndeps = 1      )
+      &                pntVal = pntVal, nIndeps = 1, bu = bu  )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, maxPolyDegree+1
diff --git a/utests/with_fftw/ply_ifpt_3D_singVar_lobattoNodes_test.fpp b/utests/with_fftw/ply_ifpt_3D_singVar_lobattoNodes_test.fpp
index 63cfefe..644a17b 100644
--- a/utests/with_fftw/ply_ifpt_3D_singVar_lobattoNodes_test.fpp
+++ b/utests/with_fftw/ply_ifpt_3D_singVar_lobattoNodes_test.fpp
@@ -6,7 +6,8 @@ program ply_ifpt_3D_singVar_lobattoNodes_test
   use tem_param_module,         only: PI
   use tem_logging_module,       only: logUnit
   use tem_aux_module,           only: tem_abort
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_pntToLeg_3D
   use ply_modg_basis_module,    only: evalLegendreTensPoly
   use ply_dof_module,           only: Q_space
@@ -48,6 +49,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -119,11 +121,13 @@ contains
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxPolyDegree+1)**2, &
       &                   fpt           = fpt,                  &
-      &                   lobattoPoints = .true.                )
+      &                   lobattoPoints = .true.,               &
+      &                   bu            = bu)
 
     ! now transform to the Chebyshev nodes
     write(logUnit(10),*) 'Calculating FPT ...'
-    call ply_pntToLeg_3D( fpt = fpt, pntVal = pntVal, legCoeffs = legCoeffs )
+    call ply_pntToLeg_3D( fpt = fpt, pntVal = pntVal, legCoeffs = legCoeffs, &
+        &                 bu  = bu                                           )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_ifpt_3D_singVar_test.fpp b/utests/with_fftw/ply_ifpt_3D_singVar_test.fpp
index ed82cc8..adbc246 100644
--- a/utests/with_fftw/ply_ifpt_3D_singVar_test.fpp
+++ b/utests/with_fftw/ply_ifpt_3D_singVar_test.fpp
@@ -7,7 +7,8 @@ program ply_ifpt_3D_singVar_test
   use tem_logging_module,       only: logUnit
   use tem_aux_module,           only: tem_abort
   use tem_general_module,       only: tem_general_type, tem_start
-  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT
+  use ply_legFpt_module,        only: ply_legFpt_type, ply_init_legFPT, &
+    &                                 ply_legFpt_bu_type
   use ply_legFpt_3D_module,     only: ply_pntToLeg_3D
   use ply_modg_basis_module,    only: legendre_1D
 
@@ -48,6 +49,7 @@ contains
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     real(kind=rk) :: rfac
     type(ply_legFpt_type) :: fpt
+    type(ply_legFpt_bu_type) :: bu
     integer, allocatable :: rand_seed(:)
     integer :: nSeeds
 
@@ -113,11 +115,13 @@ contains
     ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree,        &
       &                   nIndeps       = (maxPolyDegree+1)**2, &
-      &                   fpt           = fpt                   )
+      &                   fpt           = fpt,                  &
+      &                   bu            = bu                    )
 
     ! now transform to the Chebyshev nodes
     write(logUnit(10),*) 'Calculating FPT ...'
-    call ply_pntToLeg_3D( fpt = fpt, pntVal = pntVal, legCoeffs = legCoeffs )
+    call ply_pntToLeg_3D( fpt = fpt, pntVal = pntVal, legCoeffs = legCoeffs, &
+        &                 bu  = bu  )
     write(logUnit(10),*) 'Finished'
 
     !!do iPoint = 1, (maxPolyDegree+1)**3
diff --git a/utests/with_fftw/ply_ifpt_lobattoNodes_test.f90 b/utests/with_fftw/ply_ifpt_lobattoNodes_test.f90
index 8cc601f..f3d8240 100644
--- a/utests/with_fftw/ply_ifpt_lobattoNodes_test.f90
+++ b/utests/with_fftw/ply_ifpt_lobattoNodes_test.f90
@@ -8,7 +8,7 @@ program ply_ifpt_lobattoNodes_test
   use tem_aux_module,           only: tem_abort
   use tem_general_module,       only: tem_general_type, tem_start
   use ply_legFpt_module,        only: ply_init_legFpt, ply_legFpt_type, &
-    &                                 ply_pntToLeg
+    &                                 ply_pntToLeg, ply_legFpt_bu_type
   use ply_modg_basis_module,    only: legendre_1D
 
   implicit none
@@ -32,7 +32,7 @@ program ply_ifpt_lobattoNodes_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
   call fin_env()
 
 contains
@@ -46,23 +46,24 @@ subroutine ply_check_pntToLeg(power, res)
     real(kind=rk), allocatable :: chebPnt(:)
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     type(ply_legFpt_type) :: fpt
-  
+    type(ply_legFpt_bu_type) :: bu
+
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
     maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
     write(logUnit(10),*) '------- Number of Legendre coefficients: ', maxPolyDegree+1
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs(1:maxPolyDegree+1)) 
+    allocate(legCoeffs(1:maxPolyDegree+1))
     legCoeffs(:) = 1.0_rk
-  
+
     ! Create the Chebyshev nodes on the interval [-1,+1]
     allocate(chebPnt(maxPolyDegree+1))
     do iPoint = 1, maxPolyDegree+1
       chebPnt(iPoint) = cos((iPoint-1.0_rk)*PI/maxPolyDegree);
       !write(*,*) 'Cehbyshev point', iPoint, ' is at: ', chebPnt(iPoint)
     end do
-  
+
     ! define the point values (Lobatto-Chebyshev-nodes)
     allocate( legValChebPnt(maxPolyDegree+1,maxPolyDegree+1) )
     legValChebPnt(:,:) = legendre_1D(chebPnt, maxPolyDegree)
@@ -73,23 +74,24 @@ subroutine ply_check_pntToLeg(power, res)
       pntVal(:) = pntVal(:) + legValChebPnt(iPoly,:) * legCoeffs(iPoly)
     end do
     write(logUnit(10),*) 'Finished'
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree, &
       &                   nIndeps       = 1,             &
       &                   fpt           = fpt,           &
-      &                   lobattoPoints = .true.         )
-  
+      &                   lobattoPoints = .true.,        &
+      &                   bu            = bu             )
+
     ! now transform to the Legendre coefficients
-    allocate(legVal(1:maxPolyDegree+1)) 
+    allocate(legVal(1:maxPolyDegree+1))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
     call ply_pntToLeg( fpt = fpt, pntVal = pntVal, legCoeffs = legVal, &
-      &                nIndeps = 1 ) 
+      &                nIndeps = 1, bu = bu )
     write(logUnit(10),*) 'Finished'
-  
+
     !!do iPoly = 1, maxPolyDegree+1
     !!  write(*,*) 'Poly degree: ', iPoly, &
-    !!           & ' iFPT: ', legVal(iPoly), & 
+    !!           & ' iFPT: ', legVal(iPoly), &
     !!           & ' Ref.: ', legCoeffs(iPoly), &
     !!           & ' error: ', legVal(iPoly)-legCoeffs(iPoly)
     !!end do
diff --git a/utests/with_fftw/ply_ifpt_test.f90 b/utests/with_fftw/ply_ifpt_test.f90
index 334c42a..7971dcd 100644
--- a/utests/with_fftw/ply_ifpt_test.f90
+++ b/utests/with_fftw/ply_ifpt_test.f90
@@ -7,7 +7,7 @@ program ply_ifpt_test
   use tem_general_module,       only: tem_general_type, tem_start
   use ply_legFpt_module,        only: ply_init_legFpt, &
     &                                 ply_legFpt_type, &
-    &                                 ply_pntToLeg
+    &                                 ply_pntToLeg, ply_legFpt_bu_type
   use ply_modg_basis_module,    only: legendre_1D
 
   implicit none
@@ -31,7 +31,7 @@ program ply_ifpt_test
 
   if(res.lt.1e-08) then
     write(logUnit(1),*) 'PASSED'
-  end if 
+  end if
   call fin_env()
 
 contains
@@ -45,23 +45,24 @@ subroutine ply_check_pntToLeg(power, res)
     real(kind=rk), allocatable :: chebPnt(:)
     real(kind=rk), allocatable :: legValChebPnt(:,:)
     type(ply_legFpt_type) :: fpt
-  
+    type(ply_legFpt_bu_type) :: bu
+
     ! Define the maximal polynomial degree we want to calculate the
     ! bases exchange for.
     maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
     write(logUnit(10),*) '------- Number of Legendre coefficients: ', maxPolyDegree+1
-  
+
     ! Create the Legendre expansion coefficients
-    allocate(legCoeffs(1:maxPolyDegree+1)) 
+    allocate(legCoeffs(1:maxPolyDegree+1))
     legCoeffs(:) = 1.0_rk
-  
+
     ! Create the Chebyshev nodes on the interval [-1,+1]
     allocate(chebPnt(maxPolyDegree+1))
     do iPoint = 1, maxPolyDegree+1
       chebPnt(iPoint) = (-1.0_rk) * cos(PI/(maxPolyDegree+1)*((iPoint-1.0_rk)+1.0_rk/2.0_rk))
       !write(*,*) 'Cehbyshev point', iPoint, ' is at: ', chebPnt(iPoint)
     end do
-  
+
     ! define the point values (Chebyshev nodes)
     allocate( legValChebPnt(maxPolyDegree+1,maxPolyDegree+1) )
     legValChebPnt(:,:) = legendre_1D(chebPnt, maxPolyDegree)
@@ -72,21 +73,23 @@ subroutine ply_check_pntToLeg(power, res)
       pntVal(:) = pntVal(:) + legValChebPnt(iPoly,:) * legCoeffs(iPoly)
     end do
     write(logUnit(10),*) 'Finished'
-  
-    ! Init the FPT 
+
+    ! Init the FPT
     call ply_init_legFpt( maxPolyDegree = maxPolyDegree, &
       &                   nIndeps       = 1,             &
-      &                   fpt           = fpt            )
-  
+      &                   fpt           = fpt,           &
+      &                   bu            = bu             )
+
     ! now transform to the Legendre coefficients
-    allocate(legVal(1:maxPolyDegree+1)) 
+    allocate(legVal(1:maxPolyDegree+1))
     write(logUnit(10),*) 'Calculating inverse FPT ...'
-    call ply_pntToLeg( fpt = fpt, pntVal = pntVal, legCoeffs = legVal, nIndeps=1 ) 
+    call ply_pntToLeg( fpt = fpt, pntVal = pntVal, legCoeffs = legVal, &
+        &              nIndeps=1, bu = bu )
     write(logUnit(10),*) 'Finished'
-  
+
     !!do iPoly = 1, maxPolyDegree+1
     !!  write(*,*) 'Poly degree: ', iPoly, &
-    !!           & ' iFPT: ', legVal(iPoly), & 
+    !!           & ' iFPT: ', legVal(iPoly), &
     !!           & ' Ref.: ', legCoeffs(iPoly), &
     !!           & ' error: ', legVal(iPoly)-legCoeffs(iPoly)
     !!end do

From 2f1b355303591f77e7e031f4247ea268ea246e48 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Tue, 7 Jan 2020 13:04:57 +0100
Subject: [PATCH 06/21] Removed all OMP statements.

--HG--
branch : feature/openmp
---
 source/fpt/ply_chebPoint_module.f90       | 89 ++++-------------------
 source/fpt/ply_legFpt_2D_module.fpp       | 16 +---
 source/fpt/ply_legFpt_module.f90          | 12 ---
 source/ply_LegPolyProjection_module.f90   | 25 +------
 source/ply_fxt_module.f90                 | 35 ---------
 source/ply_l2p_module.f90                 |  4 -
 source/ply_leg_diff_module.fpp            | 24 ------
 source/ply_legser_module.f90              |  7 --
 source/ply_modg_basis_module.fpp          | 15 ----
 source/ply_poly_project_module.fpp        | 36 ---------
 source/ply_poly_transformation_module.f90 | 22 ------
 source/ply_sampling_module.fpp            |  5 --
 source/ply_sampling_varsys_module.f90     | 15 +---
 source/ply_space_integration_module.f90   | 43 +----------
 source/ply_split_element_module.f90       |  7 --
 source/ply_transfer_module.fpp            |  2 -
 16 files changed, 21 insertions(+), 336 deletions(-)

diff --git a/source/fpt/ply_chebPoint_module.f90 b/source/fpt/ply_chebPoint_module.f90
index 510bab0..4261984 100644
--- a/source/fpt/ply_chebPoint_module.f90
+++ b/source/fpt/ply_chebPoint_module.f90
@@ -63,17 +63,13 @@ subroutine ply_chebPoint_1D( nPoints, chebPnt1D )
     ! -------------------------------------------------------------------- !
     integer :: iPoint
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iPoint)
 
     allocate(chebPnt1D(nPoints))
-    !$OMP DO
+
     do iPoint = 1, nPoints
       chebPnt1D(iPoint) = -1.0_rk &
         & * cos( PI / nPoints * ( (iPoint - 1.0_rk) + 1.0_rk / 2.0_rk ) )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_chebPoint_1D
   ! ************************************************************************ !
@@ -94,16 +90,12 @@ subroutine ply_lobattoChebPoint_1D( nPoints, chebPnt1D )
     ! -------------------------------------------------------------------- !
     integer :: iPoint
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iPoint)
 
     allocate(chebPnt1D(nPoints))
-    !$OMP DO
+
     do iPoint = 1, nPoints
       chebPnt1D(iPoint) = cos( ( iPoint - 1.0_rk ) * PI / ( nPoints - 1.0_rk ) )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_lobattoChebPoint_1D
   ! ************************************************************************ !
@@ -120,14 +112,13 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
+
     nquadpoints = num_intp_per_direction**3
+
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
-    pointNumber = 1
-    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -140,9 +131,6 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
         end do
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube
   ! ************************************************************************ !
@@ -160,14 +148,14 @@ subroutine create_volume_lobattocheb_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
+
     nquadpoints = num_intp_per_direction**3
+
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -180,9 +168,6 @@ subroutine create_volume_lobattocheb_points_cube( num_intp_per_direction, &
         end do
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube
   ! ************************************************************************ !
@@ -200,16 +185,14 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
-
 
     nquadpoints = num_intp_per_direction**2
+
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         ! here we build all possible combinations of the one-dimensional
@@ -220,9 +203,6 @@ subroutine create_volume_cheb_points_cube_2d( num_intp_per_direction, points )
         pointNumber = pointNumber + 1
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube_2d
   ! ************************************************************************ !
@@ -242,14 +222,14 @@ subroutine create_volume_lobattocheb_points_cube_2d(num_intp_per_direction, poin
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
+
     nquadpoints = num_intp_per_direction**2
+
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         ! here we build all possible combinations of the one-dimensional
@@ -260,9 +240,6 @@ subroutine create_volume_lobattocheb_points_cube_2d(num_intp_per_direction, poin
         pointNumber = pointNumber + 1
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube_2d
   ! ************************************************************************ !
@@ -278,13 +255,12 @@ subroutine create_volume_cheb_points_cube_1d( num_intp_per_direction, points )
     integer :: pointNumber
     real(kind=rk), allocatable :: chebPnt1D(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     allocate(points(num_intp_per_direction,3))
+
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do i = 1, num_intp_per_direction
       ! here we build all possible combinations of the one-dimensional
       ! points to get the three dimensional values.
@@ -293,9 +269,6 @@ subroutine create_volume_cheb_points_cube_1d( num_intp_per_direction, points )
       points(pointNumber, 3) = 0.0_rk
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_cheb_points_cube_1d
   ! ************************************************************************ !
@@ -314,13 +287,12 @@ subroutine create_volume_lobattocheb_points_cube_1d( num_intp_per_direction, &
     integer :: pointNumber
     real(kind=rk), allocatable :: chebPnt1D(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     allocate(points(num_intp_per_direction,3))
+
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
 
     pointNumber = 1
-    !$OMP DO
     do i = 1, num_intp_per_direction
       ! here we build all possible combinations of the one-dimensional
       ! points to get the three dimensional values.
@@ -329,9 +301,6 @@ subroutine create_volume_lobattocheb_points_cube_1d( num_intp_per_direction, &
       points(pointNumber, 3) = 0.0_rk
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine create_volume_lobattocheb_points_cube_1d
   ! ************************************************************************ !
@@ -357,9 +326,9 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     nquadpoints = num_intp_per_direction**2
+
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
@@ -368,7 +337,6 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = (-1.0_rk)**align
@@ -377,9 +345,7 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -388,9 +354,7 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(3) ! face in z direction, z coord is fixes
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber , 1) = chebPnt1D(i)
@@ -399,14 +363,11 @@ subroutine create_surface_cheb_points_cube( num_intp_per_direction, points, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube: unknown ' &
         & // 'face direction'                                              )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_cheb_points_cube
   ! ************************************************************************ !
 
@@ -431,9 +392,9 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     nquadpoints = num_intp_per_direction**2
+
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
@@ -442,7 +403,6 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = (-1.0_rk)**align
@@ -451,9 +411,7 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -462,9 +420,7 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case(3) ! face in z direction, z coord is fixes
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           points(pointNumber, 1) = chebPnt1D(i)
@@ -473,14 +429,11 @@ subroutine create_surface_lobattocheb_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_lobattocheb_points_cube:' &
         & // ' unknown face direction'                                   )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_lobattocheb_points_cube
   ! ************************************************************************ !
 
@@ -504,9 +457,9 @@ subroutine create_surface_cheb_points_cube_2d( num_intp_per_direction, &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     nquadpoints = num_intp_per_direction
+
     allocate(points(nquadpoints,3))
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
@@ -515,30 +468,24 @@ subroutine create_surface_cheb_points_cube_2d( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = (-1.0_rk)**align
         points(pointNumber, 2) = chebPnt1D(i)
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = chebPnt1D(i)
         points(pointNumber, 2) = (-1.0_rk)**align
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube_2d:' &
         & // ' unknown face direction'                               )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_cheb_points_cube_2d
   ! ************************************************************************ !
 
@@ -562,9 +509,9 @@ subroutine create_surface_lobattocheb_points_cube_2d( &
     real(kind=rk), allocatable :: chebPnt1D(:)
     integer :: nquadpoints
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, pointNumber)
 
     nquadpoints = num_intp_per_direction
+
     allocate(points(nquadpoints,3))
 
     call ply_lobattoChebPoint_1D( num_intp_per_direction, chebPnt1D )
@@ -573,30 +520,24 @@ subroutine create_surface_lobattocheb_points_cube_2d( &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = (-1.0_rk)**align
         points(pointNumber, 2) = chebPnt1D(i)
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         points(pointNumber, 1) = chebPnt1D(i)
         points(pointNumber, 2) = (-1.0_rk)**align
         points(pointNumber, 3) = 0.0_rk
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
     case default
       call tem_abort( 'ERROR in create_surface_cheb_points_cube_2d:' &
         & // ' unknown face direction'                               )
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine create_surface_lobattocheb_points_cube_2d
   ! ************************************************************************ !
 
diff --git a/source/fpt/ply_legFpt_2D_module.fpp b/source/fpt/ply_legFpt_2D_module.fpp
index 9293b76..ef9629d 100644
--- a/source/fpt/ply_legFpt_2D_module.fpp
+++ b/source/fpt/ply_legFpt_2D_module.fpp
@@ -80,13 +80,13 @@ contains
    real(kind=rk), dimension(:), allocatable :: alph
    real(kind=rk), dimension(:), allocatable :: gam
    ! --------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iAlph, nIndeps)
 
    striplen = fpt%legToChebParams%striplen
    n = fpt%legToChebParams%n
 
    allocate(alph(min(striplen, n)*n))
    allocate(gam(min(striplen, n)*n))
+
    ! original layout (n = 3):
    !  1  2  3
    !  4  5  6
@@ -96,7 +96,7 @@ contains
    !  1  4  7
    !  2  5  8
    !  3  6  9
-   !$OMP DO
+
    yStripLoop: do iStrip = 1, n, striplen
      ! iAlph is the index of the first element in a line for the transformation
      ! in y-direction.
@@ -116,10 +116,8 @@ contains
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
    end do yStripLoop
-   !$OMP END DO
 
    ! x-direction
-   !$OMP DO
    xStripLoop: do iStrip = 1, n, striplen
      do iAlph = iStrip, min(iStrip+striplen-1, n)
        alph((iAlph-iStrip)*n+1:(iAlph-iStrip+1)*n) = pntVal(iAlph::n) !ztrafo
@@ -136,9 +134,6 @@ contains
      pntVal((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n) = gam(1:nIndeps*n)
 
    end do xStripLoop
-   !$OMP END DO
-
-   !$OMP END PARALLEL
 
   end subroutine ply_legToPnt_2D_singVar
   ! ************************************************************************ !
@@ -193,7 +188,6 @@ contains
     real(kind=rk), dimension(:), allocatable :: alph
     real(kind=rk), dimension(:), allocatable :: gam
    ! --------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(nIndeps, iStrip, iAlph)
 
     striplen = fpt%chebToLegParams%striplen
     n = fpt%legToChebParams%n
@@ -202,7 +196,6 @@ contains
     allocate(alph(min(striplen, n)*n))
     allocate(gam(min(striplen, n)*n))
 
-    !$OMP DO
     yStripLoop: do iStrip = 1, n, striplen
       do iAlph = iStrip, min(iStrip+striplen-1, n) !y_Trafo
         alph((iAlph-iStrip)*n+1:(iAlph-iStrip+1)*n) = pntVal(iAlph::n)
@@ -220,10 +213,8 @@ contains
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
     end do yStripLoop ! iStrip
-    !$OMP END DO
 
     ! x-direction
-    !$OMP DO
     xStripLoop: do iStrip = 1,n,striplen
       do iAlph = iStrip, min(iStrip+striplen-1, n)
         !ztrafo
@@ -241,9 +232,6 @@ contains
       legCoeffs((iStrip-1)*n+1 : (iStrip+nIndeps-1)*n)  = gam(1:nIndeps*n)
 
     end do xStripLoop
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg_2D_singVar
   ! ************************************************************************ !
diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 0518646..55d238d 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -276,12 +276,10 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_points) then
 
-      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
@@ -296,11 +294,9 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
           &                    cheb,                 &
           &                    pntVal(iDof:iDof+n-1) )
       end do
-      !$OMP END DO
 
     else
 
-      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
@@ -314,10 +310,8 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
           &                    cheb,                 &
           &                    pntVal(iDof:iDof+n-1) )
       end do
-      !$OMP END DO
 
     end if ! lobattoPoints
-    !$OMP END PARALLEL
 
   end subroutine ply_legToPnt
   ! ************************************************************************ !
@@ -339,13 +333,11 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb, normFactor)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_Points) then
 
       normFactor = 1.0_rk / real(n,kind=rk)
-      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call fftw_execute_r2r( fpt%planPntToCheb,     &
           &                    pntVal(iDof:iDof+n-1), &
@@ -360,12 +352,10 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
           &                  alph   = cheb,                     &
           &                  params = fpt%ChebToLegParams       )
       end do
-      !$OMP END DO
 
     else
 
       normFactor = 0.5_rk / real(n-1,kind=rk)
-      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call fftw_execute_r2r( fpt%planPntToCheb,     &
           &                    pntVal(iDof:iDof+n-1), &
@@ -380,10 +370,8 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
           &                  alph   = cheb,                     &
           &                  params = fpt%ChebToLegParams       )
       end do
-      !$OMP END DO
 
     end if ! lobattoPoints
-    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg
   ! ************************************************************************ !
diff --git a/source/ply_LegPolyProjection_module.f90 b/source/ply_LegPolyProjection_module.f90
index 815bf4a..6a8341a 100644
--- a/source/ply_LegPolyProjection_module.f90
+++ b/source/ply_LegPolyProjection_module.f90
@@ -159,8 +159,6 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     integer :: nChildDofs, oneDof
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, nDofs, nComponents, nChilds, nChildDofs, workDat)
-
     if (subsamp%projectionType.ne.ply_QLegendrePoly_prp) then
       call tem_abort( 'ERROR in ply_QPolyProjection: subsampling is ' &
         & // 'only implemented for Q-Legendre-Polynomials'            )
@@ -170,7 +168,6 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
     allocate(newVarDofs(nVars))
     allocate(newMeshData(nVars))
 
-    !$OMP DO
     varLoop: do iVar=1,nVars
       nDofs = varDofs(iVar)
       nComponents = varcomps(iVar)
@@ -244,9 +241,6 @@ subroutine ply_QPolyProjection( subsamp, dofReduction, tree, meshData,   &
       deallocate(projection_oneDof%projCoeff)
 
     end do varLoop
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_QPolyProjection
   ! ************************************************************************ !
@@ -289,8 +283,6 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
     real(kind=rk) :: dimexp
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iChild, iParentDof, iChildDof, xShift, yShift, zShift)
-
     select case(dofType)
     case(ply_QLegendrePoly_prp)
       allocate(projection%projCoeff(nDofs, nChildDofs, nChilds))
@@ -303,7 +295,6 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
       projCoeffOneDim = ply_QLegOneDimCoeff( nint(nDofs**dimexp),     &
         &                                    nint(nChildDofs**dimexp) )
 
-      !$OMP DO
       ! Loop over the children of this element
       childLoop: do iChild = 1, nChilds
 
@@ -356,15 +347,14 @@ subroutine ply_initQLegProjCoeff( doftype, nDofs, ndims, nChilds, &
           end do childDofLoop
         end do parentDofLoop
       end do childLoop
-      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in ply_initProjCoeff: initialization of '      &
         & // 'projection coefficients for subsampling is implemented only ' &
         & // 'for Q-Legendre polynomials'                                   )
     end select
-    !$OMP END PARALLEL
     deallocate(projCoeffOneDim)
+
   end subroutine ply_initQLegProjCoeff
   ! ************************************************************************ !
 
@@ -609,8 +599,6 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iParentElem, iElem, lowElemIndex, upElemIndex, iChild, lowChildIndex, upChildIndex)
-
     nChilds = 2**ndims
     nElems = tree%nElems
     nElemsToRefine = count(new_refine_tree)
@@ -627,7 +615,6 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
 
     if (subsamp%sampling_lvl > 1) then
 
-      !$OMP DO
       elementLoop: do iParentElem=1,nParentElems
         ! Check if the parent cell was already refined...
         if (refine_tree(iParentElem)) then
@@ -719,11 +706,9 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
         end if
 
       end do elementLoop
-      !$OMP END DO
 
     else
 
-      !$OMP DO
       elemLoop: do iElem=1,nElems
         if (new_refine_tree(iElem)) then
           allocate(childData(nChildDofs*nChilds*nComponents))
@@ -774,11 +759,8 @@ subroutine ply_subsampleData( tree, meshData, nDofs, nChildDofs,          &
           deallocate(childData)
         end if
       end do elemLoop
-      !$OMP END DO
     end if
 
-    !$OMP END PARALLEL
-
   end subroutine ply_subsampleData
   ! ************************************************************************ !
 
@@ -816,11 +798,9 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
     integer :: childDof_pos, parentDof_pos
     real(kind=rk) :: projCoeff
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iChild, iParentDof, iChildDof, iComp, projCoeff, childDof_pos, parentDof_pos)
 
     childData(:) = 0.0_rk
 
-    !$OMP DO
     childLoop: do iChild = 1, nChilds
       parentDofLoop: do iParentDof = 1, nParentDofs
         childDofLoop: do iChildDof = 1, nChildDofs
@@ -840,9 +820,6 @@ subroutine ply_projDataToChild( parentData, nParentDofs, nChildDofs,        &
         end do childDofLoop
       end do parentDofLoop
     end do childLoop
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_projDataToChild
   ! ************************************************************************ !
diff --git a/source/ply_fxt_module.f90 b/source/ply_fxt_module.f90
index 3272c71..6173bd0 100644
--- a/source/ply_fxt_module.f90
+++ b/source/ply_fxt_module.f90
@@ -232,12 +232,10 @@ subroutine ply_fxt_m2n_2D( fxt, modal_data, nodal_data, oversamp_degree )
     ! -------------------------------------------------------------------- !
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, iColumn)
 
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
 
-    !$OMP DO
     do iLine = 1, oversamp_degree+1
       lb = (iLine-1) * (oversamp_degree+1) + 1
       ub = lb + oversamp_degree
@@ -245,20 +243,15 @@ subroutine ply_fxt_m2n_2D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   modal_data = modal_data(lb:ub), &
         &                   nodal_data = nodal_data(lb:ub)  )
     end do
-    !$OMP END DO
 
-    !$OMP DO
     do iColumn = 1, oversamp_degree+1
       lb = iColumn
       call fxtf_flptld_m2n( flpt       = fxt%flpt,                             &
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1), &
         &                   nodal_data = modal_data(lb:msq:oversamp_degree+1)  )
     end do
-    !$OMP END DO
     nodal_data = modal_data
 
-    !$OMP END PARALLEL
-
   end subroutine ply_fxt_m2n_2D
   ! ************************************************************************ !
 
@@ -279,8 +272,6 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
     real(kind=rk), pointer :: tmp_in(:), tmp_out(:)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, tmp_in, tmp_out, iColumn)
-
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
     nTotalDofs =  (oversamp_degree+1)**3
@@ -289,7 +280,6 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
     tmp_in = -42
     tmp_out = -42
 
-    !$OMP DO
     ! The loop for msq stripes for independent x Dir evaluations
     do iLine = 1, msq
       lb = (iLine-1) * (oversamp_degree+1) + 1
@@ -300,9 +290,7 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   nodal_data = tmp_out   )
       nodal_data(lb:ub) = tmp_out
     end do
-    !$OMP END DO
 
-    !$OMP DO
     ! The loop for msq stripes for independent y Dir evaluations
     do iColumn = 1, msq
       lb = int( (iColumn-1 ) / nModesPerDim ) * msq &
@@ -313,20 +301,15 @@ subroutine ply_fxt_m2n_3D( fxt, modal_data, nodal_data, oversamp_degree )
         &                   modal_data = nodal_data(lb:ub:nModesPerDim), &
         &                   nodal_data = modal_data(lb:ub:nModesPerDim)  )
     end do
-    !$OMP END DO
 
     ! The loop for msq stripes for independent z Dir evaluations
     ub = nTotalDofs
-    !$OMP DO
     do iColumn = 1, msq
       lb = iColumn
       call fxtf_flptld_m2n( flpt       = fxt%flpt,              &
         &                   modal_data = modal_data(lb:ub:msq), &
         &                   nodal_data = nodal_data(lb:ub:msq)  )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_fxt_m2n_3D
   ! ************************************************************************ !
@@ -371,12 +354,10 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
     ! -------------------------------------------------------------------- !
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, lb, ub, iColumn)
 
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
 
-    !$OMP DO
     do iLine = 1, oversamp_degree+1
       lb = (iLine-1) * (oversamp_degree+1) + 1
       ub = lb + oversamp_degree
@@ -384,21 +365,15 @@ subroutine ply_fxt_n2m_2D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = nodal_data(lb:ub), &
         &                   modal_data = modal_data(lb:ub)  )
     end do
-    !$OMP END DO
-
 
-    !$OMP DO
     do iColumn = 1, oversamp_degree+1
       lb = iColumn
       call fxtf_flptld_n2m( flpt       = fxt%flpt,                             &
         &                   nodal_data = modal_data(lb:msq:oversamp_degree+1), &
         &                   modal_data = nodal_data(lb:msq:oversamp_degree+1)  )
     end do
-    !$OMP END DO
     modal_data = nodal_data
 
-    !$OMP END PARALLEL
-
   end subroutine ply_fxt_n2m_2D
   ! ************************************************************************ !
 
@@ -417,13 +392,10 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
     integer :: ub, lb, iLine, iColumn, nModesPerDim, msq, ntotalDofs
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iLine, iColumn, ub, lb)
-
     nModesPerDim = (oversamp_degree+1)
     msq = nModesPerDim*nModesPerDim
     nTotalDofs =  (oversamp_degree+1)**3
 
-    !$OMP DO
     ! The loop for msq stripes for independent x Dir evaluations
     do iLine = 1, msq
       lb = (iLine-1) * (oversamp_degree+1) + 1
@@ -432,9 +404,7 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = nodal_data(lb:ub), &
         &                   modal_data = modal_data(lb:ub)  )
     end do
-    !$OMP END DO
 
-    !$OMP DO
     ! The loop for msq stripes for independent y Dir evaluations
     do iColumn = 1, msq
       lb = int( (iColumn-1) / nModesPerDim ) * msq &
@@ -445,20 +415,15 @@ subroutine ply_fxt_n2m_3D( fxt, nodal_data, modal_data, oversamp_degree )
         &                   nodal_data = modal_data(lb:ub:nModesPerDim), &
         &                   modal_data = nodal_data(lb:ub:nModesPerDim)  )
     end do
-    !$OMP END DO
 
     ! The loop for msq stripes for independent z Dir evaluations
     ub = nTotalDofs
-    !$OMP DO
     do iColumn = 1, msq
       lb = iColumn
       call fxtf_flptld_n2m( flpt       = fxt%flpt,              &
         &                   nodal_data = nodal_data(lb:ub:msq), &
         &                   modal_data = modal_data(lb:ub:msq)  )
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_fxt_n2m_3D
   ! ************************************************************************ !
diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index e87e6d6..cbbe120 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -271,10 +271,8 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iStrip, iRow, iCell, iCol)
     if (nDofs > 1) then
 
-      !$OMP DO
       do iStrip=0,nIndeps-1,vlen
 
         ! Calculate the upper bound of the current strip
@@ -297,7 +295,6 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
 
         end do ! iRow = 1, nRows
       end do ! iStrip
-      !$OMP END DO
 
     else
 
@@ -305,7 +302,6 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
 
     end if
 
-    !$OMP END PARALLEL
   end subroutine ply_l2_projection
   ! ************************************************************************ !
 
diff --git a/source/ply_leg_diff_module.fpp b/source/ply_leg_diff_module.fpp
index c64985c..18e4b97 100644
--- a/source/ply_leg_diff_module.fpp
+++ b/source/ply_leg_diff_module.fpp
@@ -67,8 +67,6 @@ contains
     integer :: leg(3), iDeg, iDeg1, iDeg2, iDeg3, DV(3)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, dofPosPrev, iDeg2, iDeg3, iVar,leg, dofPos, iDeg)
-
     if (present(dirVec)) then
       DV = dirvec
     else
@@ -81,7 +79,6 @@ contains
       endif
     endif
 
-    !$OMP DO
     do iDeg = 1, (mpd+1)**2
       iDeg1 = (iDeg-1)/(mpd+1) + 1      !! do IDeg1 = 1, mPd+1
       iDeg2 = iDeg - (iDeg1-1)*(mpd+1)  !! do IDeg2 = 1, mPd=1   !! iDeg2 = mod(iDeg-1,mpd+1)+1
@@ -140,9 +137,7 @@ contains
         end do
       end do
     end do
-    !$OMP END DO
 
-    !$OMP DO
     ! Scale the results due to the Jacobians of the mappings
     do dofpos=1,(mpd+1)**3
       ideg3 = (dofpos-1)/(mpd+1)**2 + 1
@@ -154,7 +149,6 @@ contains
         &                         * (2.0_rk/elemLength)         &
         &                         * (2.0_rk*leg(iDir) - 1.0_rk)
     end do
-    !$OMP END DO
 
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    ! Uncollapsed version of the scaling !
@@ -174,8 +168,6 @@ contains
 !!     end do
 !!   end do
 
-    !$OMP END PARALLEL
-
   end subroutine calcDiff_leg_normal
   ! ************************************************************************ !
 
@@ -543,8 +535,6 @@ contains
     integer :: leg(2), iDeg1, iDeg2, DV(2)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDeg1, iDeg2, leg, dofPos, dofPosPrev, iVar)
-
     if (present(dirVec)) then
       DV = dirvec
     else
@@ -555,7 +545,6 @@ contains
       endif
     endif
 
-    !$OMP DO
     do iDeg1 = 1, mPd+1
       iDeg2 =  mPd+1
       leg = (/iDeg1, iDeg2/)
@@ -612,9 +601,7 @@ contains
         end do
       end do
     end do
-    !$OMP END DO
 
-    !$OMP DO
     ! Scale the results due to the Jacobians of the mappings
     do iDeg1 = 1, mPd+1
       do iDeg2 = 1, mPd+1
@@ -626,9 +613,6 @@ contains
           &                         * (2.0_rk*leg(iDir) - 1.0_rk)
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine calcDiff_leg_2d_normal
   ! ************************************************************************ !
@@ -735,8 +719,6 @@ contains
     integer :: dofPos, dofPosPrev, dofPos2Prev
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(dofPos, iDegX, dofPosPrev, dofPos2Prev)
-
     ! Build the derivative in x direction
     dofPos = 1 + maxPolyDegree
     legCoeffsDiff(dofPos,:) = 0.0_rk
@@ -744,7 +726,6 @@ contains
       dofPosPrev = dofPos
       dofPos = 1 + (maxPolyDegree-1)
       legCoeffsDiff(dofPos,:) = legCoeffs(dofPosPrev,:)
-      !$OMP DO
       do iDegX = maxPolyDegree-1, 1, -1
         dofPos = 1 + (iDegX-1)
         dofPosPrev = 1 + (iDegX)
@@ -752,19 +733,14 @@ contains
         legCoeffsDiff(dofPos,:) = legCoeffsDiff(dofPos2Prev,:) &
           &                         + legCoeffs(dofPosPrev,:)
       end do
-      !$OMP END DO
     end if
 
-    !$OMP DO
     do iDegX = 1, maxPolyDegree+1
       dofPos = 1 + (iDegX-1)
       legCoeffsDiff(dofPos,:) = legCoeffsDiff(dofPos,:)     &
         &                         * (2.0_rk/elemLength)     &
         &                         * (2.0_rk*iDegX - 1.0_rk)
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine calcDiff_leg_1d
   ! ************************************************************************ !
diff --git a/source/ply_legser_module.f90 b/source/ply_legser_module.f90
index dc3ad15..0649a2a 100644
--- a/source/ply_legser_module.f90
+++ b/source/ply_legser_module.f90
@@ -41,23 +41,19 @@ subroutine legser(A, B, n)
     real(kind=rk) :: ak, al, bb, c, d
     integer :: k, l, ll
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, d, c, l, ll, al, bb, ak)
 
     ak = 0.0_rk
 
     ! Calculation of the first Legendre coefficient
     b(1) = 0.5_rk * a(1)
-    !$OMP DO
     do k=3,n,2
       ak = ak + 2.0_rk
       b(1) = b(1) - a(k)/(ak*ak - 1.0_rk)
     end do
-    !$OMP END DO
     c = 2.0_rk / 3.0_rk
     al = 0.0_rk
 
     ! Start main loop (remaining Legendre coefficients)
-    !$OMP DO
     do l=2,n
       ! Calculation of the Lth coefficient
       ll = l+2
@@ -77,9 +73,6 @@ subroutine legser(A, B, n)
         &   / ( (al+al+3.0_rk)*(al+al+2.0_rk) )
       b(l) = (al+0.5_rk)*bb
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine legser
   ! ************************************************************************ !
diff --git a/source/ply_modg_basis_module.fpp b/source/ply_modg_basis_module.fpp
index 0ecdccb..5812570 100644
--- a/source/ply_modg_basis_module.fpp
+++ b/source/ply_modg_basis_module.fpp
@@ -192,8 +192,6 @@ contains
     integer :: iFunc, jFunc
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(jFunc, iFunc)
-
     allocate(GaussPoints(nPoints))
     allocate(GaussPoints_left(nPoints))
     allocate(GaussPoints_right(nPoints))
@@ -233,7 +231,6 @@ contains
 
     allocate( integral%anz_anzShift(1:nFunc, 1:nFunc, 2))
 
-    !$OMP DO
     !loop over anzatz functions
     do jFunc = 1, nFunc
       do iFunc = 1, nFunc
@@ -255,9 +252,6 @@ contains
 
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine init_modg_covolumeCoeffs
   ! ************************************************************************ !
@@ -457,7 +451,6 @@ contains
     real(kind=rk) :: n_q
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iAns, iAnsX, iAnsY, iAnsZ, n_q, ansPos)
 
     ! allocate the output array
     select case(basisType)
@@ -484,7 +477,6 @@ contains
       polyValX(2,:) = coords(:,1)
       polyValY(2,:) = coords(:,2)
       polyValZ(2,:) = coords(:,3)
-      !$OMP DO
       ! ... higher order polynomials are build recursively
       do iAns = 3, maxPolyDegree+1
         n_q = 1.0_rk / real(iAns-1,kind=rk)
@@ -510,13 +502,11 @@ contains
           &                   * polyValZ(iAns-2,:) )  &
           &                 *n_q
       end do
-      !$OMP END DO
     end if
 
     ! Now, build the complete point value.
     select case(basisType)
       case(Q_space)
-        !$OMP DO
         do iAnsX = 1, maxPolyDegree+1
           do iAnsY = 1, maxPolyDegree+1
             do iAnsZ = 1, maxPolyDegree+1
@@ -528,24 +518,19 @@ contains
             end do
           end do
         end do
-        !$OMP END DO
       case(P_space)
         iAnsX = 1
         iAnsY = 1
         iAnsZ = 1
 ?? copy :: getDofsPTens(maxPolyDegree, ansPosMax)
-        !$OMP DO
         do ansPos = 1, ansPosMax
           polyVal(ansPos, :) = polyValX(iAnsX,:)     &
             &                    * polyValY(iAnsY,:) &
             &                    * polyValZ(iAnsZ,:)
 ?? copy :: nextModgCoeffPTens(iAnsX, iAnsY, iAnsZ)
         end do
-        !$OMP END DO
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine evalLegendreTensPoly
   ! ************************************************************************ !
 
diff --git a/source/ply_poly_project_module.fpp b/source/ply_poly_project_module.fpp
index 293f2e2..5a1effe 100644
--- a/source/ply_poly_project_module.fpp
+++ b/source/ply_poly_project_module.fpp
@@ -525,8 +525,6 @@ contains
     !--------------------------------------------------------------------------!
     integer :: iVar
     !--------------------------------------------------------------------------!
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar)
-
 
     select case(trim(me%kind))
     case ('l2p')
@@ -536,29 +534,23 @@ contains
       ! additional summation
       select case(dim)
       case (1)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_1D( trafo = me%body_1D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
-        !$OMP END DO
       case (2)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_2D( trafo = me%body_2D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
-        !$OMP END DO
       case (3)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_3D( trafo = me%body_3D%l2p%leg2node, &
             &                    projected = nodal_data(:,iVar),  &
             &                    original  = modal_data(:,iVar)   )
         end do
-        !$OMP END DO
       end select
 
     case ('fpt')
@@ -575,50 +567,40 @@ contains
           &                   legCoeffs = modal_data,     &
           &                   nVars     = nVars           )
       case (1)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_LegToPnt( fpt       = me%body_1d%fpt,     &
             &                pntVal    = nodal_data(:,iVar), &
             &                legCoeffs = modal_data(:,iVar), &
             &                nIndeps   = 1                   )
         end do
-        !$OMP END DO
       end select
 
     case ('fxt')
       select case (dim)
       case (3)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_3D( fxt = me%body_3d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
             &              oversamp_degree = me%oversamp_degree  )
         end do
-        !$OMP END DO
       case (2)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_2D( fxt = me%body_2d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
             &              oversamp_degree = me%oversamp_degree  )
         end do
-        !$OMP END DO
 
       case (1)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_fxt_m2n_1D( fxt = me%body_1d%fxt,         &
             &               modal_data = modal_data(:,iVar), &
             &               nodal_data = nodal_data(:,iVar)  )
         end do
-        !$OMP END DO
       end select
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine ply_poly_project_m2n_multivar
   !****************************************************************************!
 
@@ -637,36 +619,28 @@ contains
     !--------------------------------------------------------------------------!
     integer :: iVar
     !--------------------------------------------------------------------------!
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar)
-
 
     select case(trim(me%kind))
     case ('l2p')
       select case (dim)
       case (1)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_1D( trafo = me%body_1D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
-        !$OMP END DO
       case (2)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_2D( trafo = me%body_2D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
-        !$OMP END DO
       case (3)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_l2p_trafo_3D( trafo = me%body_3D%l2p%node2leg, &
             &                    projected = modal_data(:,iVar),  &
             &                    original  = nodal_data(:,iVar)   )
         end do
-        !$OMP END DO
       end select
 
      case ('fpt')
@@ -683,20 +657,17 @@ contains
           &                   pntVal    = nodal_data,     &
           &                   legCoeffs = modal_data      )
       case (1)
-        !$OMP DO
         do iVar = 1,nVars
           call ply_pntToLeg( fpt       = me%body_1d%fpt,     &
             &                nIndeps   = 1,                  &
             &                pntVal    = nodal_data(:,iVar), &
             &                legCoeffs = modal_data(:,iVar)  )
         end do
-        !$OMP END DO
       end select
 
     case ('fxt')
       select case (dim)
       case (3)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_fxt_n2m_3D(                                  &
             &         fxt              = me%body_3d%fxt,        &
@@ -704,10 +675,8 @@ contains
             &         modal_data       = modal_data(:,iVar),    &
             &         oversamp_degree  = me%oversamp_degree     )
         end do
-        !$OMP END DO
 
       case (2)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_fxt_n2m_2D(                                  &
             &         fxt              = me%body_2d%fxt,        &
@@ -715,25 +684,20 @@ contains
             &         modal_data       = modal_data(:,iVar),    &
             &         oversamp_degree  = me%oversamp_degree     )
         end do
-        !$OMP END DO
 
       case (1)
-        !$OMP DO
         do iVar = 1, nVars
           call ply_fxt_n2m_1D(                               &
             &         fxt              = me%body_1d%fxt,     &
             &         nodal_data       = nodal_data(:,iVar), &
             &         modal_data       = modal_data(:,iVar)  )
         end do
-        !$OMP END DO
       end select
 
     case default
        write(logUnit(1),*) 'ERROR in projection nodal to modal'
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine ply_poly_project_n2m_multivar
   !***************************************************************************!
 
diff --git a/source/ply_poly_transformation_module.f90 b/source/ply_poly_transformation_module.f90
index 46e946d..4817581 100644
--- a/source/ply_poly_transformation_module.f90
+++ b/source/ply_poly_transformation_module.f90
@@ -120,13 +120,11 @@ subroutine ply_Poly_Transformation( subsamp, dofReduction, mesh, meshData,   &
     integer :: nVars, nDofs, nComponents, nChildDofs
     integer :: iVar
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, nComponents, nDofs, workData, nChildDofs)
 
     nVars = size(varDofs)
     allocate(newVarDofs(nVars))
     allocate(newMeshData(nVars))
 
-    !$OMP DO
     varLoop: do iVar = 1, nVars
       nComponents = varComps(iVar)
       nDofs = vardofs(iVar)
@@ -166,10 +164,6 @@ subroutine ply_Poly_Transformation( subsamp, dofReduction, mesh, meshData,   &
       deallocate(workData)
 
     end do varLoop
-    !$OMP END DO
-
-    !$OMP END PARALLEL
-
 
   end subroutine ply_Poly_Transformation
   ! ************************************************************************ !
@@ -218,8 +212,6 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
     real(kind=rk), allocatable :: transform_matrix(:,:)
     real(kind=rk), allocatable :: childData(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iParentElem, iChild, iElem)
-
 
     nChilds = 2**nDims
 
@@ -246,7 +238,6 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
 
     if (subsamp%sampling_lvl > 1) then
 
-      !$OMP DO
       elementLoop: do iParentElem=1,nParentElems
         ! Check if the parent cell was already refined...
         if (refine_tree(iParentElem)) then
@@ -333,9 +324,7 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
           deallocate(childData)
         end if
       end do elementLoop
-      !$OMP END DO
     else
-      !$OMP DO
       elemLoop: do iElem=1,nElems
         if (new_refine_tree(iElem)) then
           ! Create lower and upper indices for all data of iElem in meshData.
@@ -386,13 +375,10 @@ subroutine ply_subsampleData( mesh, meshData, nDofs, nChildDofs,         &
           deallocate(childData)
         end if
       end do elemLoop
-      !$OMP END DO
     end if
 
     deallocate(transform_matrix)
 
-    !$OMP END PARALLEL
-
   end subroutine ply_subsampleData
   ! ************************************************************************ !
 
@@ -782,8 +768,6 @@ subroutine ply_transform_matrix(max_modes, v)
     integer :: m, orig
     real(kind=rk) :: shifting, scaling
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(orig, m)
-
 
     ! transformation matrix looks like this:
     ! [1.0  --  --     shift=0.5   ]
@@ -806,7 +790,6 @@ subroutine ply_transform_matrix(max_modes, v)
       v(2,2) = scaling
 
       if (max_modes > 2) then
-        !$OMP DO
         do orig = 3,max_modes
           v(1,orig) = ply_beta(orig-1) * v(1,orig-2)                    &
             &       + ply_alpha(orig-1) * shifting * v(1,orig-1)        &
@@ -824,14 +807,12 @@ subroutine ply_transform_matrix(max_modes, v)
             end if
           end do
         end do
-        !$OMP END DO
       end if
 
       ! Due to the symmetry of the problem (the left subinterval has just
       ! the shifting with a changed sign), we can fill the other half of
       ! the matrix by copying the already computed values accordingly with
       ! a change in sign, as needed (alternatingly).
-      !$OMP DO
       do m = 1 , max_modes
         do orig = 1, m-1
           if (mod((m+orig),2) /= 0) then
@@ -841,11 +822,8 @@ subroutine ply_transform_matrix(max_modes, v)
           end if
         end do
       end do
-      !$OMP END DO
     end if
 
-    !$OMP END PARALLEL
-
   end subroutine ply_transform_matrix
   ! ************************************************************************ !
 
diff --git a/source/ply_sampling_module.fpp b/source/ply_sampling_module.fpp
index 0165589..c75cb77 100644
--- a/source/ply_sampling_module.fpp
+++ b/source/ply_sampling_module.fpp
@@ -680,21 +680,16 @@ contains
     integer :: iElem
     integer :: nComps
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iElem)
 
     nComps = fun%nComponents
     datlen = tree%nElems * nComps
 
     call c_f_pointer(fun%method_data, p)
 
-    !$OMP DO
     do iElem=1,n
       res(1+(iElem-1)*nComps:iElem*nComps) &
         & = p%dat(1+(elempos(iElem)-1)*nComps:elempos(iElem)*nComps)
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine get_sampled_element
   ! ************************************************************************ !
diff --git a/source/ply_sampling_varsys_module.f90 b/source/ply_sampling_varsys_module.f90
index d225a41..1eb9bca 100644
--- a/source/ply_sampling_varsys_module.f90
+++ b/source/ply_sampling_varsys_module.f90
@@ -108,7 +108,6 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
     integer, allocatable :: elempos(:)
     real(kind=rk), allocatable :: elemdat(:)
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iVar, iComponent, iElem, iTotComp)
 
     nVars = trackInst%varmap%varPos%nVals
 
@@ -146,7 +145,6 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
     variables: do iVar=1,nVars
       varpos = trackInst%varmap%varPos%val(iVar)
       nComponents = varsys%method%val(varpos)%nComponents
-      !$OMP DO
       do iComponent=1,nComponents
         iTotComp = iScalar+iComponent-1
         call ply_sampling_var_allocate( var     = var(iTotComp), &
@@ -154,7 +152,6 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
           &                             datalen = total_dofs     )
         var(iTotComp)%first(1) = 1
       end do
-      !$OMP END DO
 
       ! Varying polynomial degree for elements is possible.
       ! Need to copy data element by element.
@@ -166,7 +163,6 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
         upper_bound = lower_bound-1 + nDofs
 
         allocate(elemDat(nComponents*nDofs))
-        !$OMP DO
         call varSys%method%val(varpos)%get_element( &
           &    varSys  = varSys,                    &
           &    elempos = elempos(iElem:iElem),      &
@@ -175,6 +171,7 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
           &    nElems  = 1,                         &
           &    nDofs   = nDofs,                     &
           &    res     = elemdat                    )
+
         do iComponent=1,nComponents
           iTotComp = iScalar+iComponent-1
           var(iTotComp)%dat(lower_bound:upper_bound) &
@@ -182,21 +179,16 @@ subroutine ply_sampling_varsys_for_track( varsys, trackInst, mesh, nDims, &
           var(iTotComp)%first(iElem+1) = upper_bound+1
           var(iTotComp)%degree(iElem) = lvl_degree(iLevel)
         end do
-        !$OMP END DO
 
         lower_bound = upper_bound + 1
 
         deallocate(elemDat)
-      !$OMP DO
       end do
-      !$OMP END DO
 
       iScalar = iScalar + nComponents
 
     end do variables
 
-    !$OMP END PARALLEL
-
   end subroutine ply_sampling_varsys_for_track
   ! ------------------------------------------------------------------------ !
   ! ------------------------------------------------------------------------ !
@@ -317,7 +309,6 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
     integer :: nElems
     integer :: ndofs
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iElem, ndofs, absmean, variation)
 
     if (allocated(var%deviates)) deallocate(var%deviates)
     var%nDeviating = 0
@@ -325,7 +316,6 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
     if (allocated(var%first)) then
       nElems = size(var%first)-1
       allocate(var%deviates(nElems))
-      !$OMP DO
       do iElem=1,nElems
         ndofs = var%first(iElem+1) - var%first(iElem) - 1
         absmean = max( abs(var%dat(var%first(iElem))), min_mean )
@@ -333,11 +323,8 @@ subroutine ply_sampling_var_compute_elemdev(var, threshold, min_mean)
         var%deviates(iElem) = (variation > threshold*absmean)
         if (var%deviates(iElem)) var%nDeviating = var%nDeviating + 1
       end do
-      !$OMP END DO
     end if
 
-    !$OMP END PARALLEL
-
   end subroutine ply_sampling_var_compute_elemdev
   ! ------------------------------------------------------------------------ !
   ! ------------------------------------------------------------------------ !
diff --git a/source/ply_space_integration_module.f90 b/source/ply_space_integration_module.f90
index fb5eede..b076296 100644
--- a/source/ply_space_integration_module.f90
+++ b/source/ply_space_integration_module.f90
@@ -144,9 +144,9 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: numQuadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(k, j, i, pointNumber)
 
     numQuadPoints = num_intp_per_direction**3
+
     allocate(points(numQuadPoints,3))
     allocate(weights(numQuadPoints))
     allocate(gaussp1D(num_intp_per_direction))
@@ -159,7 +159,6 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
@@ -175,9 +174,6 @@ subroutine ply_create_volume_gauss_points_cube( num_intp_per_direction,  &
         end do
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_create_volume_gauss_points_cube
   ! *********************************************************************** !
@@ -201,9 +197,9 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: numQuadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i, pointNumber)
 
     numQuadPoints = num_intp_per_direction**2
+
     allocate(points(numQuadPoints,3))
     allocate(weights(numQuadPoints))
     allocate(gaussp1D(num_intp_per_direction))
@@ -216,7 +212,6 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
@@ -229,9 +224,6 @@ subroutine ply_create_volume_gauss_points_cube_2d( num_intp_per_direction, &
         pointNumber = pointNumber + 1
       end do
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_create_volume_gauss_points_cube_2d
   ! *********************************************************************** !
@@ -254,7 +246,6 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
     real(kind=rk), allocatable :: gaussp1D(:)
     real(kind=rk), allocatable :: weights1D(:)
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i)
 
     allocate(points(num_intp_per_direction,3))
     allocate(weights(num_intp_per_direction))
@@ -268,7 +259,6 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do i = 1, num_intp_per_direction
       !> here we build all possible combinations of the one-dimensional
       !! quadrature points to get the three dimensional values.
@@ -279,9 +269,6 @@ subroutine ply_create_volume_gauss_points_cube_1d( num_intp_per_direction, &
       weights(PointNumber) = weights1D(i)
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_create_volume_gauss_points_cube_1d
   ! *********************************************************************** !
@@ -312,7 +299,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: nquadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j, i)
 
     nQuadPoints = num_intp_per_direction**2
 
@@ -331,7 +317,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -343,10 +328,8 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
 
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -358,10 +341,8 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
 
     case(3) ! face in z direction, z coord is fixed
-      !$OMP DO
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
           !> here we build all possible combinations of the one-dimensional
@@ -373,7 +354,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
           pointNumber = pointNumber + 1
         end do
       end do
-      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in create_surface_gauss_points_cube:' &
@@ -381,8 +361,6 @@ subroutine ply_create_surface_gauss_points_cube( num_intp_per_direction, &
 
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine ply_create_surface_gauss_points_cube
   ! *********************************************************************** !
 
@@ -412,7 +390,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
     real(kind=rk), allocatable :: weights1D(:)
     integer :: nQuadPoints
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i)
 
     ! The number of quadrature points on the boundary of a 2d volume is the
     ! number of quad points in one direction
@@ -433,7 +410,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
 
     select case(dir)
     case(1) ! face in x direction, x coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
         !! quadrature points for 2d case to get the three dimensional values.
@@ -443,10 +419,8 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
         weights(PointNumber) = weights1D(i)
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
 
     case(2) ! face in y direction, y coord is fixed
-      !$OMP DO
       do i = 1, num_intp_per_direction
         !> here we build all possible combinations of the one-dimensional
         !! quadrature points in 2d case to get the three dimensional values.
@@ -456,7 +430,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
         weights(PointNumber) = weights1D(i)
         pointNumber = pointNumber + 1
       end do
-      !$OMP END DO
 
     case default
       call tem_abort( 'ERROR in create_surface_gauss_points_cube_2d:' &
@@ -464,8 +437,6 @@ subroutine ply_create_surface_gauss_points_cube_2d(num_intp_per_direction, &
 
     end select
 
-    !$OMP END PARALLEL
-
   end subroutine ply_create_surface_gauss_points_cube_2d
   ! *********************************************************************** !
 
@@ -532,7 +503,6 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
     real(kind=rk), allocatable :: gaussp1D(:)
     real(kind=rk), allocatable :: weights1D(:)
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(j)
 
     numQuadPoints = num_intp_per_direction
 
@@ -548,7 +518,6 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
       &                      nIntP = num_intp_per_direction )
 
     pointNumber = 1
-    !$OMP DO
     do j = 1, num_intp_per_direction
       !> here we build all possible combinations of the one-dimensional
       !! quadrature points to get the three dimensional values.
@@ -556,9 +525,6 @@ subroutine ply_create_gauss_points_1d( num_intp_per_direction, &
       weights(PointNumber) = weights1D(j)
       pointNumber = pointNumber + 1
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_create_gauss_points_1d
   ! *********************************************************************** !
@@ -587,14 +553,12 @@ subroutine ply_gaussLegPoints( x1, x2, x, w, nIntP )
     real(kind=rk) :: EPS
     integer :: m, i, j
     ! ---------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(i, p1, p2, p3, pp, z1, z)
 
     EPS= 1.0 / (10.0**(PRECISION(1.0_rk)-2) )
     m = (nIntP+1)/2
     xm = 0.5*(x2+x1)
     xl = 0.5*(x2-x1)
 
-    !$OMP DO
     do i = 1, m
 
       z = cos(PI*((i-1)+0.75_rk)/(nIntP+0.5_rk))
@@ -619,9 +583,6 @@ subroutine ply_gaussLegPoints( x1, x2, x, w, nIntP )
       w(nIntp-i+1) = w(i)
 
     end do
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_gaussLegPoints
   ! *********************************************************************** !
diff --git a/source/ply_split_element_module.f90 b/source/ply_split_element_module.f90
index 8670c5f..e3e325d 100644
--- a/source/ply_split_element_module.f90
+++ b/source/ply_split_element_module.f90
@@ -252,7 +252,6 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
     integer :: nParents
     integer :: parentpos, childpos
     ! -------------------------------------------------------------------- !
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(iDir, parentMode, iParent, childMode, indep)
 
     nParents = size(parent_data,2)
 
@@ -266,18 +265,15 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
     ! The number of independent modes (in normal directions) is given
     ! by the product of the length in all directions, except the last one.
     nIndeps = 1
-    !$OMP DO
     do iDir=1,nDims-1
       nIndeps = nIndeps*inLen(iDir)
     end do
-    !$OMP END DO
 
     if (ignore) then
       maxcol = min(outLen(1), inLen(nDims))
     else
       maxcol = inLen(nDims)
     end if
-    !$OMP DO
     oldmodes: do parentMode=1,maxcol
       ! Maximal number modes to compute, as this is a triangular matrix
       ! it is limited by the diagonal (parentMode). However, it may be
@@ -308,9 +304,6 @@ subroutine ply_split_element_singleD( nDims, inLen, outLen, parent_data, &
       end do elemloop
 
     end do oldmodes
-    !$OMP END DO
-
-    !$OMP END PARALLEL
 
   end subroutine ply_split_element_singleD
   ! ======================================================================== !
diff --git a/source/ply_transfer_module.fpp b/source/ply_transfer_module.fpp
index 980eaed..123a6fb 100644
--- a/source/ply_transfer_module.fpp
+++ b/source/ply_transfer_module.fpp
@@ -144,14 +144,12 @@ contains
 
       ispace_oq: if (inspace == Q_Space) then
 
-        !$OMP DO
         ! Both, output and input are Q Polynomials
         do out_Y=0,minord-1
           out_off = out_Y*(outdegree+1)
           in_off = out_Y*(indegree+1)
           outdat(out_off+1:out_off+minord) = indat(in_off+1:in_off+minord)
         end do
-        !$OMP END DO
 
       else ispace_oq
 

From 637b01c516ec1f97e57345ff43d794defa874794 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Thu, 13 Feb 2020 13:44:36 +0100
Subject: [PATCH 07/21] Introduced OpenMP parallelisation in m2n projection for
 fpt and l2p.

--HG--
branch : feature/openmp
---
 source/fpt/ply_chebPoint_module.f90 | 1 +
 source/fpt/ply_legFpt_module.f90    | 6 ++++++
 source/ply_l2p_module.f90           | 7 +++++++
 3 files changed, 14 insertions(+)

diff --git a/source/fpt/ply_chebPoint_module.f90 b/source/fpt/ply_chebPoint_module.f90
index 4261984..0b8b8b1 100644
--- a/source/fpt/ply_chebPoint_module.f90
+++ b/source/fpt/ply_chebPoint_module.f90
@@ -119,6 +119,7 @@ subroutine create_volume_cheb_points_cube(num_intp_per_direction, points)
 
     call ply_chebPoint_1D( num_intp_per_direction, chebPnt1D )
 
+    pointNumber = 1
     do k = 1, num_intp_per_direction
       do j = 1, num_intp_per_direction
         do i = 1, num_intp_per_direction
diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 55d238d..2ab1cd1 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -276,10 +276,12 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_points) then
 
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
@@ -294,9 +296,11 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
           &                    cheb,                 &
           &                    pntVal(iDof:iDof+n-1) )
       end do
+      !$OMP END DO
 
     else
 
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call ply_fpt_single( alph   = legCoeffs(iDof:iDof+n-1), &
           &                  gam    = cheb,                     &
@@ -310,8 +314,10 @@ subroutine ply_legToPnt( fpt, legCoeffs, pntVal, nIndeps )
           &                    cheb,                 &
           &                    pntVal(iDof:iDof+n-1) )
       end do
+      !$OMP END DO
 
     end if ! lobattoPoints
+    !$OMP END PARALLEL
 
   end subroutine ply_legToPnt
   ! ************************************************************************ !
diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index cbbe120..e5075f2 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -271,8 +271,11 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub, mval)
     if (nDofs > 1) then
 
+      !$OMP DO
       do iStrip=0,nIndeps-1,vlen
 
         ! Calculate the upper bound of the current strip
@@ -295,12 +298,16 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
 
         end do ! iRow = 1, nRows
       end do ! iStrip
+      !$OMP END DO
 
     else
 
+      !$OMP SINGLE
       projected = matrix(nDofs,1) * original
+      !$OMP END SINGLE
 
     end if
+    !$OMP END PARALLEL
 
   end subroutine ply_l2_projection
   ! ************************************************************************ !

From 2440e691654090178a68e4901d430741addbd79a Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Wed, 4 Mar 2020 13:17:27 +0100
Subject: [PATCH 08/21] Introduced OpenMP parallelisation in n2m projection for
 fpt and for conversion to and from oversampled representation.

--HG--
branch : feature/openmp
---
 source/fpt/ply_legFpt_module.f90   |  6 ++++++
 source/ply_oversample_module.fpp   | 28 ++++++++++++++++++++--------
 source/ply_poly_project_module.fpp |  5 ++---
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 2ab1cd1..65b0c76 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -339,11 +339,13 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
     if (.not. fpt%use_lobatto_Points) then
 
       normFactor = 1.0_rk / real(n,kind=rk)
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call fftw_execute_r2r( fpt%planPntToCheb,     &
           &                    pntVal(iDof:iDof+n-1), &
@@ -358,10 +360,12 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
           &                  alph   = cheb,                     &
           &                  params = fpt%ChebToLegParams       )
       end do
+      !$OMP END DO
 
     else
 
       normFactor = 0.5_rk / real(n-1,kind=rk)
+      !$OMP DO
       do iDof = 1, nIndeps*n, n
         call fftw_execute_r2r( fpt%planPntToCheb,     &
           &                    pntVal(iDof:iDof+n-1), &
@@ -376,8 +380,10 @@ subroutine ply_pntToLeg( fpt, pntVal, legCoeffs, nIndeps )
           &                  alph   = cheb,                     &
           &                  params = fpt%ChebToLegParams       )
       end do
+      !$OMP END DO
 
     end if ! lobattoPoints
+    !$OMP END PARALLEL
 
   end subroutine ply_pntToLeg
   ! ************************************************************************ !
diff --git a/source/ply_oversample_module.fpp b/source/ply_oversample_module.fpp
index 191a6d2..1e20794 100644
--- a/source/ply_oversample_module.fpp
+++ b/source/ply_oversample_module.fpp
@@ -169,6 +169,7 @@ contains
     integer :: maxorders
     integer :: ord_lim
     ! -------------------------------------------------------------------- !
+
     ! Information for the oversampling loop
     oversamp_degree = poly_proj%oversamp_degree
     mpd1 = poly_proj%min_degree + 1
@@ -184,6 +185,7 @@ contains
         varQ: do iVar=1,nScalars
           if (ensure_positivity(iVar)) then
             ordersum = 0.0_rk
+            !$OMP PARALLEL DO PRIVATE(dof, iDegZ, iDegY, iDegX, iOrd)
             do dof = 1, mpd1_cube
               iDegZ = (dof-1)/mpd1_square + 1
               iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -191,6 +193,7 @@ contains
               iOrd = iDegX+iDegY+iDegZ-2
               ordersum(iOrd) = ordersum(iOrd) + abs(state(dof,iVar))
             end do
+            !$OMP END PARALLEL DO
             varsum = 0.0_rk
             do iOrd=2,ord_lim
               varsum = varsum + ordersum(iOrd)
@@ -202,6 +205,7 @@ contains
           end if
         end do varQ
         do iVar=1,nScalars
+          !$OMP PARALLEL DO PRIVATE(dof, iDegZ, iDegY, iDegX, iOrd, dofOverSamp)
           do dof = 1, mpd1_cube
             iDegZ = (dof-1)/mpd1_square + 1
             iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
@@ -214,22 +218,25 @@ contains
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end if
           end do
+          !$OMP END PARALLEL DO
         end do
       else posQ
         if (oversamp_degree == poly_proj%min_degree) then
           modalCoeffs = state
         else
           modalCoeffs = 0.0_rk
-          do dof = 1, mpd1_cube
-            iDegZ = (dof-1)/mpd1_square + 1
-            iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
-            iDegX = mod(dof-1,mpd1)+1
-            dofOverSamp = iDegX + ( iDegY-1  &
-              &                     + (iDegZ-1)*(oversamp_degree+1) &
-              &                   ) * (oversamp_degree+1)
-            do iVar=1,nScalars
+          do iVar=1,nScalars
+            !$OMP PARALLEL DO PRIVATE(dof, iDegZ, iDegY, iDegX, dofOverSamp)
+            do dof = 1, mpd1_cube
+              iDegZ = (dof-1)/mpd1_square + 1
+              iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
+              iDegX = mod(dof-1,mpd1)+1
+              dofOverSamp = iDegX + ( iDegY-1  &
+                &                     + (iDegZ-1)*(oversamp_degree+1) &
+                &                   ) * (oversamp_degree+1)
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end do
+          !$OMP END PARALLEL DO
           end do
         end if
       end if posQ
@@ -327,10 +334,13 @@ contains
     mpd1_cube = mpd1**3
     nScalars = size(modalCoeffs,2)
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE(iVar,dof,iDegZ,iDegY,iDegX,dofOverSamp)
     if (poly_proj%basisType == Q_Space) then
       if (oversamp_degree == poly_proj%min_degree) then
         state = modalCoeffs
       else
+        !$OMP DO
         do iVar=1,nScalars
           do dof = 1, mpd1_cube
             iDegZ = (dof-1)/mpd1_square + 1
@@ -342,6 +352,7 @@ contains
             state(dof,iVar) = modalCoeffs(dofOverSamp,iVar)
           end do
         end do
+        !$OMP END DO
       end if
 
     else !P_Space
@@ -358,6 +369,7 @@ contains
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
       end do
     end if
+    !$OMP END PARALLEL
 
   end subroutine ply_convertFromoversample_3d
   ! ************************************************************************ !
diff --git a/source/ply_poly_project_module.fpp b/source/ply_poly_project_module.fpp
index 5a1effe..882c897 100644
--- a/source/ply_poly_project_module.fpp
+++ b/source/ply_poly_project_module.fpp
@@ -554,7 +554,6 @@ contains
       end select
 
     case ('fpt')
-
       select case (dim)
       case (3)
         call ply_LegToPnt_3D( fpt       = me%body_3d%fpt, &
@@ -582,14 +581,14 @@ contains
           call ply_fxt_m2n_3D( fxt = me%body_3d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
-            &              oversamp_degree = me%oversamp_degree  )
+            &               oversamp_degree = me%oversamp_degree )
         end do
       case (2)
         do iVar = 1,nVars
           call ply_fxt_m2n_2D( fxt = me%body_2d%fxt,             &
             &               modal_data = modal_data(:,iVar),     &
             &               nodal_data = nodal_data(:,iVar),     &
-            &              oversamp_degree = me%oversamp_degree  )
+            &               oversamp_degree = me%oversamp_degree )
         end do
 
       case (1)

From e9c7782d4b2e42310f238e7513e2f33f29ad7e39 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Wed, 4 Mar 2020 14:18:09 +0100
Subject: [PATCH 09/21] Updated copyright notice

--HG--
branch : feature/openmp
---
 source/fpt/ply_legFpt_module.f90 | 1 +
 source/ply_oversample_module.fpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 65b0c76..f97e5a2 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -3,6 +3,7 @@
 ! Copyright (c) 2013-2014, 2017 Peter Vitt <peter.vitt2@uni-siegen.de>
 ! Copyright (c) 2013-2014 Verena Krupp
 ! Copyright (c) 2016 Langhammer Kay <kay.langhammer@student.uni-siegen.de>
+! Copyright (c) 2020 Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
 !
 ! Parts of this file were written by Jens Zudrop and Harald Klimach
 ! for German Research School for Simulation Sciences GmbH.
diff --git a/source/ply_oversample_module.fpp b/source/ply_oversample_module.fpp
index 1e20794..267b851 100644
--- a/source/ply_oversample_module.fpp
+++ b/source/ply_oversample_module.fpp
@@ -4,6 +4,7 @@
 ! Copyright (c) 2014, 2017-2019 Harald Klimach <harald.klimach@uni-siegen.de>
 ! Copyright (c) 2014 Verena Krupp
 ! Copyright (c) 2016 Tobias Girresser <tobias.girresser@student.uni-siegen.de>
+! Copyright (c) 2020 Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
 !
 ! Parts of this file were written by Jens Zudrop, Nikhil Anand, Harald Klimach,
 ! Verena Krupp, Peter Vitt, and Tobias Girresser for University of Siegen.

From 02293a0562ec826ef2c6a9c32e699f78c55557c9 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Thu, 5 Mar 2020 14:27:48 +0100
Subject: [PATCH 10/21] Introduced OpenMP to derivative computation in
 ply_leg_diff_module + formatting

--HG--
branch : feature/openmp
---
 source/fpt/ply_legFpt_module.f90 |  2 +-
 source/ply_leg_diff_module.fpp   | 33 ++++++++++++++++++++++++++++++++
 source/ply_oversample_module.fpp |  4 +++-
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/source/fpt/ply_legFpt_module.f90 b/source/fpt/ply_legFpt_module.f90
index 567d904..0120037 100644
--- a/source/fpt/ply_legFpt_module.f90
+++ b/source/fpt/ply_legFpt_module.f90
@@ -500,7 +500,7 @@ subroutine ply_pntToLeg_single( fpt, pntVal, legCoeffs, nIndeps )
     integer :: n
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n,iDof, cheb)
+    !$OMP PARALLEL DEFAULT(SHARED), PRIVATE(n, iDof, cheb)
     n = fpt%legToChebParams%n
 
     normFactor = 1.0_rk / real(n,kind=rk)
diff --git a/source/ply_leg_diff_module.fpp b/source/ply_leg_diff_module.fpp
index 590824f..c4e07b9 100644
--- a/source/ply_leg_diff_module.fpp
+++ b/source/ply_leg_diff_module.fpp
@@ -329,12 +329,16 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE(iDeg3, iDeg2, iDeg1, iDeg, iVar, dofPos, dofPosPrev)
+
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg1 = mpd
 
+      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg3 = (iDeg-1)/(mpd+1) + 1
         iDeg2 = iDeg - (iDeg3-1)*(mpd+1)
@@ -342,7 +346,9 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1+1, iDeg2, iDeg3, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
+      !$OMP END DO
 
+      !$OMP DO
       do iDeg1 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -356,8 +362,10 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
+      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
+      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -367,9 +375,12 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg1 - 1.0_rk)
       end do
+      !$OMP END DO
 
     end do varloop
 
+    !$OMP END PARALLEL
+
   end subroutine calcDiff_leg_x_vec
   ! ************************************************************************ !
 
@@ -397,12 +408,16 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE(iDeg3, iDeg2, iDeg1, iDeg, iVar, dofPos, dofPosPrev)
+
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg2 = mpd
 
+      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg3 = (iDeg-1)/(mpd+1) + 1
         iDeg1 = iDeg - (iDeg3-1)*(mpd+1)
@@ -410,7 +425,9 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1, iDeg2+1, iDeg3, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
+      !$OMP END DO
 
+      !$OMP DO
       do iDeg2 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -424,8 +441,10 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
+      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
+      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -435,9 +454,12 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg2 - 1.0_rk)
       end do
+      !$OMP END DO
 
     end do varloop
 
+    !$OMP END PARALLEL
+
   end subroutine calcDiff_leg_y_vec
   ! ************************************************************************ !
 
@@ -465,12 +487,16 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE(iDeg3, iDeg2, iDeg1, iDeg, iVar, dofPos, dofPosPrev)
+
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg3 = mpd
 
+      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg2 = (iDeg-1)/(mpd+1) + 1
         iDeg1 = iDeg - (iDeg2-1)*(mpd+1)
@@ -478,7 +504,9 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1, iDeg2, iDeg3+1, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
+      !$OMP END DO
 
+      !$OMP DO
       do iDeg3 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -492,8 +520,10 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
+      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
+      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         ideg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -501,9 +531,12 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg3 - 1.0_rk)
       end do
+      !$OMP END DO
 
     end do varloop
 
+    !$OMP END PARALLEL
+
   end subroutine calcDiff_leg_z_vec
   ! ************************************************************************ !
 
diff --git a/source/ply_oversample_module.fpp b/source/ply_oversample_module.fpp
index d742002..402aa9b 100644
--- a/source/ply_oversample_module.fpp
+++ b/source/ply_oversample_module.fpp
@@ -340,7 +340,7 @@ contains
     nScalars = size(modalCoeffs,2)
 
     !$OMP PARALLEL DEFAULT(SHARED), &
-    !$OMP PRIVATE(iVar,dof,iDegZ,iDegY,iDegX,dofOverSamp)
+    !$OMP PRIVATE(iVar, dof, idof, iDegZ, iDegY, iDegX, dofOverSamp)
     if (poly_proj%basisType == Q_Space) then
       if (oversamp_degree == poly_proj%min_degree) then
         state = modalCoeffs
@@ -365,6 +365,7 @@ contains
       iDegX = 1
       iDegY = 1
       iDegZ = 1
+      !$OMP DO
       do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
         dofOverSamp = iDegX + ( iDegY-1  &
@@ -373,6 +374,7 @@ contains
         state(dof,:) = modalCoeffs(dofOverSamp,:)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
       end do
+      !$OMP END DO
     end if
     !$OMP END PARALLEL
 

From 3ccea4f7d3c34e8e0886e557ce88078b6e472622 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Tue, 10 Mar 2020 12:54:09 +0100
Subject: [PATCH 11/21] Moved the OpenMP statements from the vectorization- to
 the standard-subroutines in ply_leg_diff_module

--HG--
branch : feature/openmp
---
 source/ply_leg_diff_module.fpp | 44 ++++++++--------------------------
 1 file changed, 10 insertions(+), 34 deletions(-)

diff --git a/source/ply_leg_diff_module.fpp b/source/ply_leg_diff_module.fpp
index c4e07b9..72a1d6d 100644
--- a/source/ply_leg_diff_module.fpp
+++ b/source/ply_leg_diff_module.fpp
@@ -67,6 +67,10 @@ contains
     integer :: leg(3), iDeg, iDeg1, iDeg2, iDeg3, DV(3)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE(iVar,dofPos,dofPosPrev,dofPos2Prev,iDeg,iDeg1,iDeg2,iDeg3), &
+    !$OMP PRIVATE(leg,DV)
+
     if (present(dirVec)) then
       DV = dirvec
     else
@@ -79,6 +83,7 @@ contains
       endif
     endif
 
+    !$OMP DO
     do iDeg = 1, (mpd+1)**2
       iDeg1 = (iDeg-1)/(mpd+1) + 1      !! do IDeg1 = 1, mPd+1
       iDeg2 = iDeg - (iDeg1-1)*(mpd+1)  !! do IDeg2 = 1, mPd=1   !! iDeg2 = mod(iDeg-1,mpd+1)+1
@@ -137,8 +142,10 @@ contains
         end do
       end do
     end do
+    !$OMP END DO
 
     ! Scale the results due to the Jacobians of the mappings
+    !$OMP DO
     do dofpos=1,(mpd+1)**3
       ideg3 = (dofpos-1)/(mpd+1)**2 + 1
       iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -149,6 +156,9 @@ contains
         &                         * (2.0_rk/elemLength)         &
         &                         * (2.0_rk*leg(iDir) - 1.0_rk)
     end do
+    !$OMP END DO
+
+    !$OMP END PARALLEL
 
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    ! Uncollapsed version of the scaling !
@@ -284,7 +294,6 @@ contains
 
     end do varloop
 
-
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    ! Uncollapsed version of the scaling !
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -329,16 +338,12 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), &
-    !$OMP PRIVATE(iDeg3, iDeg2, iDeg1, iDeg, iVar, dofPos, dofPosPrev)
-
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg1 = mpd
 
-      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg3 = (iDeg-1)/(mpd+1) + 1
         iDeg2 = iDeg - (iDeg3-1)*(mpd+1)
@@ -346,9 +351,7 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1+1, iDeg2, iDeg3, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
-      !$OMP END DO
 
-      !$OMP DO
       do iDeg1 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -362,10 +365,8 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
-      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
-      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -375,12 +376,9 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg1 - 1.0_rk)
       end do
-      !$OMP END DO
 
     end do varloop
 
-    !$OMP END PARALLEL
-
   end subroutine calcDiff_leg_x_vec
   ! ************************************************************************ !
 
@@ -408,16 +406,12 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), &
-    !$OMP PRIVATE(iDeg3, iDeg2, iDeg1, iDeg, iVar, dofPos, dofPosPrev)
-
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg2 = mpd
 
-      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg3 = (iDeg-1)/(mpd+1) + 1
         iDeg1 = iDeg - (iDeg3-1)*(mpd+1)
@@ -425,9 +419,7 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1, iDeg2+1, iDeg3, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
-      !$OMP END DO
 
-      !$OMP DO
       do iDeg2 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -441,10 +433,8 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
-      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
-      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -454,12 +444,9 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg2 - 1.0_rk)
       end do
-      !$OMP END DO
 
     end do varloop
 
-    !$OMP END PARALLEL
-
   end subroutine calcDiff_leg_y_vec
   ! ************************************************************************ !
 
@@ -487,16 +474,12 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), &
-    !$OMP PRIVATE(iDeg3, iDeg2, iDeg1, iDeg, iVar, dofPos, dofPosPrev)
-
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg3 = mpd
 
-      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg2 = (iDeg-1)/(mpd+1) + 1
         iDeg1 = iDeg - (iDeg2-1)*(mpd+1)
@@ -504,9 +487,7 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1, iDeg2, iDeg3+1, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
-      !$OMP END DO
 
-      !$OMP DO
       do iDeg3 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -520,10 +501,8 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
-      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
-      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         ideg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -531,12 +510,9 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg3 - 1.0_rk)
       end do
-      !$OMP END DO
 
     end do varloop
 
-    !$OMP END PARALLEL
-
   end subroutine calcDiff_leg_z_vec
   ! ************************************************************************ !
 

From 3d0f7023fd6a8a6aa29f6a3d7e6cc1590d9f5433 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Wed, 11 Mar 2020 11:58:54 +0100
Subject: [PATCH 12/21] Removed some OpenMP statements in ply_oversample_module
 that caused utests to fail

--HG--
branch : feature/openmp
---
 source/ply_oversample_module.fpp | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/source/ply_oversample_module.fpp b/source/ply_oversample_module.fpp
index 402aa9b..871c534 100644
--- a/source/ply_oversample_module.fpp
+++ b/source/ply_oversample_module.fpp
@@ -226,18 +226,16 @@ contains
           modalCoeffs = state
         else
           modalCoeffs = 0.0_rk
-          do iVar=1,nScalars
-            !$OMP PARALLEL DO PRIVATE(dof, iDegZ, iDegY, iDegX, dofOverSamp)
-            do dof = 1, mpd1_cube
-              iDegZ = (dof-1)/mpd1_square + 1
-              iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
-              iDegX = mod(dof-1,mpd1)+1
-              dofOverSamp = iDegX + ( iDegY-1  &
-                &                     + (iDegZ-1)*(oversamp_degree+1) &
-                &                   ) * (oversamp_degree+1)
+          do dof = 1, mpd1_cube
+            iDegZ = (dof-1)/mpd1_square + 1
+            iDegY = (dof-1-(iDegZ-1)*mpd1_square)/mpd1+1
+            iDegX = mod(dof-1,mpd1)+1
+            dofOverSamp = iDegX + ( iDegY-1  &
+              &                     + (iDegZ-1)*(oversamp_degree+1) &
+              &                   ) * (oversamp_degree+1)
+            do iVar=1,nScalars
               modalCoeffs(dofOverSamp,iVar) = state(dof,iVar)
             end do
-          !$OMP END PARALLEL DO
           end do
         end if
       end if posQ
@@ -289,7 +287,6 @@ contains
         iDegX = 1
         iDegY = 1
         iDegZ = 1
-        !$OMP PARALLEL DO PRIVATE(idof, iDegZ, iDegY, iDegX, iOrd)
         do idof = 1, poly_proj%body_3d%min_dofs
 ?? copy :: posOfModgCoeffPTens(iDegX, iDegY, iDegZ, dof)
           dofOverSamp = iDegX + ( iDegY-1  &
@@ -298,7 +295,6 @@ contains
           modalCoeffs(dofOverSamp,:) = state(dof,:)
 ?? copy :: nextModgCoeffPTens(iDegX, iDegY, iDegZ)
         end do
-        !$OMP END PARALLEL DO
       end if posP
 
     end if

From b2b129665e14e28791cdc52c450fadd156be254e Mon Sep 17 00:00:00 2001
From: Harald Klimach <harald.klimach@uni-siegen.de>
Date: Thu, 9 Jul 2020 20:55:07 +0200
Subject: [PATCH 13/21] Added a performance test for the l2p transformation.

--HG--
branch : feature/openmp
---
 utests/ply_l2p_3D_performance_test.f90 | 131 +++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 utests/ply_l2p_3D_performance_test.f90

diff --git a/utests/ply_l2p_3D_performance_test.f90 b/utests/ply_l2p_3D_performance_test.f90
new file mode 100644
index 0000000..571fa43
--- /dev/null
+++ b/utests/ply_l2p_3D_performance_test.f90
@@ -0,0 +1,131 @@
+! Copyright (c) 2012, 2014 Jens Zudrop <j.zudrop@grs-sim.de>
+! Copyright (c) 2012-2016, 2018-2020 Harald Klimach <harald.klimach@uni-siegen.de>
+! Copyright (c) 2013-2014 Peter Vitt <peter.vitt2@uni-siegen.de>
+! Copyright (c) 2013-2014 Verena Krupp
+! Copyright (c) 2014 Nikhil Anand <nikhil.anand@uni-siegen.de>
+!
+! Parts of this file were written by Jens Zudrop and Harald Klimach for
+! German Research School for Simulation Sciences GmbH.
+!
+! Parts of this file were written by Harald Klimach, Peter Vitt, Verena Krupp,
+! and Nikhil Anand for University of Siegen.
+!
+! Permission to use, copy, modify, and distribute this software for any
+! purpose with or without fee is hereby granted, provided that the above
+! copyright notice and this permission notice appear in all copies.
+!
+! THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+! WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+! MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+! ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+! WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+! ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+! OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+! **************************************************************************** !
+
+!> Unit test to check performance of l2p transformations.
+program ply_l2p_3D_performance_test
+  use mpi, only: mpi_wtime
+  use env_module,            only: rk, fin_env
+  use tem_logging_module,    only: logUnit, tem_logging_init_primary
+  use tem_general_module,    only: tem_general_type, tem_start
+  use ply_l2p_header_module, only: ply_l2p_header_type, ply_l2p_header_define
+  use ply_l2p_module,        only: ply_l2p_type, ply_init_l2p, ply_l2p_trafo_3D
+
+  !mpi!nprocs = 1
+
+  implicit none
+
+  integer :: iPower
+  integer, parameter :: maxpower = 7
+  real(kind=rk) :: res, newRes
+  type(tem_general_type) :: general
+
+  ! Init the Treelm environment, needed to init the log Unit
+  call tem_start(codeName = 'L2P 3D Performance Test', &
+    &            version  = '1',                       &
+    &            general  = general                    )
+  call tem_logging_init_primary( level = 1,               &
+    &                            rank = general%proc%rank )
+
+  res = 0.0_rk
+  do iPower = 1,maxpower
+    call ply_check_legToPnt_3D(iPower, newRes)
+    write(*,*) 'deviation:', newres
+    if (newRes.gt.res) then
+      res = newRes
+    end if
+  end do
+
+  write(*,*) 'Maximal deviation:', res
+  if (res <  1.e-08) then
+    write(logUnit(1),*) 'PASSED'
+  end if
+
+  call fin_env()
+
+contains
+
+  subroutine ply_check_legToPnt_3D(power,res)
+    integer, intent(in) :: power
+    real(kind=rk) :: res
+    integer :: maxPolyDegree, iVar, nVars
+    real(kind=rk), allocatable :: legCoeffs(:,:), legCoeffsIn(:,:)
+    real(kind=rk), allocatable :: pntVal(:,:), legVal(:,:)
+    type(ply_l2p_header_type) :: header
+    type(ply_l2p_type) :: trafo
+    real(kind=rk) :: starttime, stoptime
+
+    ! Define the maximal polynomial degree we want to calculate the
+    ! bases exchange for.
+    maxPolyDegree =  2**power-1  ! maxPolyDegree+1 has to be a power of 2
+    nVars = 3
+    write(logUnit(10),*) '------------------------------------' // &
+      & ' Number of Legendre coefficients (per dim): ', maxPolyDegree+1
+    write(logUnit(10),*) '------------------------------------' // &
+      & ' Number of Legendre coefficients (total): ',(maxPolyDegree+1)**3
+
+    ! Create the Legendre expansion coefficients
+    allocate(legCoeffs((maxPolyDegree+1)**3,nVars))
+    allocate(legCoeffsIn((maxPolyDegree+1)**3,nVars))
+    do iVar = 1, nVars
+      legCoeffs(:,iVar) = real(iVar, rk)
+    end do
+
+    ! Init the L2 Projection
+    call ply_l2p_header_define( me         = header,     &
+      &                         nodes_kind = 'chebyshev' )
+    call ply_init_l2p( degree = maxPolyDegree, &
+      &                l2p    = trafo,         &
+      &                header = header         )
+
+    ! now transform to the Chebyshev nodes
+    allocate(pntVal( (maxPolyDegree+1)**3, nVars ))
+    legCoeffsIn = legCoeffs
+    starttime = MPI_Wtime()
+    do iVar = 1, nVars
+      call ply_l2p_trafo_3D( trafo     = trafo%leg2node,      &
+        &                    original  = legCoeffsIn(:,iVar), &
+        &                    projected = pntVal(:,iVar)       )
+    end do
+    stoptime = MPI_Wtime()
+    write(*,*) 'Time for degree ', maxpolydegree, ' trafo:   ', &
+      &        stoptime - starttime
+
+    ! now transform back to Legendre coefficients
+    allocate(legVal( (maxPolyDegree+1)**3,nVars ))
+    starttime = MPI_Wtime()
+    do iVar = 1, nVars
+      call ply_l2p_trafo_3D( trafo     = trafo%node2leg, &
+        &                    projected = legVal(:,iVar), &
+        &                    original  = pntVal(:,iVar)  )
+    end do
+    stoptime = MPI_Wtime()
+    write(*,*) 'Time for degree ', maxpolydegree, ' inverse: ', &
+      &        stoptime - starttime
+  
+    res = maxval(abs(legVal(:,:) - legCoeffs(:,:)))
+
+  end subroutine ply_check_legToPnt_3D
+
+end program ply_l2p_3D_performance_test

From 98903487ff58f74555ad284517b6b6df41ae73c7 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Wed, 15 Jul 2020 12:13:18 +0200
Subject: [PATCH 14/21] Removed mval from private OMP variables in
 ply_l2p_module

--HG--
branch : feature/openmp
---
 source/ply_l2p_module.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index fef8b81..7dc5291 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -246,7 +246,7 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! -------------------------------------------------------------------- !
 
     !$OMP PARALLEL DEFAULT(SHARED), &
-    !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub, mval)
+    !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub)
     if (nDofs > 1) then
 
       !$OMP DO

From 176a3d2e5d971d2f28921bc27e78596488aacb8d Mon Sep 17 00:00:00 2001
From: Harald Klimach <harald.klimach@uni-siegen.de>
Date: Fri, 17 Jul 2020 21:12:25 +0200
Subject: [PATCH 15/21] Adaption to the change precice module.

--HG--
branch : feature/openmp
---
 source/ply_poly_project_module.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/ply_poly_project_module.fpp b/source/ply_poly_project_module.fpp
index 6acdd45..5747681 100644
--- a/source/ply_poly_project_module.fpp
+++ b/source/ply_poly_project_module.fpp
@@ -121,7 +121,7 @@ module ply_poly_project_module
     &                                    ply_fxt_n2m_3D, &
     &                                    ply_fxt_n2m_2D
   use tem_precice_module,          only: precice_available, &
-   &                                     precice
+   &                                     precice_handle
 
   implicit none
 

From 67c7b1e55d9725ee2c1328ba650c75440c71bf86 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Mon, 20 Jul 2020 13:48:13 +0200
Subject: [PATCH 16/21] Reverted changes of Rev 453 and replaced unnecessary
 loop with array operation

--HG--
branch : feature/openmp
---
 source/ply_l2p_module.f90 | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index 7dc5291..e9f9f5b 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -245,8 +245,9 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL DEFAULT(SHARED), &
-    !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub)
+    !$OMP PARALLEL if (nIndeps > 1024), DEFAULT(SHARED) &
+    !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub, mval)
+
     if (nDofs > 1) then
 
       !$OMP DO
@@ -257,9 +258,7 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
 
         do iRow = 1, nDofs
 
-          do iCell = iStrip+1, iStrip+strip_ub
-            projected(iCell, iRow) = 0.0_rk
-          end do
+          projected(iStrip+1:iStrip+strip_ub, iRow) = 0.0_rk
           do iCol = 1, nDofs
             mval =  matrix(iCol,iRow)
             do iCell = iStrip+1, iStrip+strip_ub

From 997bd9732203f1ab4e8b843c5d47c558a2551abe Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Wed, 2 Sep 2020 11:39:08 +0200
Subject: [PATCH 17/21] Moved the OpenMP statement in ply_l2p_projection to the
 inner loop for better performance

--HG--
branch : feature/openmp
---
 source/ply_l2p_module.f90 | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index e9f9f5b..108d42e 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -245,20 +245,22 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL if (nIndeps > 1024), DEFAULT(SHARED) &
+    !$OMP PARALLEL IF (nIndeps > 256) &
+    !$OMP DEFAULT(SHARED) &
     !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub, mval)
 
     if (nDofs > 1) then
 
-      !$OMP DO
       do iStrip=0,nIndeps-1,vlen
 
         ! Calculate the upper bound of the current strip
         strip_ub = min(iStrip + vlen, nIndeps) - iStrip
 
+        !$OMP DO
         do iRow = 1, nDofs
 
           projected(iStrip+1:iStrip+strip_ub, iRow) = 0.0_rk
+
           do iCol = 1, nDofs
             mval =  matrix(iCol,iRow)
             do iCell = iStrip+1, iStrip+strip_ub
@@ -270,8 +272,8 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
           end do ! iCol = 1, nCols
 
         end do ! iRow = 1, nRows
+        !$OMP END DO
       end do ! iStrip
-      !$OMP END DO
 
     else
 

From 0e32b07d6daa18ed66441afb991b1ec679d52211 Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Wed, 2 Sep 2020 12:05:26 +0200
Subject: [PATCH 18/21] Implemented OpenMP in ply_leg_diff_module

--HG--
branch : feature/openmp
---
 source/ply_leg_diff_module.fpp | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/source/ply_leg_diff_module.fpp b/source/ply_leg_diff_module.fpp
index 2f51f79..b0d1567 100644
--- a/source/ply_leg_diff_module.fpp
+++ b/source/ply_leg_diff_module.fpp
@@ -339,6 +339,9 @@ contains
     integer :: iDeg, iDeg1, iDeg2, iDeg3
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE( iVar, dofPos, dofPosPrev, iDeg, iDeg1, iDeg2, iDeg3 ) &
+    !$OMP PRIVATE( derivative )
 
     varloop: do iVar = 1, nVars
 
@@ -346,6 +349,7 @@ contains
 
       iDeg1 = mpd
 
+      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg3 = (iDeg-1)/(mpd+1) + 1
         iDeg2 = iDeg - (iDeg3-1)*(mpd+1)
@@ -353,7 +357,9 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1+1, iDeg2, iDeg3, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
+      !$OMP END DO
 
+      !$OMP DO
       do iDeg1 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -367,8 +373,10 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
+      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
+      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -378,8 +386,10 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg1 - 1.0_rk)
       end do
+      !$OMP END DO
 
     end do varloop
+    !$OMP END PARALLEL
 
   end subroutine ply_calcDiff_leg_x_vec
   ! ************************************************************************ !
@@ -407,6 +417,9 @@ contains
     integer :: iDeg, iDeg1, iDeg2, iDeg3
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE( iVar, dofPos, dofPosPrev, iDeg, iDeg1, iDeg2, iDeg3) &
+    !$OMP PRIVATE( derivative)
 
     varloop: do iVar = 1, nVars
 
@@ -414,6 +427,7 @@ contains
 
       iDeg2 = mpd
 
+      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg3 = (iDeg-1)/(mpd+1) + 1
         iDeg1 = iDeg - (iDeg3-1)*(mpd+1)
@@ -421,7 +435,9 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1, iDeg2+1, iDeg3, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
+      !$OMP END DO
 
+      !$OMP DO
       do iDeg2 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -435,8 +451,10 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
+      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
+      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         iDeg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -446,8 +464,10 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg2 - 1.0_rk)
       end do
+      !$OMP END DO
 
     end do varloop
+    !$OMP END PARALLEL
 
   end subroutine ply_calcDiff_leg_y_vec
   ! ************************************************************************ !
@@ -476,12 +496,16 @@ contains
     real(kind=rk) :: derivative((mpd+1)**2,mpd+1)
     ! -------------------------------------------------------------------- !
 
+    !$OMP PARALLEL DEFAULT(SHARED), &
+    !$OMP PRIVATE( iVar, dofPos, dofPosPrev, iDeg, iDeg1, iDeg2, iDeg3) &
+    !$OMP PRIVATE( derivative)
     varloop: do iVar = 1, nVars
 
       derivative(:,mpd+1) = 0.0_rk
 
       iDeg3 = mpd
 
+      !$OMP DO
       do iDeg = 1, (mpd+1)**2
         iDeg2 = (iDeg-1)/(mpd+1) + 1
         iDeg1 = iDeg - (iDeg2-1)*(mpd+1)
@@ -489,7 +513,9 @@ contains
 ?? copy :: posOfModgCoeffQTens(iDeg1, iDeg2, iDeg3+1, mPd, dofpos )
         derivative(iDeg, mpd) = legCoeffs(dofpos,iVar)
       end do
+      !$OMP END DO
 
+      !$OMP DO
       do iDeg3 = mPd-1, 1, -1
 
         !NEC$ ivdep
@@ -503,8 +529,10 @@ contains
             &                       + legCoeffs(dofposprev, iVar)
         end do
       end do
+      !$OMP END DO
 
       ! Scale the results due to the Jacobians of the mappings
+      !$OMP DO
       do dofpos=1,(mpd+1)**3
         ideg3 = (dofpos-1)/(mpd+1)**2 + 1
         ideg = dofpos - (ideg3-1)*(mpd+1)**2
@@ -512,8 +540,10 @@ contains
           &                          * (2.0_rk/elemLength)  &
           &                          * (2.0_rk*iDeg3 - 1.0_rk)
       end do
+      !$OMP END DO
 
     end do varloop
+    !$OMP END PARALLEL
 
   end subroutine ply_calcDiff_leg_z_vec
   ! ************************************************************************ !

From fd53c9d1b723836fa7d901c83cf5efeaca8b250b Mon Sep 17 00:00:00 2001
From: Daniel Fleischer <daniel.fleischer@student.uni-siegen.de>
Date: Mon, 19 Oct 2020 10:57:15 +0200
Subject: [PATCH 19/21] Rearranged L2-projection and OMP-statements on
 OpenMP-branch for test purposes.

--HG--
branch : feature/openmp
---
 source/ply_l2p_module.f90 | 78 ++++++++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 13 deletions(-)

diff --git a/source/ply_l2p_module.f90 b/source/ply_l2p_module.f90
index 108d42e..409bb7e 100644
--- a/source/ply_l2p_module.f90
+++ b/source/ply_l2p_module.f90
@@ -245,44 +245,96 @@ subroutine ply_l2_projection( nDofs, nIndeps, projected, original, matrix )
     ! integer, parameter :: vlen = nIndeps
     ! -------------------------------------------------------------------- !
 
-    !$OMP PARALLEL IF (nIndeps > 256) &
-    !$OMP DEFAULT(SHARED) &
-    !$OMP PRIVATE(iStrip, iRow, iCell, iCol, strip_ub, mval)
+! Original version (for reference)
+!!    if (nDofs > 1) then
+!!
+!!      do iStrip=1,nIndeps,vlen
+!!
+!!        ! Calculate the upper bound of the current strip
+!!        strip_ub = iStrip-1 + min(vlen, nIndeps-iStrip+1)
+!!
+!!        do iRow = 1, nDofs
+!!
+!!          do iCell = iStrip, strip_ub
+!!            projected(iCell, iRow) = 0.0_rk
+!!          end do
+!!
+!!          do iCol = 1, nDofs
+!!            mval =  matrix(iCol,iRow)
+!!            do iCell = iStrip, strip_ub
+!!              ! on SX-ACE, this can be identified as matrix multiplication
+!!              ! which results in VERY HIGH performance
+!!              projected(iCell, iRow) = projected(iCell, iRow) &
+!!                &                   + mval * original(iCol, iCell)
+!!            end do ! iCell
+!!          end do ! iCol = 1, nCols
+!!        end do ! iRow = 1, nRows
+!!      end do ! iStrip
+!!
+!!    else
+!!
+!!      projected = matrix(nDofs,1) * original
+!!
+!!    end if
 
     if (nDofs > 1) then
 
-      do iStrip=0,nIndeps-1,vlen
+      !$OMP PARALLEL DO DEFAULT(SHARED), &
+      !$OMP PRIVATE(iStrip, iRow, iCell, iCol, mval)
+      do iStrip=1,nIndeps,vlen
 
         ! Calculate the upper bound of the current strip
-        strip_ub = min(iStrip + vlen, nIndeps) - iStrip
+        strip_ub = iStrip-1 + min(vlen, nIndeps-iStrip+1)
 
-        !$OMP DO
         do iRow = 1, nDofs
 
-          projected(iStrip+1:iStrip+strip_ub, iRow) = 0.0_rk
+          do iCell = iStrip, strip_ub
+            projected(iCell, iRow) = 0.0_rk
+          end do
 
           do iCol = 1, nDofs
             mval =  matrix(iCol,iRow)
-            do iCell = iStrip+1, iStrip+strip_ub
+            do iCell = iStrip, strip_ub
               ! on SX-ACE, this can be identified as matrix multiplication
               ! which results in VERY HIGH performance
               projected(iCell, iRow) = projected(iCell, iRow) &
                 &                   + mval * original(iCol, iCell)
             end do ! iCell
           end do ! iCol = 1, nCols
-
         end do ! iRow = 1, nRows
-        !$OMP END DO
       end do ! iStrip
+      !$OMP END PARALLEL DO
 
     else
 
-      !$OMP SINGLE
       projected = matrix(nDofs,1) * original
-      !$OMP END SINGLE
 
     end if
-    !$OMP END PARALLEL
+
+
+! test-version of the loop (will be removed later)
+!!    if (nDofs > 1) then
+!!
+!!      projected(:, :) = 0.0_rk
+!!
+!!      !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(SHARED), &
+!!      !$OMP PRIVATE(iStrip, iRow, iCell, iCol, mval)
+!!      do iRow = 1, nDofs
+!!        do iCol = 1, nDofs
+!!          mval =  matrix(iCol,iRow)
+!!          do iStrip=1,nIndeps
+!!            projected(iStrip, iRow) = projected(iStrip, iRow) &
+!!              &                   + mval * original(iCol, iStrip)
+!!          end do
+!!        end do
+!!      end do
+!!      !$OMP END PARALLEL DO
+!!
+!!    else
+!!
+!!      projected = matrix(nDofs,1) * original
+!!
+!!    end if
 
   end subroutine ply_l2_projection
   ! ************************************************************************ !

From a9e57632bc585a957370b366c75bdbf7e4cc924b Mon Sep 17 00:00:00 2001
From: Harald Klimach <harald.klimach@dlr.de>
Date: Fri, 14 Nov 2025 12:20:14 +0100
Subject: [PATCH 20/21] Adapt l2p_3D_performance test to changed tem_start
 interface

---
 utests/ply_l2p_3D_performance_test.f90 | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utests/ply_l2p_3D_performance_test.f90 b/utests/ply_l2p_3D_performance_test.f90
index 571fa43..27e714d 100644
--- a/utests/ply_l2p_3D_performance_test.f90
+++ b/utests/ply_l2p_3D_performance_test.f90
@@ -43,7 +43,6 @@ program ply_l2p_3D_performance_test
 
   ! Init the Treelm environment, needed to init the log Unit
   call tem_start(codeName = 'L2P 3D Performance Test', &
-    &            version  = '1',                       &
     &            general  = general                    )
   call tem_logging_init_primary( level = 1,               &
     &                            rank = general%proc%rank )

From 27dc67c25fcf3c0c0eaf2efad59c9bdc7f9702cf Mon Sep 17 00:00:00 2001
From: Harald Klimach <harald.klimach@dlr.de>
Date: Fri, 14 Nov 2025 12:39:02 +0100
Subject: [PATCH 21/21] Adapt fpt_3D_performance_test to changed tem_start
 interface

---
 utests/with_fftw/ply_fpt_3D_performance_test.f90 | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utests/with_fftw/ply_fpt_3D_performance_test.f90 b/utests/with_fftw/ply_fpt_3D_performance_test.f90
index 1fd6c7e..1555f82 100644
--- a/utests/with_fftw/ply_fpt_3D_performance_test.f90
+++ b/utests/with_fftw/ply_fpt_3D_performance_test.f90
@@ -46,7 +46,6 @@ program ply_fpt_3D_performance_test
 
   ! Init the Treelm environment, needed to init the log Unit
   call tem_start(codeName = 'FPT 3D Performance Test', &
-    &            version  = '1',                       &
     &            general  = general                    )
   call tem_logging_init_primary( level = 1,               &
     &                            rank = general%proc%rank )