From 0cf7302ee0a7114fd576cf2f3f88587bb895a8f6 Mon Sep 17 00:00:00 2001
From: Nathan Harmer <nathanharmer@nathans-air.home>
Date: Wed, 26 Feb 2025 11:38:50 +0000
Subject: [PATCH 1/3] FEAT: Introduced increased flexibility for handling
 subtensors by including offset inputs for tensors. This feature includes
 backwards compatibility for calls to hptt::create_plan without offsets and is
 therefore not a breaking change.

**Detail:**
*Makefiles* (Makefile, benchmark/Makefile, testframework/Makefile)
FIX: In the case of libomp not being discovered in LD_LIBRARY_PATH (MacOS M2 issue), user can specify a path for build.

*benchmark/benchmark.cpp*
FEAT: `transpose_ref` is for internal use and therefore changes do not include backwards compatibility for the function and hence the function call is amended to pass new nullptr arguments.

*benchmark/maxFromFiles.py*
FIX: Print statement of Error is given parentheses.

*benchmark/reference.cpp*
FEAT: Firstly, function receives new outerSize (A/B) and offset (A/B) arrays which are initialised to mimic the size array in the supplication of nullptrs. Next, the stepping through B is amended to ensure that the outerSize is traversed where the row of size is exceeded. Further offsets are inserted into the traversal. Behaviour can be verified via DEBUG.

Pseudo-Code is:
for each dimension not the innermost loop of B:
    divide the current position by the size of the next innermost loop of B that we want to traverse
    move across the offset distance as many times as we have exceeded it plus the initial offset
    further move over any space that remains after the end of the block required by size as many times as we exceed it

*benchmark/reference.h*
FEAT: Amended template to reflect new inputs of transpose_ref(), namely offsetA, offsetB, outerSizeA and outerSizeB.

*include/compute_node.h*
FEAT: Included three new members of a ComputeNode without exceeding the cache size of 64 bytes (unaligned memory in caches exceeding this programmers be warned!).

First the offset difference (A - B) which reduces the number of calculations required in adjusting for the offset in the execution of hptt. The plan is created with start and end positions inclusive of the offset of B and the difference is added to access the start and end values of A.

FIX: Secondly, the booleans of indexA and indexB indicate true when the leading dimension of A/B is 1 and the index is 0. The original code faultered when A or B's innermost dimensions were 1 causing the transpose_int functions to identify incorrect innermost indexes - especially problematic with non-zero outerSizes.

*include/hptt.h*
FEAT: New template functions provided for provision of offsets in various floatType contexts.

*include/transpose.h*
FEAT: Amended skipIndices and verifyParameter to include offset inputs as these functions are effected by the inclusion of these. Also included offsets as properties of the transpose class.

*include/utils.h*
FEAT: Amended the template of accountForRowMajor as this needs to change the orders of the offsets similarly to the other parameters.

*src/hptt.cpp*
FEAT: Implemented new offset templates and amended original templates to point to plan() with nullptrs or offsets where appropriate.

*src/transpose.cpp*
FEAT: Amended plan assignment section to include assignments for the new computeNode members.
FEAT: Included offsets in fuseIndices, skipIndices and verifyParameters functions where amendments effect offsets too and verification proves offset + size <= outerSize for all dimensions.
FEAT: axpy functions require offset differences as well and so these are calculated and the integer/array passed to the respective functions for proper calculation. Similarly, the axpy functions themselves are amended.
FEAT: in transpose_ functions offDiffAB is always added to i to get the correct start/end. Also where lda/ldb == 1 is checked, plan->indexA/B is also asserted to ensure correct blocking is passed. As result of the increased robustness, the blockingA/B can always be confidently passed and loops can be included for cases where scalar is reached and lda/ldb is not 1.
FEAT: Included a plethora of DEBUG statements (coding this was very fun).

*src/utils.cpp*
FEAT: Implemented accountForRowMajor changes for offsets mirroring the behaviour for outerSizes.

*testframework/testframework.cpp*
FEAT: Improved testing to include triggerable outerSize != size and offsets with strings printed for DEBUG cases.
FEAT: Error messages modified for clarity.
---
 Makefile                        |   6 +-
 benchmark/Makefile              |   5 +-
 benchmark/benchmark.cpp         |   2 +-
 benchmark/benchmark.sh          |   0
 benchmark/maxFromFiles.py       |   2 +-
 benchmark/reference.cpp         | 110 +++++++---
 benchmark/reference.h           |   4 +-
 include/compute_node.h          |   7 +-
 include/hptt.h                  | 124 +++++++++++-
 include/transpose.h             |  16 +-
 include/utils.h                 |   4 +-
 src/hptt.cpp                    | 223 ++++++++++++++++++---
 src/transpose.cpp               | 342 +++++++++++++++++++++++---------
 src/utils.cpp                   |  12 +-
 testframework/Makefile          |   5 +-
 testframework/testframework.cpp |  76 +++++--
 16 files changed, 760 insertions(+), 178 deletions(-)
 mode change 100644 => 100755 benchmark/benchmark.sh

diff --git a/Makefile b/Makefile
index 9283bc6..a11369d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,14 @@
-CXX_FLAGS=-O3 -std=c++11 -DNDEBUG
+CXX_FLAGS=-O3 -std=c++11
 INCLUDE_PATH=-I./include/
 
 ifeq ($(CXX),icpc)
 CXX_FLAGS += -qopenmp -xhost 
 else
 ifeq ($(CXX),g++)
-CXX_FLAGS += -fopenmp -march=native 
+CXX_FLAGS += -fopenmp -march=native -Wno-vla
 else
 ifeq ($(CXX),clang++)
-CXX_FLAGS += -fopenmp -march=native
+CXX_FLAGS += -fopenmp -march=native -fsanitize=address -Wno-vla-extension
 endif
 endif
 endif
diff --git a/benchmark/Makefile b/benchmark/Makefile
index c25f9c1..d01f2df 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -28,8 +28,11 @@ LIBS=-lhptt
 all: ${OBJ}
 	${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o benchmark.exe
 
+LIB_PATH+= ${OPENMP_LIB_PATH}
+LIBS+= -lomp
+
 %.o: %.cpp
-	${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@
+	${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@
 
 clean:
 	rm -rf *.o benchmark.exe
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index c07a56c..a456394 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -192,7 +192,7 @@ int main(int argc, char *argv[])
         restore(B, B_ref, total_size);
         trashCache(trash1, trash2, largerThanL3);
         auto begin_time = omp_get_wtime();
-        transpose_ref( size, perm, dim, A, alpha, B_ref, beta, false);
+        transpose_ref( size, perm, dim, A, alpha, nullptr, nullptr, B_ref, beta, nullptr, nullptr, false);
         double elapsed_time = omp_get_wtime() - begin_time;
         minTime = (elapsed_time < minTime) ? elapsed_time : minTime;
      }
diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh
old mode 100644
new mode 100755
diff --git a/benchmark/maxFromFiles.py b/benchmark/maxFromFiles.py
index f3b39e6..ea60c1c 100644
--- a/benchmark/maxFromFiles.py
+++ b/benchmark/maxFromFiles.py
@@ -36,5 +36,5 @@
       else:
          f2.write(f2Content[i])
    except:
-      print "ERROR:", i
+      print("ERROR:", i)
 f2.close()
diff --git a/benchmark/reference.cpp b/benchmark/reference.cpp
index a74318a..d7289a0 100644
--- a/benchmark/reference.cpp
+++ b/benchmark/reference.cpp
@@ -16,68 +16,120 @@
 
 template<typename floatType>
 void transpose_ref( uint32_t *size, uint32_t *perm, int dim, 
-      const floatType* __restrict__ A, floatType alpha, 
-      floatType* __restrict__ B, floatType beta, const bool conjA)
+      const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA,  
+      floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, const bool conjA)
 {
-   // compute stride for all dimensions w.r.t. A
+   std::vector<int> tempOuterSizeA, tempOuterSizeB, tempOffsetA, tempOffsetB, tempPointerB;
+
+   // Stride One is location of 0 in perm. Perm[0] may not be B stride one unless perm[0] == 0
+   // perm provided yields positions in A data from a B order index
+   // perm calculated below relates positions in B data to an A order index
+   tempPointerB.resize(dim);
+   for (int i = 0; i < dim; ++i)
+      for (int j = 0; j < dim; ++j)
+         if (i == perm[j])
+            tempPointerB[i] = j;
+
+   // Use default values if any of the pointers are nullptr
+   if (outerSizeA == nullptr) {
+      tempOuterSizeA.resize(dim);
+      for (int i = 0; i < dim; i++) tempOuterSizeA[i] = size[i];
+      outerSizeA = tempOuterSizeA.data();
+   }
+    
+   if (outerSizeB == nullptr) {
+      tempOuterSizeB.resize(dim);
+      for (int i = 0; i < dim; i++) tempOuterSizeB[i] = size[perm[i]];
+      outerSizeB = tempOuterSizeB.data();
+   }
+
+   if (offsetA == nullptr) {
+      tempOffsetA.resize(dim);  // Default to zeros
+      for (int i = 0; i < dim; i++) tempOffsetA[i] = 0;
+      offsetA = tempOffsetA.data();
+   }
+
+   if (offsetB == nullptr) {
+      tempOffsetB.resize(dim);  // Default to zeros
+      for (int i = 0; i < dim; i++) tempOffsetB[i] = 0;
+      offsetB = tempOffsetB.data();
+   }
+
+   // compute stride for all dimensions w.r.t. A (like lda)
    uint32_t strideA[dim];
    strideA[0] = 1;
+   for(int i=1; i < dim; ++i) 
+      strideA[i] = strideA[i-1] * outerSizeA[i-1];
+
+   // compute stride for all dimensions w.r.t. B (like ldb)
+   uint32_t strideB[dim];
+   strideB[0] = 1;
    for(int i=1; i < dim; ++i)
-      strideA[i] = strideA[i-1] * size[i-1];
+      strideB[i] = strideB[i-1] * outerSizeB[i-1];
 
    // combine all non-stride-one dimensions of B into a single dimension for
    // maximum parallelism
    uint32_t sizeOuter = 1;
    for(int i=0; i < dim; ++i)
       if( i != perm[0] )
-         sizeOuter *= size[i]; 
+         sizeOuter *= size[i];
 
    uint32_t sizeInner = size[perm[0]];
+   uint32_t strideAinner = strideA[perm[0]];
 
    // This implementation traverses the output tensor in a linear fashion
    
 #pragma omp parallel for
    for(uint32_t j=0; j < sizeOuter; ++j)
    {
-      uint32_t offsetA = 0;
-      uint32_t offsetB = 0;
-      uint32_t j_tmp = j;
+      uint32_t indexOffsetA = 0;
+      uint32_t indexOffsetB = 0;
+      uint32_t j_tmp_A = j;
+      uint32_t j_tmp_B = j;
       for(int i=1; i < dim; ++i)
       {
-         int current_index = j_tmp % size[perm[i]];
-         j_tmp /= size[perm[i]];
-         offsetA += current_index * strideA[perm[i]];
+         int current_index_A = j_tmp_A % size[perm[i]];
+         j_tmp_A /= size[perm[i]];
+         j_tmp_B /= size[perm[i]];
+         indexOffsetA += (current_index_A + offsetA[perm[i]]) * strideA[perm[i]];
+         indexOffsetB += (j_tmp_B + 1) * offsetB[i] * strideB[i];
+         indexOffsetB += j_tmp_B * (outerSizeB[i] - offsetB[i] - size[perm[i]]) * strideB[i];
       }
 
-      const floatType* __restrict__ A_ = A + offsetA;
-      floatType* __restrict__ B_ = B + j*sizeInner;
-
-      uint32_t strideAinner = strideA[perm[0]];
+      const floatType* __restrict__ A_ = A + indexOffsetA;
+      floatType* __restrict__ B_ = B + indexOffsetB + offsetB[0] + (j * outerSizeB[0]);
 
       if( beta == (floatType) 0 )
-         for(int i=0; i < sizeInner; ++i)
+         for(int i=0; i < sizeInner; ++i) {
+#ifdef DEBUG
+            printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], i + offsetB[0] + indexOffsetB + (j * outerSizeB[0]), B_[i]);
+#endif
             if( conjA )
-               B_[i] = alpha * std::conj(A_[i * strideAinner]);
+               B_[i] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real();
             else
-               B_[i] = alpha * A_[i * strideAinner];
+               B_[i] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner];}
       else
-         for(int i=0; i < sizeInner; ++i)
+         for(int i=0; i < sizeInner; ++i) {
+#ifdef DEBUG
+            printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], i + offsetB[0] + indexOffsetB + (j * outerSizeB[0]), B_[i]);
+#endif
             if( conjA )
-               B_[i] = alpha * std::conj(A_[i * strideAinner]) + beta * B_[i];
+               B_[i] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real() + beta * B_[i];
             else
-               B_[i] = alpha * A_[i * strideAinner] + beta * B_[i];
+               B_[i] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner] + beta * B_[i];}
    }
 }
 
 template void transpose_ref<float>( uint32_t *size, uint32_t *perm, int dim, 
-      const float* __restrict__ A, float alpha, 
-      float* __restrict__ B, float beta, const bool conjA);
+      const float* __restrict__ A, float alpha, int *outerSizeA, int *offsetA, 
+      float* __restrict__ B, float beta, int *outerSizeB, int *offsetB, const bool conjA);
 template void transpose_ref<FloatComplex>( uint32_t *size, uint32_t *perm, int dim, 
-      const FloatComplex* __restrict__ A, FloatComplex alpha, 
-      FloatComplex* __restrict__ B, FloatComplex beta, const bool conjA);
+      const FloatComplex* __restrict__ A, FloatComplex alpha, int *outerSizeA, int *offsetA, 
+      FloatComplex* __restrict__ B, FloatComplex beta, int *outerSizeB, int *offsetB, const bool conjA);
 template void transpose_ref<double>( uint32_t *size, uint32_t *perm, int dim, 
-      const double* __restrict__ A, double alpha, 
-      double* __restrict__ B, double beta, const bool conjA);
+      const double* __restrict__ A, double alpha, int *outerSizeA, int *offsetA, 
+      double* __restrict__ B, double beta, int *outerSizeB, int *offsetB, const bool conjA);
 template void transpose_ref<DoubleComplex>( uint32_t *size, uint32_t *perm, int dim, 
-      const DoubleComplex* __restrict__ A, DoubleComplex alpha, 
-      DoubleComplex* __restrict__ B, DoubleComplex beta, const bool conjA);
+      const DoubleComplex* __restrict__ A, DoubleComplex alpha, int *outerSizeA, int *offsetA, 
+      DoubleComplex* __restrict__ B, DoubleComplex beta, int *outerSizeB, int *offsetB, const bool conjA);
+
diff --git a/benchmark/reference.h b/benchmark/reference.h
index 4f4a4a8..5284a0d 100644
--- a/benchmark/reference.h
+++ b/benchmark/reference.h
@@ -2,5 +2,5 @@
 
 template<typename floatType>
 void transpose_ref( uint32_t *size, uint32_t *perm, int dim, 
-      const floatType* __restrict__ A, floatType alpha, 
-      floatType* __restrict__ B, floatType beta, const bool conjA);
+      const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, 
+      floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, const bool conjA);
diff --git a/include/compute_node.h b/include/compute_node.h
index b777857..8a31d95 100644
--- a/include/compute_node.h
+++ b/include/compute_node.h
@@ -5,10 +5,10 @@ namespace hptt {
    /**
     * \brief A ComputNode encodes a loop.
     */
-class ComputeNode
+class alignas(16) ComputeNode
 {
    public:
-      ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), next(nullptr) {}
+      ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(false), indexB(false), offDiffAB(std::numeric_limits<int>::min()), next(nullptr) {}
 
       ~ComputeNode() {
          if ( next != nullptr )
@@ -20,6 +20,9 @@ class ComputeNode
    size_t inc; //!< increment for at the current loop
    size_t lda; //!< stride of A w.r.t. the loop index
    size_t ldb; //!< stride of B w.r.t. the loop index
+   bool   indexA; //!< true if index of A is innermost (0)
+   bool   indexB; //!< true if index of B is innermost (0)
+   int    offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop
    ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called)
 };
 
diff --git a/include/hptt.h b/include/hptt.h
index dda7f0a..f25ff11 100644
--- a/include/hptt.h
+++ b/include/hptt.h
@@ -168,12 +168,20 @@ namespace hptt {
  *                       * This parameter may be NULL, indicating that the outer-size is equal to sizeA. 
  *                       * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on sub-tensors.
+ * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A 
+ *                       * This parameter may be NULL, indicating that the offset is zero.
+ *                       * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold.
+ *                       * This option enables HPTT to operate on intermediate sub-tensors.
  * \param[in] beta scaling factor for B
  * \param[inout] B Pointer to the raw-data of the output tensor B
  * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B.
  *                       * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). 
  *                       * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on sub-tensors.
+ * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B
+ *                       * This parameter may be NULL, indicating that the offset is zero.
+ *                       * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold.
+ *                       * This option enables HPTT to operate on intermediate sub-tensors.
  * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details.
  *                            ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE)
  *                            then the output data will be used during the
@@ -186,6 +194,8 @@ namespace hptt {
  *            HPTT from within a parallel region (i.e., via execute_expert()).
  * \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major).
  */
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets, 
+ * and --ints-- (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                  const float beta, float *B, const int *outerSizeB, 
@@ -210,7 +220,8 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                  const SelectionMethod selectionMethod,
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 
-
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets, 
+ * and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
                  const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, 
                  const float beta, float *B, const std::vector<int> &outerSizeB, 
@@ -235,8 +246,8 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<
                  const SelectionMethod selectionMethod,
                  const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
 
-
-
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
+ * no Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                  const float beta, float *B, const int *outerSizeB, 
@@ -260,6 +271,84 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                  const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, 
                  const int maxAutotuningCandidates,
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class,
+ * --int-- Offsets, and --ints-- (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const float beta, float *B, const int *outerSizeB, const int *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, 
+                 const double beta, double *B, const int *outerSizeB, const int *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                 const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, 
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                 const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class,
+ * --vector int-- Offsets, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
+                 const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA,
+                 const float beta, float *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &perm, const int dim,
+                 const double alpha, const double *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA,
+                 const double beta, double *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                 const FloatComplex alpha, const FloatComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA,
+                 const FloatComplex beta, FloatComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                 const DoubleComplex alpha, const DoubleComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA,
+                 const DoubleComplex beta, DoubleComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
+ * --int-- Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const float beta, float *B, const int *outerSizeB, const int *offsetB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const double beta, double *B, const int *outerSizeB, const int *offsetB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                 const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                 const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 }
 
 extern "C"
@@ -282,12 +371,20 @@ extern "C"
  *                       * This parameter may be NULL, indicating that the outer-size is equal to sizeA. 
  *                       * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on sub-tensors.
+ * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A 
+ *                       * This parameter may be NULL, indicating that the offset is zero.
+ *                       * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold.
+ *                       * This option enables HPTT to operate on intermediate sub-tensors.
  * \param[in] beta scaling factor for B
  * \param[inout] B Pointer to the raw-data of the output tensor B
  * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B.
  *                       * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). 
  *                       * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on sub-tensors.
+ * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B
+ *                       * This parameter may be NULL, indicating that the offset is zero.
+ *                       * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold.
+ *                       * This option enables HPTT to operate on intermediate sub-tensors.
  * \param[in] numThreads number of threads that participate in this tensor transposition.
  * \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major).
  */
@@ -310,4 +407,25 @@ void zTensorTranspose( const int *perm, const int dim,
                  const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, 
                  const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor = 0);
+
+/* With Offsets */
+void sOffsetTensorTranspose( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                 const float beta,        float *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor = 0);
+
+void dOffsetTensorTranspose( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                 const double beta,        double *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor = 0);
+
+void cOffsetTensorTranspose( const int *perm, const int dim,
+                 const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                 const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor = 0);
+
+void zOffsetTensorTranspose( const int *perm, const int dim,
+                 const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                 const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor = 0);
 }
diff --git a/include/transpose.h b/include/transpose.h
index 82f0239..0f480b8 100644
--- a/include/transpose.h
+++ b/include/transpose.h
@@ -53,12 +53,20 @@ class Transpose
        *                       * This parameter may be NULL, indicating that the outer-size is equal to sizeA. 
        *                       * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold.
        *                       * This option enables HPTT to operate on sub-tensors.
+       * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A
+       *                       * This parameter may be NULL, indicating that the offset is zero.
+       *                       * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold.
+       *                       * This option enables HPTT to operate on intermediate sub-tensors.
        * \param[in] beta scaling factor for B
        * \param[inout] B Pointer to the raw-data of the output tensor B
        * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B.
        *                       * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). 
        *                       * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold.
        *                       * This option enables HPTT to operate on sub-tensors.
+       * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B
+       *                       * This parameter may be NULL, indicating that the offset is zero.
+       *                       * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold.
+       *                       * This option enables HPTT to operate on intermediate sub-tensors.
        * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details.
        *                            ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE)
        *                            then the output data will be used during the
@@ -77,6 +85,8 @@ class Transpose
                  const int *perm, 
                  const int *outerSizeA, 
                  const int *outerSizeB, 
+                 const int *offsetA, 
+                 const int *offsetB, 
                  const int dim,
                  const floatType *A,
                  const floatType alpha,
@@ -211,7 +221,7 @@ class Transpose
       void createPlans( std::vector<std::shared_ptr<Plan> > &plans ) const;
       std::shared_ptr<Plan> selectPlan( const std::vector<std::shared_ptr<Plan> > &plans );
       void fuseIndices();
-      void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int dim);
+      void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int *_offsetA, const int *_offsetB, const int dim);
       void computeLeadingDimensions();
       double loopCostHeuristic( const std::vector<int> &loopOrder ) const;
       double parallelismCostHeuristic( const std::vector<int> &loopOrder ) const;
@@ -233,7 +243,7 @@ class Transpose
                                         const std::vector<int> &loopsAllowed) const;
       float getLoadBalance( const std::vector<int> &parallelismStrategy ) const;
       float estimateExecutionTime( const std::shared_ptr<Plan> plan); //execute just a few iterations and extrapolate the result
-      void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const;
+      void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const int dim) const;
       void getBestParallelismStrategy ( std::vector<int> &bestParallelismStrategy ) const;
       void getBestLoopOrder( std::vector<int> &loopOrder ) const; //innermost loop idx is stored at dim_-1
       void getLoopOrders(std::vector<std::vector<int> > &loopOrders) const;
@@ -256,6 +266,8 @@ class Transpose
       std::vector<int> perm_; //!< permutation 
       std::vector<size_t> outerSizeA_; //!< outer sizes of A
       std::vector<size_t> outerSizeB_;  //!< outer sizes of B
+      std::vector<size_t> offsetA_; //!< offsets of A
+      std::vector<size_t> offsetB_;  //!< offsets of B
       std::vector<size_t> lda_;  //!< strides for all dimensions of A (first dimension has a stride of 1)
       std::vector<size_t> ldb_;  //!< strides for all dimensions of B (first dimension has a stride of 1)
       std::vector<int> threadIds_; //!< OpenMP threadIds of the threads involed in the transposition
diff --git a/include/utils.h b/include/utils.h
index a85b27c..b2b1147 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -76,8 +76,8 @@ int findPos(int value, const int *array, int n);
 
 int factorial(int n);
 
-void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *perm, 
-               int *tmpSizeA, int *tmpOuterSizeA, int *tmpouterSizeB, int *tmpPerm, const int dim, const bool useRowMajor);
+void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int *perm, 
+               int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpOffsetA, int *tmpOffsetB, int *tmpPerm, const int dim, const bool useRowMajor);
 }
 
 
diff --git a/src/hptt.cpp b/src/hptt.cpp
index ea761c8..2fc786f 100644
--- a/src/hptt.cpp
+++ b/src/hptt.cpp
@@ -19,13 +19,15 @@
 
 namespace hptt {
 
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
+ * no Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                   const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                   const float beta, float *B, const int *outerSizeB, 
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
    return plan;
 }
 
@@ -35,7 +37,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const in
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
    return plan;
 }
 
@@ -45,7 +47,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, co
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
    return plan;
 }
 
@@ -55,19 +57,19 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
    return plan;
 }
 
-
-
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
+ * no Offsets, and vector ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
                   const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, 
                   const float beta, float *B, const std::vector<int> &outerSizeB, 
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -77,7 +79,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &p
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -87,7 +89,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<i
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -97,18 +99,19 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
-
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
+ * no Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                   const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                   const float beta, float *B, const int *outerSizeB, 
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -120,7 +123,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const in
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -132,7 +135,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, co
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
    plan->createPlan();
    return plan;
 }
@@ -143,7 +146,140 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
+   plan->createPlan();
+   return plan;
+}
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
+ * --int-- Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const float beta, float *B, const int *outerSizeB, const int *offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                  const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const double beta, double *B, const int *outerSizeB, const int *offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                  const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                  const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   return plan;
+}
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
+ * --vector int-- Offsets, and vector ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
+                  const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA,
+                  const float beta, float *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &perm, const int dim,
+                  const double alpha, const double *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA,
+                  const double beta, double *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                  const FloatComplex alpha, const FloatComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA,
+                  const FloatComplex beta, FloatComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                  const DoubleComplex alpha, const DoubleComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA,
+                  const DoubleComplex beta, DoubleComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
+ * --int-- Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const float beta, float *B, const int *outerSizeB, const int *offsetB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
+   plan->createPlan();
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                  const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const double beta, double *B, const int *outerSizeB, const int *offsetB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
+   plan->createPlan();
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                  const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   plan->createPlan();
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                  const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                  const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -157,7 +293,7 @@ void sTensorTranspose( const int *perm, const int dim,
                  const float beta,        float *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->execute();
 }
 
@@ -166,28 +302,69 @@ void dTensorTranspose( const int *perm, const int dim,
                  const double beta,        double *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->execute();
 }
 
 void cTensorTranspose( const int *perm, const int dim,
                  const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, 
-                 const float _Complex beta,        float _Complex *B,                   const int *outerSizeB, 
+                 const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, 
-                         (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, 
+                         (const hptt::FloatComplex*) A, hptt::FloatComplex(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, hptt::FloatComplex(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
 
 void zTensorTranspose( const int *perm, const int dim,
                  const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, 
-                 const double _Complex beta,        double _Complex *B,                   const int *outerSizeB, 
+                 const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, 
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, 
+                         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->setConjA(conjA);
+   plan->execute();
+}
+
+/* With Offset */
+void sOffsetTensorTranspose( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
+                 const float beta,        float *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->execute();
+}
+
+void dOffsetTensorTranspose( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const double beta,        double *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->execute();
+}
+
+void cOffsetTensorTranspose( const int *perm, const int dim,
+                 const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, const int *offsetB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, 
+                         (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->setConjA(conjA);
+   plan->execute();
+}
+
+void zOffsetTensorTranspose( const int *perm, const int dim,
+                 const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
+                 const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, const int *offsetB,
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, 
-                         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, 
+                         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
diff --git a/src/transpose.cpp b/src/transpose.cpp
index f77cd5b..58e5672 100644
--- a/src/transpose.cpp
+++ b/src/transpose.cpp
@@ -373,23 +373,30 @@ static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const si
                                              const floatType alpha ,const floatType beta)
 {
 #ifdef DEBUG
-   assert( blockingA > 0 && blockingB > 0);
+   printf("Macro kernel: blockingA = %d with %zu, blockingB = %d with %zu\n", blockingA, lda, blockingB, ldb);
+   assert( blockingA >= 0 && blockingB >= 0);
 #endif
 
    if( betaIsZero )
       for(int j=0; j < blockingA; ++j)
-         for(int i=0; i < blockingB; ++i)
+         for(int i=0; i < blockingB; ++i){
+#ifdef DEBUG
+            printf("    A[+%zu] = %e -> B[+%zu] = %e\n", i * lda + j, A[i * lda + j], i + j * ldb, B[i + j * ldb]);
+#endif
             if( conjA )
                B[i + j * ldb] = alpha * conj(A[i * lda + j]);
             else
-               B[i + j * ldb] = alpha * A[i * lda + j];
+               B[i + j * ldb] = alpha * A[i * lda + j];}
    else
       for(int j=0; j < blockingA; ++j)
-         for(int i=0; i < blockingB; ++i)
+         for(int i=0; i < blockingB; ++i){
+#ifdef DEBUG
+            printf("    A[+%zu] = %e -> B[+%zu] = %e\n", i * lda + j, A[i * lda + j], i + (j * ldb), B[i + (j * ldb)]);
+#endif
             if( conjA )
                B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb];
             else
-               B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb];
+               B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb];}
 }
 
 template<int blockingA, int blockingB, int betaIsZero, typename floatType, bool useStreamingStores_, bool conjA>
@@ -575,27 +582,43 @@ void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A,
    const int32_t end = plan->end;
    const size_t lda = plan->lda;
    const size_t ldb = plan->ldb;
+   const int offDiffAB = plan->offDiffAB;
    if( plan->next->next != nullptr ){
       // recurse
       int i = plan->start;
-      if( lda == 1)
-         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[i*lda], end - plan->start, &B[i*ldb], sizeStride1B, alpha, beta, plan->next);
-      else if( ldb == 1)
-         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[i*lda], sizeStride1A, &B[i*ldb], end - plan->start, alpha, beta, plan->next);
+#ifdef DEBUG
+      printf("----[SCALAR]Pointing to deeper level, Shifting A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start);
+#endif
+      if( lda == 1 && plan->indexA )
+         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], end - plan->start, &B[i*ldb], sizeStride1B, alpha, beta, plan->next);
+      else if( ldb == 1 && plan->indexB )
+         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], sizeStride1A, &B[i*ldb], end - plan->start, alpha, beta, plan->next);
       else
-         for(; i < end; i++)
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[i*lda], sizeStride1A, &B[i*ldb], sizeStride1B, alpha, beta, plan->next);
+         for(; i < end; i++){
+#ifdef DEBUG
+            printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start);
+#endif
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], sizeStride1A, &B[i*ldb], sizeStride1B, alpha, beta, plan->next);}
    }else{
       // macro-kernel
       const size_t lda_macro = plan->next->lda;
       const size_t ldb_macro = plan->next->ldb;
       int i = plan->start;
       const size_t scalarRemainder = plan->end - plan->start;
+#ifdef DEBUG
+      printf("----[SCALAR]Sending to macro-kernel, Shifting A[+%zu] lda - %zu, B[+%zu] ldb - %zu\n", (i+offDiffAB)*lda, lda, i*ldb, ldb);
+#endif
       if( scalarRemainder > 0 ){
-         if( lda == 1)
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[i*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta);
-         else if( ldb == 1)
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[i*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta);
+         if( lda == 1 && plan->indexA )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta);
+         else if( ldb == 1 && plan->indexB )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta);
+         else
+            for(; i < end; i++){
+#ifdef DEBUG
+               printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start);
+#endif
+               macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta);}
       }
    }
 }
@@ -608,74 +631,123 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
    const int32_t inc = plan->inc;
    const size_t lda = plan->lda;
    const size_t ldb = plan->ldb;
+   const int offDiffAB = plan->offDiffAB;
 
    constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType);
    constexpr int blocking_ = blocking_micro_ * 4;
 
    if( plan->next->next != nullptr ){
+#ifdef DEBUG
+      printf("Pointing to deeper level.\n");
+#endif
       // recurse
       int32_t i;
       for(i = plan->start; i < end; i+= inc)
       {
-         if( i + inc < end )
-            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], &A[(i+1)*lda], &B[i*ldb], &B[(i+1)*ldb], alpha, beta, plan->next);
-         else
-            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+#ifdef DEBUG
+         printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start);
+#endif
+         if( i + inc < end ) {
+            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &A[(i+1+offDiffAB)*lda], &B[i*ldb], &B[(i+1)*ldb], alpha, beta, plan->next);
+         } else if( i == plan->start || i + inc >= end ) {
+            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &A[(i+offDiffAB)*lda], &B[i*ldb], &B[i*ldb], alpha, beta, plan->next);
+         } else {
+            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+         }
       }
       // remainder
       if( blocking_/2 >= blocking_micro_ && (i + blocking_/2) <= plan->end ){
-         if( lda == 1)
-            transpose_int<blocking_/2, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
-         else if( ldb == 1)
-            transpose_int<blockingA, blocking_/2, betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+#ifdef DEBUG
+         printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
+         printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
+#endif
+         if( lda == 1 && plan->indexA )
+            transpose_int<blocking_/2, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+         else if( ldb == 1 && plan->indexB )
+            transpose_int<blockingA, blocking_/2, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
          i+=blocking_/2;
       }
       if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){
-         if( lda == 1)
-            transpose_int<blocking_/4, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
-         else if( ldb == 1)
-            transpose_int<blockingA, blocking_/4, betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+#ifdef DEBUG
+         printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
+         printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
+#endif
+         if( lda == 1 && plan->indexA )
+            transpose_int<blocking_/4, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+         else if( ldb == 1 && plan->indexB )
+            transpose_int<blockingA, blocking_/4, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
          i+=blocking_/4;
       }
       const size_t scalarRemainder = plan->end - i;
       if( scalarRemainder > 0 ){
-         if( lda == 1)
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[i*lda], scalarRemainder, &B[i*ldb], -1, alpha, beta, plan->next);
+#ifdef DEBUG
+         printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
+         printf("---- Passing to scalar. Blocking_ to %zu (blA = %d, blB = %d) where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, blockingA, blockingB, lda, ldb);
+#endif
+         if( lda == 1 && plan->indexA )
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], scalarRemainder, &B[i*ldb], blockingB, alpha, beta, plan->next);
+         else if ( ldb == 1 && plan->indexB )
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], blockingA, &B[i*ldb], scalarRemainder, alpha, beta, plan->next);
          else
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[i*lda], -1, &B[i*ldb], scalarRemainder, alpha, beta, plan->next);
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], blockingA, &B[i*ldb], blockingB, alpha, beta, plan->next);
       }
    } else {
       const size_t lda_macro = plan->next->lda;
       const size_t ldb_macro = plan->next->ldb;
       // invoke macro-kernel
-      
+#ifdef DEBUG
+      printf("Sending to macro-kernel.\n");
+#endif
       int32_t i;
       for(i = plan->start; i < end; i+= inc)
-         if( i + inc < end )
-            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[i*lda], &A[(i+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta);
-         else
-            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+      {
+#ifdef DEBUG
+         printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start);
+         printf("--------- Case A: %zu < %zu, Case B: %zu == %zu or %zu > %zu, else Case C\n", i + inc, end, i, plan->start, i + inc, end);
+#endif
+         if( i + inc < end ) {
+            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], &A[(i+offDiffAB+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta);
+         } else if( i == plan->start || i + inc > end ) {
+            break; //TODO: in this case blocking may be incorrect - hopefully nothing lands here anyway.
+         } else {
+            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+         }
+      }
       // remainder
       if( blocking_/2 >= blocking_micro_ && (i + blocking_/2) <= plan->end ){
-         if( lda == 1)
-            macro_kernel<blocking_/2, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
-         else if( ldb == 1)
-            macro_kernel<blockingA, blocking_/2, betaIsZero,floatType, useStreamingStores, conjA>(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+#ifdef DEBUG
+         printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
+         printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
+#endif
+         if( lda == 1 && plan->indexA )
+            macro_kernel<blocking_/2, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+         else if( ldb == 1 && plan->indexB )
+            macro_kernel<blockingA, blocking_/2, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
          i+=blocking_/2;
       }
       if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){
-         if( lda == 1)
-            macro_kernel<blocking_/4, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
-         else if( ldb == 1)
-            macro_kernel<blockingA, blocking_/4, betaIsZero,floatType, useStreamingStores, conjA>(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+#ifdef DEBUG
+         printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
+         printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
+#endif
+         if( lda == 1 && plan->indexA )
+            macro_kernel<blocking_/4, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+         else if( ldb == 1 && plan->indexB )
+            macro_kernel<blockingA, blocking_/4, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
          i+=blocking_/4;
       }
       const size_t scalarRemainder = plan->end - i;
       if( scalarRemainder > 0 ){
-         if( lda == 1)
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[i*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, blockingB, alpha, beta);
-         else if( ldb == 1)
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[i*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta);
+#ifdef DEBUG
+         printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
+         printf("---- Passing to scalar. Blocking_ to %zu where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, lda, ldb);
+#endif
+         if( lda == 1 && plan->indexA )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, blockingB, alpha, beta);
+         else if( ldb == 1 && plan->indexB )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta);
+         else
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, blockingB, alpha, beta);
       }
    }
 }
@@ -688,36 +760,48 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
    constexpr int32_t inc = 1; // TODO
    const size_t lda = plan->lda;
    const size_t ldb = plan->ldb;
+   const int offDiffAB = plan->offDiffAB;
 
    if( plan->next != nullptr )
-      for(int i = plan->start; i < end; i+= inc)
+      for(int i = plan->start; i < end; i+= inc) {
          // recurse
-         transpose_int_constStride1<betaIsZero, floatType, useStreamingStores, conjA>( &A[i*lda], &B[i*ldb], alpha, beta, plan->next);
-   else 
+#ifdef DEBUG
+         printf("[CONST](Loop %zu of %zu)Shifting A[+%zu], B[+%zu]\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb);
+#endif
+         transpose_int_constStride1<betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &B[i*ldb], alpha, beta, plan->next);
+      }
+   else
       if( !betaIsZero )
       {
-         for(int32_t i = plan->start; i < end; i+= inc)
+#ifdef DEBUG
+         printf("[CONST]Setting A[+%zu], B[+%zu] (from %zu to %d)\n", (plan->start+offDiffAB), plan->start, plan->start, end);
+#endif
+         for(int32_t i = plan->start; i < end; i+= inc){
             if( conjA )
-               B[i] = alpha * conj(A[i]) + beta * B[i];
+               B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]) + beta * B[i*ldb];
             else
-               B[i] = alpha * A[i] + beta * B[i];
+               B[i*ldb] = alpha * A[(i+offDiffAB)*lda] + beta * B[i*ldb];
+         }
       } else {
+#ifdef DEBUG
+         printf("[CONST]Setting A[+%zu], B[+%zu] (from %zu to %d)\n", (plan->start+offDiffAB), plan->start, plan->start, end);
+#endif
          if( useStreamingStores)
             if( conjA )
 #pragma vector nontemporal
                for(int32_t i = plan->start; i < end; i+= inc)
-                  B[i] = alpha * conj(A[i]);
+                  B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]);
             else
 #pragma vector nontemporal
                for(int32_t i = plan->start; i < end; i+= inc)
-                  B[i] = alpha * A[i];
+                  B[i*ldb] = alpha * A[(i+offDiffAB)*lda];
          else
             if( conjA )
                for(int32_t i = plan->start; i < end; i+= inc)
-                  B[i] = alpha * conj(A[i]);
+                  B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]);
             else
                for(int32_t i = plan->start; i < end; i+= inc)
-                  B[i] = alpha * A[i];
+                  B[i*ldb] = alpha * A[(i+offDiffAB)*lda];
       }
 }
 
@@ -727,6 +811,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                  const int *perm, 
                  const int *outerSizeA, 
                  const int *outerSizeB, 
+                 const int *offsetA, 
+                 const int *offsetB, 
                  const int dim,
                  const floatType *A,
                  const floatType alpha,
@@ -756,13 +842,17 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
          int tmpSizeA[dim];
          int tmpOuterSizeA[dim];
          int tmpOuterSizeB[dim];
-         accountForRowMajor(sizeA, outerSizeA, outerSizeB, perm, 
-               tmpSizeA, tmpOuterSizeA, tmpOuterSizeB, tmpPerm, dim, useRowMajor); 
+         int tmpOffsetA[dim];
+         int tmpOffsetB[dim];
+         accountForRowMajor(sizeA, outerSizeA, outerSizeB, offsetA, offsetB, perm, 
+               tmpSizeA, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, tmpPerm, dim, useRowMajor); 
 
          sizeA_.resize(dim);
          perm_.resize(dim);
          outerSizeA_.resize(dim);
          outerSizeB_.resize(dim);
+         offsetA_.resize(dim);
+         offsetB_.resize(dim);
          lda_.resize(dim);
          ldb_.resize(dim);
          if(threadIds){
@@ -775,10 +865,10 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                threadIds_.push_back(i);
          }
 
-         verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, dim);
+         verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, dim);
 
          // initializes dim_, outerSizeA, outerSizeB, sizeA and perm 
-         skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, dim);
+         skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, dim);
          fuseIndices();
 
          // initializes lda_ and ldb_
@@ -803,6 +893,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                                           perm_(other.perm_),
                                           outerSizeA_(other.outerSizeA_),
                                           outerSizeB_(other.outerSizeB_),
+                                          offsetA_(other.offsetA_),
+                                          offsetB_(other.offsetB_),
                                           lda_(other.lda_),
                                           ldb_(other.ldb_),
                                           threadIds_(other.threadIds_),
@@ -867,16 +959,16 @@ void Transpose<floatType>::executeEstimate(const Plan *plan) noexcept
 
 
 template<int betaIsZero, typename floatType, bool useStreamingStores, bool spawnThreads, bool conjA>
-static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const floatType alpha, const floatType beta, int numThreads)
+static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const int &offDiffAB_, const floatType alpha, const floatType beta, int numThreads)
 {
    if( !betaIsZero )
    {
       HPTT_DUPLICATE(spawnThreads,
          for(int32_t i = myStart; i < myEnd; i++)
             if( conjA )
-               B[i] = alpha * conj(A[i]) + beta * B[i];
+               B[i] = alpha * conj(A[i + offDiffAB_]) + beta * B[i];
             else
-               B[i] = alpha * A[i] + beta * B[i];
+               B[i] = alpha * A[i + offDiffAB_] + beta * B[i];
       )
    } else {
       if( useStreamingStores)
@@ -884,17 +976,17 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B,
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t i = myStart; i < myEnd; i++)
                if( conjA )
-                  B[i] = alpha * conj(A[i]);
+                  B[i] = alpha * conj(A[i + offDiffAB_]);
                else
-                  B[i] = alpha * A[i];
+                  B[i] = alpha * A[i + offDiffAB_];
          )
       else
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t i = myStart; i < myEnd; i++)
                if( conjA )
-                  B[i] = alpha * conj(A[i]);
+                  B[i] = alpha * conj(A[i + offDiffAB_]);
                else
-                  B[i] = alpha * A[i];
+                  B[i] = alpha * A[i + offDiffAB_];
          )
    }
 }
@@ -902,37 +994,37 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B,
 template<int betaIsZero, typename floatType, bool useStreamingStores, bool spawnThreads, bool conjA>
 static void axpy_2D( const floatType* __restrict__ A, const int lda, 
                         floatType* __restrict__ B, const int ldb, 
-                        const int n0, const int myStart, const int myEnd, const floatType alpha, const floatType beta, int numThreads)
+                        const int n0, const int myStart, const int myEnd, const int (&offDiffAB_)[2], const int offsetB_, const floatType alpha, const floatType beta, int numThreads)
 {
    if( !betaIsZero )
    {
       HPTT_DUPLICATE(spawnThreads,
          for(int32_t j = myStart; j < myEnd; j++)
-            for(int32_t i = 0; i < n0; i++)
+            for(int32_t i = offsetB_; i < n0 + offsetB_; i++)
                if( conjA )
-                  B[i + j * ldb] = alpha * conj(A[i + j * lda]) + beta * B[i + j * ldb];
+                  B[i + j * ldb] = alpha * conj(A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda]) + beta * B[i + j * ldb];
                else
-                  B[i + j * ldb] = alpha * A[i + j * lda] + beta * B[i + j * ldb];
+                  B[i + j * ldb] = alpha * A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda] + beta * B[i + j * ldb];
       )
    } else {
       if( useStreamingStores)
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t j = myStart; j < myEnd; j++)
 _Pragma("vector nontemporal")
-            for(int32_t i = 0; i < n0; i++)
+            for(int32_t i = offsetB_; i < n0 + offsetB_; i++)
                if( conjA )
-                  B[i + j * ldb] = alpha * conj(A[i + j * lda]);
+                  B[i + j * ldb] = alpha * conj(A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda]);
                else
-                  B[i + j * ldb] = alpha * A[i + j * lda];
+                  B[i + j * ldb] = alpha * A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda];
          )
       else
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t j = myStart; j < myEnd; j++)
-               for(int32_t i = 0; i < n0; i++)
+               for(int32_t i = offsetB_; i < n0 + offsetB_; i++)
                   if( conjA )
-                     B[i + j * ldb] = alpha * conj(A[i + j * lda]);
+                     B[i + j * ldb] = alpha * conj(A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda]);
                   else
-                     B[i + j * ldb] = alpha * A[i + j * lda];
+                     B[i + j * ldb] = alpha * A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda];
          )
    }
 }
@@ -987,22 +1079,27 @@ void Transpose<floatType>::execute_expert() noexcept
    int myStart = 0;
    int myEnd = 0;
 
+#ifdef DEBUG
+   printf("Executing expert with dim = %d\n", dim_);
+#endif
    if( dim_ == 1)
    {
       getStartEnd<spawnThreads>(sizeA_[0], myStart, myEnd);
+      const int offDiffAB_ = (int)offsetA_[0] - (int)offsetB_[0];
       if( conjA_ )
-         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, B_, myStart, myEnd, alpha_, beta_, numThreads_ );
+         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, alpha_, beta_, numThreads_ );
       else
-         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, B_, myStart, myEnd, alpha_, beta_, numThreads_ );
+         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, alpha_, beta_, numThreads_ );
       return;
    }
    else if( dim_ == 2 && perm_[0] == 0)
    {
       getStartEnd<spawnThreads>(sizeA_[1], myStart, myEnd);
+      const int offDiffAB_[2] = {((int)offsetA_[0] - (int)offsetB_[0]), ((int)offsetA_[1] - (int)offsetB_[1])};
       if( conjA_ )
-         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart, myEnd, alpha_, beta_, numThreads_ );
+         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ );
       else
-         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart, myEnd, alpha_, beta_, numThreads_ );
+         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ );
       return;
    }
 
@@ -1420,7 +1517,7 @@ void Transpose<floatType>::getParallelismStrategies(std::vector<std::vector<int>
 }
 
 template<typename floatType>
-void Transpose<floatType>::verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const
+void Transpose<floatType>::verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const int dim) const
 {
    if ( dim < 1 ) {
       fprintf(stderr,"[HPTT] ERROR: dimensionality too low.\n");
@@ -1457,6 +1554,20 @@ void Transpose<floatType>::verifyParameter(const int *size, const int* perm, con
             fprintf(stderr,"[HPTT] ERROR: outerSizeB invalid\n");
             exit(-1);
          }
+      
+   if ( offsetA != NULL )
+      for(int i=0;i < dim ; ++i)
+         if ( offsetA[i] + size[i] > outerSizeA[i] ) {
+            fprintf(stderr,"[HPTT] ERROR: offsetA invalid\n");
+            exit(-1);
+         }
+
+   if ( offsetB != NULL )
+      for(int i=0;i < dim ; ++i)
+         if ( offsetB[i] + size[perm[i]] > outerSizeB[i] ) {
+            fprintf(stderr,"[HPTT] ERROR: offsetB invalid\n");
+            exit(-1);
+         }
 }
 
 template<typename floatType>
@@ -1480,7 +1591,7 @@ void Transpose<floatType>::computeLeadingDimensions()
 }
 
 template<typename floatType>
-void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const int *outerSizeA, const int *outerSizeB, const int dim)
+void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int dim)
 {
    for(int i=0;i < dim ; ++i)
    {
@@ -1494,6 +1605,14 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
          outerSizeB_[i] = outerSizeB[i];
       else
          outerSizeB_[i] = sizeA[perm[i]];
+      if(offsetA)
+         offsetA_[i] = offsetA[i];
+      else
+         offsetA_[i] = 0;
+      if(offsetB)
+         offsetB_[i] = offsetB[i];
+      else
+         offsetB_[i] = 0;
    }
 
    int skipped = 0;
@@ -1508,6 +1627,8 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
          sizeA_[i] = -1;
          outerSizeA_[i] = -1;
          outerSizeB_[idxB] = -1;
+         offsetA_[i] = -1;
+         offsetB_[idxB] = -1;
          perm_[idxB] = -1;
          skipped++;
       }
@@ -1530,8 +1651,10 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
          for(;j < dim ; ++j)
             if( outerSizeA_[j] != -1 )
                break;
-         if( j < dim )
+         if( j < dim ) {
             std::swap(outerSizeA_[i], outerSizeA_[j]);
+            std::swap(offsetA_[i], offsetA_[j]);
+         }
       }
    for(int i=0;i < dim ; ++i)
       if( outerSizeB_[i] == -1 )
@@ -1540,8 +1663,10 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
          for(;j < dim ; ++j)
             if( outerSizeB_[j] != -1 )
                break;
-         if( j < dim )
+         if( j < dim ) {
             std::swap(outerSizeB_[i], outerSizeB_[j]);
+            std::swap(offsetB_[i], offsetB_[j]);
+         }
       }
    for(int i=0;i < dim ; ++i)
       if( perm_[i] == -1 )
@@ -1565,11 +1690,15 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
       sizeA_[0] = 1;
       outerSizeA_[0] = 1;
       outerSizeB_[0] = 1;
+      offsetA_[0] = 0;
+      offsetB_[0] = 0;
    } else {
       perm_.resize(dim_);
       sizeA_.resize(dim_);
       outerSizeA_.resize(dim_);
       outerSizeB_.resize(dim_);
+      offsetA_.resize(dim_);
+      offsetB_.resize(dim_);
 
       // remove gaps in the perm, if requried (e.g., perm=3,1,0 -> 2,1,0)
       int currentValue = 0;
@@ -1630,11 +1759,13 @@ void Transpose<floatType>::fuseIndices()
       sizeA_[std::get<0>(tup)] *= sizeA_[std::get<1>(tup)];
       outerSizeA_[std::get<0>(tup)] *= outerSizeA_[std::get<1>(tup)];
       outerSizeA_[std::get<1>(tup)] = -1;
+      offsetA_[std::get<1>(tup)] = -1;
 
       auto pos1 = std::find(perm_.begin(), perm_.end(), std::get<0>(tup)) - perm_.begin();
       auto pos2 = std::find(perm_.begin(), perm_.end(), std::get<1>(tup)) - perm_.begin();
       outerSizeB_[pos1] *= outerSizeB_[pos2];
       outerSizeB_[pos2] = -1;
+      offsetB_[pos2] = -1;
    }
 
    if ( fusedIndices.size() > 0 ) {
@@ -1668,8 +1799,10 @@ void Transpose<floatType>::fuseIndices()
             for(;j < dim_ ; ++j)
                if( outerSizeA_[j] != -1 )
                   break;
-            if( j < dim_ )
+            if( j < dim_ ) {
                std::swap(outerSizeA_[i], outerSizeA_[j]);
+               std::swap(offsetA_[i], offsetA_[j]);
+            }
          }
       for(int i=0;i < dim_ ; ++i)
          if( outerSizeB_[i] == -1 )
@@ -1678,12 +1811,16 @@ void Transpose<floatType>::fuseIndices()
             for(;j < dim_ ; ++j)
                if( outerSizeB_[j] != -1 )
                   break;
-            if( j < dim_ )
+            if( j < dim_ ) {
                std::swap(outerSizeB_[i], outerSizeB_[j]);
+               std::swap(offsetB_[i], offsetB_[j]);
+            }
          }
       dim_ -= fusedIndices.size();
       outerSizeA_.resize(dim_);
       outerSizeB_.resize(dim_);
+      offsetA_.resize(dim_);
+      offsetB_.resize(dim_);
       sizeA_.resize(dim_);
       perm_.resize(dim_);
 
@@ -1693,13 +1830,19 @@ void Transpose<floatType>::fuseIndices()
          printf("%d ",perm_[i]);
       printf("\nsizes_new: ");
       for(int i=0;i < dim_ ; ++i)
-         printf("%d ",sizeA_[i]);
+         printf("%lu ",sizeA_[i]);
       printf("\nouterSizeA_new: ");
       for(int i=0;i < dim_ ; ++i)
-         printf("%d ",outerSizeA_[i]);
+         printf("%lu ",outerSizeA_[i]);
       printf("\nouterSizeB_new: ");
       for(int i=0;i < dim_ ; ++i)
-         printf("%d ",outerSizeB_[i]);
+         printf("%lu ",outerSizeB_[i]);
+      printf("\noffsetA_new: ");
+      for(int i=0;i < dim_ ; ++i)
+         printf("%lu ",offsetA_[i]);
+      printf("\noffsetB_new: ");
+      for(int i=0;i < dim_ ; ++i)
+         printf("%lu ",offsetB_[i]);
       printf("\n");
 #endif
    } 
@@ -1951,11 +2094,17 @@ void Transpose<floatType>::createPlans( std::vector<std::shared_ptr<Plan> > &pla
                   const int commId = (taskIdComm/numThreadsPerComm); //  = 0,0,1,1,2,2
                   taskIdComm = taskIdComm % numThreadsPerComm; // local taskId in next communicator // 0,1,0,1,0,1
 
-                  currentNode->start = std::min( sizeA_[index], commId * workPerThread * currentNode->inc );
-                  currentNode->end = std::min( sizeA_[index], (commId+1) * workPerThread * currentNode->inc );
+                  if (index == 0) currentNode->indexA = true;
+                  if (findPos(index, perm_) == 0) currentNode->indexB = true;
+                  currentNode->start = std::min( sizeA_[index] + offsetB_[findPos(index, perm_)], (commId * workPerThread * currentNode->inc) + offsetB_[findPos(index, perm_)] );
+                  currentNode->end = std::min( sizeA_[index] + offsetB_[findPos(index, perm_)], ((commId+1) * workPerThread * currentNode->inc) + offsetB_[findPos(index, perm_)] );
 
                   currentNode->lda = lda_[index];
                   currentNode->ldb = ldb_[findPos(index, perm_)];
+                  currentNode->offDiffAB = (int)offsetA_[index] - (int)offsetB_[findPos(index, perm_)];
+#ifdef DEBUG
+                  printf("(Task %d, Node %p) Level %d is IndexA %d, IndexB %d. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",taskId,currentNode,l,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
+#endif
 
                   if( perm_[0] != 0 || l != dim_-1 ){
                      currentNode->next = new ComputeNode;
@@ -1966,12 +2115,17 @@ void Transpose<floatType>::createPlans( std::vector<std::shared_ptr<Plan> > &pla
                //macro-kernel
                if( perm_[0] != 0 )
                {
+                  if (posStride1A_inB == 0) currentNode->indexB = true;
                   currentNode->start = -1;
                   currentNode->end = -1;
                   currentNode->inc = -1;
                   currentNode->lda = lda_[ posStride1B_inA ];
                   currentNode->ldb = ldb_[ posStride1A_inB ];
+                  currentNode->offDiffAB = (int)offsetA_[ posStride1B_inA ] - (int)offsetB_[ posStride1A_inB ];
                   currentNode->next = nullptr;
+#ifdef DEBUG
+                  printf("    Adjust Node (%p) IndexA: %d, IndexB: %d, Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",currentNode,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB);
+#endif
                }
             }
             plans.push_back(plan);
@@ -2078,7 +2232,7 @@ std::shared_ptr<Plan> Transpose<floatType>::selectPlan( const std::vector<std::s
          }
       }
       if( this->infoLevel_ > 0 )
-         printf("We evaluated %d/%d candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); 
+         printf("We evaluated %d/%zu candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); 
    }
    return plans[bestPlan_id];
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index 7742bce..abb29f8 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -46,8 +46,8 @@ int factorial(int n){
    return n * factorial(n-1);
 }
 
-void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *perm, 
-      int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpPerm, const int dim, const bool useRowMajor)
+void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int *perm, 
+      int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpOffsetA, int *tmpOffsetB, int *tmpPerm, const int dim, const bool useRowMajor)
 {
    for(int i=0; i < dim; ++i){
       int idx = i;
@@ -66,6 +66,14 @@ void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *oute
          tmpOuterSizeB[i] = sizeA[perm[idx]];
       else
          tmpOuterSizeB[i] = outerSizeB[idx];
+      if( offsetA == nullptr )
+         tmpOffsetA[i] = 0;
+      else
+         tmpOffsetA[i] = offsetA[idx];
+      if( offsetB == nullptr )
+         tmpOffsetB[i] = 0;
+      else
+         tmpOffsetB[i] = offsetB[idx];
    }
 }
 
diff --git a/testframework/Makefile b/testframework/Makefile
index 3e87780..38eca08 100644
--- a/testframework/Makefile
+++ b/testframework/Makefile
@@ -28,8 +28,11 @@ LIBS=-lhptt
 all: ${OBJ}
 	${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o testframework.exe
 
+LIB_PATH+= $(OPENMP_LIB_PATH)
+LIBS+= -lomp
+
 %.o: %.cpp
-	${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@
+	${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@
 
 clean:
 	rm -rf *.o benchmark.exe
diff --git a/testframework/testframework.cpp b/testframework/testframework.cpp
index bfc11d2..e8b0140 100644
--- a/testframework/testframework.cpp
+++ b/testframework/testframework.cpp
@@ -56,6 +56,9 @@ int equal_(const floatType *A, const floatType*B, int total_size){
          }
       }
    }
+#ifdef DEBUG
+   printf("\nNumber of Errors: %d of %d\n", error, total_size);
+#endif
    return (error == 0) ? 1 : 0;
 }
 
@@ -67,22 +70,40 @@ void restore(const floatType* A, floatType* B, size_t n)
 }
 
 template<typename floatType>
-static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, floatType &beta, 
+static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, 
+      uint32_t *outerSizeA, uint32_t *outerSizeB,
+      uint32_t *offsetA, uint32_t *offsetB,
+      floatType &beta, 
       int &numThreads, 
-      std::string &perm_str, 
-      std::string &size_str, 
-      const int total_size)
+      std::string &perm_str, std::string &size_str, 
+      std::string &outerSizeA_str, std::string &offsetA_str, 
+      std::string &outerSizeB_str, std::string &offsetB_str, 
+      const int total_size, bool subTensors)
 {
    dim = (rand() % MAX_DIM) + 1;
    uint32_t maxSizeDim = std::max(1.0, std::pow(total_size, 1.0/dim));
    std::vector<int> perm_(dim);
    for(int i=0;i < dim ; ++i){
-      size[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, 1.);
+      outerSizeA[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, 1.);
+      size[i] = outerSizeA[i];
+      if (subTensors) 
+         size[i] = std::max((((double)rand())/RAND_MAX) * outerSizeA[i], 1.);
       perm_[i] = i;
    }
    std::random_shuffle(perm_.begin(), perm_.end());
-   for(int i=0;i < dim ; ++i)
+   for(int i=0;i < dim ; ++i) 
+   {
       perm[i] = perm_[i];
+      outerSizeB[i] = outerSizeA[perm[i]];
+      offsetA[i] = 0;
+      offsetB[i] = 0;
+      if (subTensors)
+      {
+         outerSizeB[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, (double)size[perm[i]]);
+         offsetA[i] = std::max((((double)rand())/RAND_MAX) * (outerSizeA[i] - size[i]), 0.);
+         offsetB[i] = std::max((((double)rand())/RAND_MAX) * (outerSizeB[i] - size[perm[i]]), 0.);
+      }
+   }
 
    numThreads = std::max(std::round((((double)rand())/RAND_MAX) * 24), 1.);
    if( rand() > RAND_MAX/2 )
@@ -93,16 +114,24 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, floatType &b
    for(int i=0;i < dim ; ++i){
       perm_str += std::to_string(perm[i]) + " ";
       size_str += std::to_string(size[i]) + " ";
+      outerSizeA_str += std::to_string(outerSizeA[i]) + " ";
+      offsetA_str += std::to_string(offsetA[i]) + " ";
+      outerSizeB_str += std::to_string(outerSizeB[i]) + " ";
+      offsetB_str += std::to_string(offsetB[i]) + " ";
    }
    printf("dim: %d\n", dim);
    printf("beta: %f\n", std::real(beta));
    printf("perm: %s\n", perm_str.c_str());
    printf("size: %s\n", size_str.c_str());
+   printf("outerSizeA: %s\n", outerSizeA_str.c_str());
+   printf("outerSizeB: %s\n", outerSizeB_str.c_str());
+   printf("offsetA: %s\n", offsetA_str.c_str());
+   printf("offsetB: %s\n", offsetB_str.c_str());
    printf("numThreads: %d\n",numThreads);
 }
 
 template<typename floatType>
-void runTests()
+void runTests(bool subTensors = false)
 {
    int numThreads = 1;
    floatType alpha = 2.;
@@ -113,6 +142,10 @@ void runTests()
    int dim;
    uint32_t perm[MAX_DIM];
    uint32_t size[MAX_DIM];
+   uint32_t outerSizeA[MAX_DIM];
+   uint32_t outerSizeB[MAX_DIM];
+   uint32_t offsetA[MAX_DIM];
+   uint32_t offsetB[MAX_DIM];
    size_t total_size = 128*1024*1024;
 
    // Allocating memory for tensors
@@ -141,21 +174,34 @@ void runTests()
    {  
       std::string perm_str = "";
       std::string size_str = "";
-      getRandomTest(dim, perm, size, beta, numThreads, perm_str, size_str, total_size);
+      std::string outerSizeA_str = "";
+      std::string outerSizeB_str = "";
+      std::string offsetA_str = "";
+      std::string offsetB_str = "";
+      std::cout<<"Test "<<j<<std::endl;
+      getRandomTest(dim, perm, size, outerSizeA, outerSizeB, offsetA, offsetB, beta, numThreads, perm_str, size_str, outerSizeA_str, offsetA_str, outerSizeB_str, offsetB_str, total_size, subTensors);
       int perm_[dim];
       int size_[dim];
+      int outerSizeA_[dim];
+      int outerSizeB_[dim];
+      int offsetA_[dim];
+      int offsetB_[dim];
       for(int i=0;i < dim ; ++i){
          perm_[i] = (int)perm[i];
          size_[i] = (int)size[i];
+         outerSizeA_[i] = (int)outerSizeA[i];
+         outerSizeB_[i] = (int)outerSizeB[i];
+         offsetA_[i] = (int)offsetA[i];
+         offsetB_[i] = (int)offsetB[i];
       }
 
       auto plan = hptt::create_plan( perm_, dim, 
-            alpha, A, size_, NULL, 
-            beta, B_hptt, NULL, 
+            alpha, A, size_, outerSizeA_, offsetA_,
+            beta, B_hptt, outerSizeB_, offsetB_,
             hptt::ESTIMATE, numThreads);
 
       restore(B, B_ref, total_size);
-      transpose_ref<floatType>( size, perm, dim, A, alpha, B_ref, beta, false);
+      transpose_ref<floatType>(size, perm, dim, A, alpha, outerSizeA_, offsetA_, B_ref, beta, outerSizeB_, offsetB_, false);
 
       restore(B, B_hptt, total_size);
       plan->execute();
@@ -163,10 +209,12 @@ void runTests()
       if( !equal_(B_ref, B_hptt, total_size) )
       {
          fprintf(stderr, "Error in HPTT.\n");
-         fprintf(stderr,"%d OMP_NUM_THREADS=%d ./benchmark.exe %d  %s  %s\n",sizeof(floatType), numThreads, dim, perm_str.c_str(), size_str.c_str());
+         fprintf(stderr,"%lu OMP_NUM_THREADS=%d ./benchmark.exe %d  %s  %s\n",sizeof(floatType), numThreads, dim, perm_str.c_str(), size_str.c_str());
          exit(-1);
       }
+      std::cout << "Test " << j << " passed." << std::endl;
    }
+   std::cout << "All tests passed." << std::endl;
    free(A);
    free(B);
    free(B_ref);
@@ -177,15 +225,19 @@ int main(int argc, char *argv[])
 {
   printf("float tests: \n");
   runTests<float>();
+  runTests<float>(true);
 
   printf("double tests: \n");
   runTests<double>();
+  runTests<double>(true);
 
   printf("float complex tests: \n");
   runTests<FloatComplex>();
+  runTests<FloatComplex>(true);
 
   printf("double complex tests: \n");
   runTests<DoubleComplex>();
+  runTests<DoubleComplex>(true);
 
   printf("[SUCCESS] All tests have passed.\n");
   return 0;

From f4a227199c5d48d5914f93c8bc849bde7610150e Mon Sep 17 00:00:00 2001
From: Nathan Harmer <njh80@cam.ac.uk>
Date: Fri, 28 Mar 2025 15:35:29 +0000
Subject: [PATCH 2/3] **FEAT: Non-integer innerStrides**

Sub-Tensors often omit their inner-most dimension meaning that they access their source data without an inner stride of one. This commit adds a basic level of support for this in a similar way to the support for offsets. Inner Strides are optional arguments and are supplied as integers.

*benchmark/benchmark.cpp*
Amends reference to `transpose_ref` to include nullptrs to inner strides.

*benchmark/reference.cpp*
`transpose_ref` can now receive non-integer inner strides - used for evaluating tests.

*benchmark/reference.h*
Amends template function.

*include/hptt.h*
Creates overloads including innerStrides (size_t) for create_plan calls

*include/transpose.h*
Amends functions to receive innerStrides as inputs.

*src/hptt.cpp*
Includes the new overloads and amends existing to pass nullptr objects in the cases where inner strides are not supplied.

*src/transpose.cpp*
Amends behaviour of execution to include innerStrides. As `transpose_int` functions are not part of the `Transpose` class, the innerStrides must be passed as new arguments, unchanged throughout, to the `macro_kernel_scalar` and `micro_kernel`. These then use the new strides.

An attempt to write support for Arch ARM and Arch AVX has been written but the execution of these is unchecked as the author is not working with access to these operating systems. Further, no support has been included for the B buffer case in the Macro-Kernel which in theory could be included.

Further, as a comment to the offset version as well, there has not been any changes made to the plan generation stages - be the effectiveness of these will likely be altered by these commits.

*testframework/testframework.cpp*
Tests have been added for innerStrides of 1 or 2 to test behaviour (in theory larger strides are fine but exceed the memory capabilities of my device).
---
 benchmark/benchmark.cpp         |   2 +-
 benchmark/reference.cpp         |  38 +-
 benchmark/reference.h           |   4 +-
 include/hptt.h                  | 129 ++++-
 include/transpose.h             |   9 +-
 src/hptt.cpp                    | 265 +++++++--
 src/transpose.cpp               | 923 +++++++++++++++++++++++---------
 testframework/testframework.cpp |  42 +-
 8 files changed, 1060 insertions(+), 352 deletions(-)

diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index a456394..4d4c520 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -192,7 +192,7 @@ int main(int argc, char *argv[])
         restore(B, B_ref, total_size);
         trashCache(trash1, trash2, largerThanL3);
         auto begin_time = omp_get_wtime();
-        transpose_ref( size, perm, dim, A, alpha, nullptr, nullptr, B_ref, beta, nullptr, nullptr, false);
+        transpose_ref( size, perm, dim, A, alpha, nullptr, nullptr, 1, B_ref, beta, nullptr, nullptr, 1, false);
         double elapsed_time = omp_get_wtime() - begin_time;
         minTime = (elapsed_time < minTime) ? elapsed_time : minTime;
      }
diff --git a/benchmark/reference.cpp b/benchmark/reference.cpp
index d7289a0..3a18e28 100644
--- a/benchmark/reference.cpp
+++ b/benchmark/reference.cpp
@@ -16,8 +16,8 @@
 
 template<typename floatType>
 void transpose_ref( uint32_t *size, uint32_t *perm, int dim, 
-      const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA,  
-      floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, const bool conjA)
+      const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA,  
+      floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA)
 {
    std::vector<int> tempOuterSizeA, tempOuterSizeB, tempOffsetA, tempOffsetB, tempPointerB;
 
@@ -57,13 +57,13 @@ void transpose_ref( uint32_t *size, uint32_t *perm, int dim,
 
    // compute stride for all dimensions w.r.t. A (like lda)
    uint32_t strideA[dim];
-   strideA[0] = 1;
+   strideA[0] = innerStrideA;
    for(int i=1; i < dim; ++i) 
       strideA[i] = strideA[i-1] * outerSizeA[i-1];
 
    // compute stride for all dimensions w.r.t. B (like ldb)
    uint32_t strideB[dim];
-   strideB[0] = 1;
+   strideB[0] = innerStrideB;
    for(int i=1; i < dim; ++i)
       strideB[i] = strideB[i-1] * outerSizeB[i-1];
 
@@ -97,39 +97,39 @@ void transpose_ref( uint32_t *size, uint32_t *perm, int dim,
       }
 
       const floatType* __restrict__ A_ = A + indexOffsetA;
-      floatType* __restrict__ B_ = B + indexOffsetB + offsetB[0] + (j * outerSizeB[0]);
+      floatType* __restrict__ B_ = B + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB);
 
       if( beta == (floatType) 0 )
          for(int i=0; i < sizeInner; ++i) {
 #ifdef DEBUG
-            printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], i + offsetB[0] + indexOffsetB + (j * outerSizeB[0]), B_[i]);
+            //printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]);
 #endif
             if( conjA )
-               B_[i] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real();
+               B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real();
             else
-               B_[i] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner];}
+               B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner];}
       else
          for(int i=0; i < sizeInner; ++i) {
 #ifdef DEBUG
-            printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], i + offsetB[0] + indexOffsetB + (j * outerSizeB[0]), B_[i]);
+            //printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]);
 #endif
             if( conjA )
-               B_[i] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real() + beta * B_[i];
+               B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real() + beta * B_[i * innerStrideB];
             else
-               B_[i] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner] + beta * B_[i];}
+               B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner] + beta * B_[i * innerStrideB];}
    }
 }
 
 template void transpose_ref<float>( uint32_t *size, uint32_t *perm, int dim, 
-      const float* __restrict__ A, float alpha, int *outerSizeA, int *offsetA, 
-      float* __restrict__ B, float beta, int *outerSizeB, int *offsetB, const bool conjA);
+      const float* __restrict__ A, float alpha, int *outerSizeA, int *offsetA, int innerStrideA,
+      float* __restrict__ B, float beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
 template void transpose_ref<FloatComplex>( uint32_t *size, uint32_t *perm, int dim, 
-      const FloatComplex* __restrict__ A, FloatComplex alpha, int *outerSizeA, int *offsetA, 
-      FloatComplex* __restrict__ B, FloatComplex beta, int *outerSizeB, int *offsetB, const bool conjA);
+      const FloatComplex* __restrict__ A, FloatComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA,
+      FloatComplex* __restrict__ B, FloatComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
 template void transpose_ref<double>( uint32_t *size, uint32_t *perm, int dim, 
-      const double* __restrict__ A, double alpha, int *outerSizeA, int *offsetA, 
-      double* __restrict__ B, double beta, int *outerSizeB, int *offsetB, const bool conjA);
+      const double* __restrict__ A, double alpha, int *outerSizeA, int *offsetA, int innerStrideA,
+      double* __restrict__ B, double beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
 template void transpose_ref<DoubleComplex>( uint32_t *size, uint32_t *perm, int dim, 
-      const DoubleComplex* __restrict__ A, DoubleComplex alpha, int *outerSizeA, int *offsetA, 
-      DoubleComplex* __restrict__ B, DoubleComplex beta, int *outerSizeB, int *offsetB, const bool conjA);
+      const DoubleComplex* __restrict__ A, DoubleComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA,
+      DoubleComplex* __restrict__ B, DoubleComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
 
diff --git a/benchmark/reference.h b/benchmark/reference.h
index 5284a0d..6209f84 100644
--- a/benchmark/reference.h
+++ b/benchmark/reference.h
@@ -2,5 +2,5 @@
 
 template<typename floatType>
 void transpose_ref( uint32_t *size, uint32_t *perm, int dim, 
-      const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, 
-      floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, const bool conjA);
+      const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA, 
+      floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA);
diff --git a/include/hptt.h b/include/hptt.h
index f25ff11..31e56d8 100644
--- a/include/hptt.h
+++ b/include/hptt.h
@@ -172,6 +172,9 @@ namespace hptt {
  *                       * This parameter may be NULL, indicating that the offset is zero.
  *                       * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on intermediate sub-tensors.
+ * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A.
+ *                       * This parameter may be NULL, indicating that the innerStrideA is equal to 1.
+ *                       * If innerStrideA is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideA.
  * \param[in] beta scaling factor for B
  * \param[inout] B Pointer to the raw-data of the output tensor B
  * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B.
@@ -182,6 +185,9 @@ namespace hptt {
  *                       * This parameter may be NULL, indicating that the offset is zero.
  *                       * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on intermediate sub-tensors.
+ * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B.
+ *                       * This parameter may be NULL, indicating that the innerStrideB is equal to 1.
+ *                       * If innerStrideB is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideB.
  * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details.
  *                            ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE)
  *                            then the output data will be used during the
@@ -194,8 +200,8 @@ namespace hptt {
  *            HPTT from within a parallel region (i.e., via execute_expert()).
  * \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major).
  */
-/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets, 
- * and --ints-- (sizeA, outerSizeA, and outerSizeB). */
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets
+ * or innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                  const float beta, float *B, const int *outerSizeB, 
@@ -220,8 +226,8 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                  const SelectionMethod selectionMethod,
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 
-/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets, 
- * and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets
+ * or innerStride, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
                  const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, 
                  const float beta, float *B, const std::vector<int> &outerSizeB, 
@@ -247,7 +253,7 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<
                  const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
- * no Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+ * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                  const float beta, float *B, const int *outerSizeB, 
@@ -273,7 +279,7 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class,
- * --int-- Offsets, and --ints-- (sizeA, outerSizeA, and outerSizeB). */
+ * --int-- Offsets, no innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
                  const float beta, float *B, const int *outerSizeB, const int *offsetB,
@@ -281,7 +287,7 @@ std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 
 std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
-                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, 
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
                  const double beta, double *B, const int *outerSizeB, const int *offsetB,
                  const SelectionMethod selectionMethod,
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
@@ -299,7 +305,7 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class,
- * --vector int-- Offsets, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
+ * --vector int-- Offsets, no innerStrides, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
                  const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA,
                  const float beta, float *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB,
@@ -325,7 +331,7 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<
                  const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
- * --int-- Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+ * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, 
                  const float beta, float *B, const int *outerSizeB, const int *offsetB, 
@@ -349,6 +355,84 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                  const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, 
                  const int maxAutotuningCandidates,
                  const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class,
+ * --int-- Offsets, --int-- innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                 const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                 const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class,
+ * --vector int-- Offsets, --int-- innerStrides, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
+                 const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA, const int innerStrideA,
+                 const float beta, float *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &perm, const int dim,
+                 const double alpha, const double *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA, const int innerStrideA,
+                 const double beta, double *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                 const FloatComplex alpha, const FloatComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA, const int innerStrideA,
+                 const FloatComplex beta, FloatComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                 const DoubleComplex alpha, const DoubleComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> *offsetA, const int innerStrideA,
+                 const DoubleComplex beta, DoubleComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> *offsetB, const int innerStrideB,
+                 const SelectionMethod selectionMethod,
+                 const int numThreads, const std::vector<int> &threadIds = {}, const bool useRowMajor = false);
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
+ * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                 const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                 const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int maxAutotuningCandidates,
+                 const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false);
 }
 
 extern "C"
@@ -375,6 +459,9 @@ extern "C"
  *                       * This parameter may be NULL, indicating that the offset is zero.
  *                       * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on intermediate sub-tensors.
+ * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A.
+ *                       * This parameter may be NULL, indicating that the innerStrideA is equal to 1.
+ *                       * If innerStrideA is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideA.
  * \param[in] beta scaling factor for B
  * \param[inout] B Pointer to the raw-data of the output tensor B
  * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B.
@@ -385,6 +472,9 @@ extern "C"
  *                       * This parameter may be NULL, indicating that the offset is zero.
  *                       * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold.
  *                       * This option enables HPTT to operate on intermediate sub-tensors.
+ * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B.
+ *                       * This parameter may be NULL, indicating that the innerStrideB is equal to 1.
+ *                       * If innerStrideB is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideB.
  * \param[in] numThreads number of threads that participate in this tensor transposition.
  * \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major).
  */
@@ -428,4 +518,25 @@ void zOffsetTensorTranspose( const int *perm, const int dim,
                  const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
                  const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, const int *offsetB,
                  const int numThreads, const int useRowMajor = 0);
+
+/* With Offsets and InnerStrides */
+void sInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const float beta,        float *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int numThreads, const int useRowMajor = 0);
+
+void dInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const double beta,        double *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int numThreads, const int useRowMajor = 0);
+
+void cInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int numThreads, const int useRowMajor = 0);
+
+void zInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB, 
+                 const int numThreads, const int useRowMajor = 0);
 }
diff --git a/include/transpose.h b/include/transpose.h
index 0f480b8..8b42147 100644
--- a/include/transpose.h
+++ b/include/transpose.h
@@ -57,6 +57,7 @@ class Transpose
        *                       * This parameter may be NULL, indicating that the offset is zero.
        *                       * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold.
        *                       * This option enables HPTT to operate on intermediate sub-tensors.
+       * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A.
        * \param[in] beta scaling factor for B
        * \param[inout] B Pointer to the raw-data of the output tensor B
        * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B.
@@ -67,6 +68,7 @@ class Transpose
        *                       * This parameter may be NULL, indicating that the offset is zero.
        *                       * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold.
        *                       * This option enables HPTT to operate on intermediate sub-tensors.
+       * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B.
        * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details.
        *                            ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE)
        *                            then the output data will be used during the
@@ -87,6 +89,8 @@ class Transpose
                  const int *outerSizeB, 
                  const int *offsetA, 
                  const int *offsetB, 
+                 const size_t innerStrideA, 
+                 const size_t innerStrideB, 
                  const int dim,
                  const floatType *A,
                  const floatType alpha,
@@ -243,7 +247,8 @@ class Transpose
                                         const std::vector<int> &loopsAllowed) const;
       float getLoadBalance( const std::vector<int> &parallelismStrategy ) const;
       float estimateExecutionTime( const std::shared_ptr<Plan> plan); //execute just a few iterations and extrapolate the result
-      void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const int dim) const;
+      void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, 
+                           const int* offsetB, const size_t innerStrideA, const size_t innerStrideB, const int dim) const;
       void getBestParallelismStrategy ( std::vector<int> &bestParallelismStrategy ) const;
       void getBestLoopOrder( std::vector<int> &loopOrder ) const; //innermost loop idx is stored at dim_-1
       void getLoopOrders(std::vector<std::vector<int> > &loopOrders) const;
@@ -268,6 +273,8 @@ class Transpose
       std::vector<size_t> outerSizeB_;  //!< outer sizes of B
       std::vector<size_t> offsetA_; //!< offsets of A
       std::vector<size_t> offsetB_;  //!< offsets of B
+      size_t innerStrideA_; //!< innerStride of A
+      size_t innerStrideB_;  //!< innerStride of B
       std::vector<size_t> lda_;  //!< strides for all dimensions of A (first dimension has a stride of 1)
       std::vector<size_t> ldb_;  //!< strides for all dimensions of B (first dimension has a stride of 1)
       std::vector<int> threadIds_; //!< OpenMP threadIds of the threads involed in the transposition
diff --git a/src/hptt.cpp b/src/hptt.cpp
index 2fc786f..4a31be0 100644
--- a/src/hptt.cpp
+++ b/src/hptt.cpp
@@ -20,14 +20,14 @@
 namespace hptt {
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
- * no Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+ * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                   const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                   const float beta, float *B, const int *outerSizeB, 
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
    return plan;
 }
 
@@ -37,7 +37,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const in
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
    return plan;
 }
 
@@ -47,7 +47,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, co
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
    return plan;
 }
 
@@ -57,19 +57,19 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
    return plan;
 }
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
- * no Offsets, and vector ints (sizeA, outerSizeA, and outerSizeB). */
+ * no Offsets or innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
                   const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, 
                   const float beta, float *B, const std::vector<int> &outerSizeB, 
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -79,7 +79,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &p
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -89,7 +89,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<i
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -99,19 +99,19 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
- * no Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+ * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                   const float alpha, const float *A, const int *sizeA, const int *outerSizeA, 
                   const float beta, float *B, const int *outerSizeB, 
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -123,7 +123,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const in
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -135,7 +135,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, co
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
    plan->createPlan();
    return plan;
 }
@@ -146,21 +146,21 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
 }
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
- * --int-- Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+ * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                   const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
                   const float beta, float *B, const int *outerSizeB, const int *offsetB,
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
    return plan;
 }
 
@@ -170,7 +170,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const in
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
    return plan;
 }
 
@@ -180,7 +180,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, co
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
    return plan;
 }
 
@@ -190,19 +190,19 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                   const SelectionMethod selectionMethod,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
    return plan;
 }
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
- * --vector int-- Offsets, and vector ints (sizeA, outerSizeA, and outerSizeB). */
+ * --vector int-- Offsets, no innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
                   const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA,
                   const float beta, float *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB,
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -212,7 +212,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &p
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -222,7 +222,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<i
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
@@ -232,19 +232,19 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<
                   const SelectionMethod selectionMethod,
                   const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
    return plan;
 }
 
 /* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
- * --int-- Offsets, and ints (sizeA, outerSizeA, and outerSizeB). */
+ * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
 std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
                   const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA,
                   const float beta, float *B, const int *outerSizeB, const int *offsetB,
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -256,7 +256,7 @@ std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const in
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -268,7 +268,7 @@ std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, co
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
    plan->createPlan();
    return plan;
 }
@@ -279,7 +279,141 @@ std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, c
                   const int maxAutotuningCandidates,
                   const int numThreads, const int *threadIds, const bool useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
+   plan->createPlan();
+   return plan;
+}
+
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
+ * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                  const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                  const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                  const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor ));
+   return plan;
+}
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, 
+ * --vector int-- Offsets, --int-- innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const std::vector<int> &perm, const int dim,
+                  const float alpha, const float *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA, const int innerStrideA,
+                  const float beta, float *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const std::vector<int> &perm, const int dim,
+                  const double alpha, const double *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA, const int innerStrideA,
+                  const double beta, double *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                  const FloatComplex alpha, const FloatComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA, const int innerStrideA,
+                  const FloatComplex beta, FloatComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const std::vector<int> &perm, const int dim,
+                  const DoubleComplex alpha, const DoubleComplex *A, const std::vector<int> &sizeA, const std::vector<int> &outerSizeA, const std::vector<int> &offsetA, const int innerStrideA,
+                  const DoubleComplex beta, DoubleComplex *B, const std::vector<int> &outerSizeB, const std::vector<int> &offsetB, const int innerStrideB,
+                  const SelectionMethod selectionMethod,
+                  const int numThreads, const std::vector<int> &threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor ));
+   return plan;
+}
+
+/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, 
+ * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */
+std::shared_ptr<hptt::Transpose<float> > create_plan( const int *perm, const int dim,
+                  const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
+   plan->createPlan();
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<double> > create_plan( const int *perm, const int dim,
+                  const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
+   plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
+   plan->createPlan();
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<FloatComplex> > create_plan( const int *perm, const int dim,
+                  const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor));
+   plan->createPlan();
+   return plan;
+}
+
+std::shared_ptr<hptt::Transpose<DoubleComplex> > create_plan( const int *perm, const int dim,
+                  const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                  const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                  const int maxAutotuningCandidates,
+                  const int numThreads, const int *threadIds, const bool useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor ));
    plan->setMaxAutotuningCandidates(maxAutotuningCandidates);
    plan->createPlan();
    return plan;
@@ -293,7 +427,7 @@ void sTensorTranspose( const int *perm, const int dim,
                  const float beta,        float *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->execute();
 }
 
@@ -302,7 +436,7 @@ void dTensorTranspose( const int *perm, const int dim,
                  const double beta,        double *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->execute();
 }
 
@@ -311,7 +445,7 @@ void cTensorTranspose( const int *perm, const int dim,
                  const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, 
+   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, 
                          (const hptt::FloatComplex*) A, hptt::FloatComplex(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, hptt::FloatComplex(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
@@ -322,7 +456,7 @@ void zTensorTranspose( const int *perm, const int dim,
                  const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, dim, 
+   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, 
                          (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
@@ -334,7 +468,8 @@ void sOffsetTensorTranspose( const int *perm, const int dim,
                  const float beta,        float *B,                   const int *outerSizeB, const int *offsetB,
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, 
+                           dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->execute();
 }
 
@@ -343,7 +478,8 @@ void dOffsetTensorTranspose( const int *perm, const int dim,
                  const double beta,        double *B,                   const int *outerSizeB, const int *offsetB,
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, 
+                           dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->execute();
 }
 
@@ -352,7 +488,7 @@ void cOffsetTensorTranspose( const int *perm, const int dim,
                  const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, const int *offsetB,
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, 
+   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, 
                          (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
@@ -363,25 +499,52 @@ void zOffsetTensorTranspose( const int *perm, const int dim,
                  const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, const int *offsetB,
                  const int numThreads, const int useRowMajor)
 {
-   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, dim, 
+   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, 
                          (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
-}
-
-
-
-
-
-
-
-
-
-
-
-
 
+/* With Offset and innerStride */
+void sInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA,
+                 const float beta,        float *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<float> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim,
+               A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->execute();
+}
 
+void dInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const double beta,        double *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<double> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, 
+               A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->execute();
+}
 
+void cInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const float _Complex beta,                    float _Complex *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, 
+                         (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->setConjA(conjA);
+   plan->execute();
+}
 
+void zInnerStrideTensorTranspose( const int *perm, const int dim,
+                 const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, 
+                 const double _Complex beta,                    double _Complex *B,                   const int *outerSizeB, const int *offsetB, const int innerStrideB,
+                 const int numThreads, const int useRowMajor)
+{
+   auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, 
+                         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+   plan->setConjA(conjA);
+   plan->execute();
+}
+}
diff --git a/src/transpose.cpp b/src/transpose.cpp
index 58e5672..a59de89 100644
--- a/src/transpose.cpp
+++ b/src/transpose.cpp
@@ -34,24 +34,30 @@ namespace hptt {
 template <typename floatType, int betaIsZero, bool conjA>
 struct micro_kernel
 {
-    static void execute(const floatType* __restrict__ A, const size_t lda, floatType* __restrict__ B, const size_t ldb, const floatType alpha, const floatType beta)
+    static void execute(const floatType* __restrict__ A, const size_t lda, const size_t innerStrideA, floatType* __restrict__ B, const size_t ldb, const size_t innerStrideB, const floatType alpha, const floatType beta)
     {
        constexpr int n = (REGISTER_BITS/8) / sizeof(floatType);
 
        if( betaIsZero )
           for(int j=0; j < n; ++j)
-             for(int i=0; i < n; ++i)
+             for(int i=0; i < n; ++i){
+#ifdef DEBUG
+               //printf("B[+%zu] = %e -> A[+%zu] = %e\n", (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)], (j * innerStrideA) + (lda * i), A[(j * innerStrideA) + (lda * i)]);
+#endif
                 if( conjA )
-                   B[i + j * ldb] = alpha * conj(A[i * lda + j]);
+                   B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(j * innerStrideA) + (lda * i)]);
                 else
-                   B[i + j * ldb] = alpha * A[i * lda + j];
+                   B[(i * innerStrideB) + (j * ldb)] = alpha * A[(j * innerStrideA) + (lda * i)];}
        else
           for(int j=0; j < n; ++j)
-             for(int i=0; i < n; ++i)
+             for(int i=0; i < n; ++i) {
+#ifdef DEBUG
+               //printf("B[+%zu] = %e -> A[+%zu] = %e\n", (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)], (j * innerStrideA) + (lda * i), A[(j * innerStrideA) + (lda * i)]);
+#endif
                 if( conjA )
-                   B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb];
+                   B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(j * innerStrideA) + (lda * i)]) + beta * B[(i * innerStrideB) + (j * ldb)];
                 else
-                   B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb];
+                   B[(i * innerStrideB) + (j * ldb)] = alpha * A[(j * innerStrideA) + (lda * i)] + beta * B[(i * innerStrideB) + (j * ldb)];}
     }
 };
 
@@ -77,15 +83,28 @@ static INLINE void prefetch(const floatType* A, const int lda)
 template <int betaIsZero, bool conjA>
 struct micro_kernel<double, betaIsZero, conjA>
 {
-    static void execute(const double* __restrict__ A, const size_t lda, double* __restrict__ B, const size_t ldb, const double alpha ,const double beta)
+    static void execute(const double* __restrict__ A, const size_t lda, const size_t innerStrideA, double* __restrict__ B, const size_t ldb, const size_t innerStrideB, const double alpha ,const double beta)
     {
+#ifdef DEBUG
+       printf("In AVX micro_kernel<double, %d, %d>::execute\n", betaIsZero, conjA);
+#endif
        __m256d reg_alpha = _mm256_set1_pd(alpha); // do not alter the content of B
        __m256d reg_beta = _mm256_set1_pd(beta); // do not alter the content of B
        //Load A
-       __m256d rowA0 = _mm256_loadu_pd((A +0*lda));
-       __m256d rowA1 = _mm256_loadu_pd((A +1*lda));
-       __m256d rowA2 = _mm256_loadu_pd((A +2*lda));
-       __m256d rowA3 = _mm256_loadu_pd((A +3*lda));
+       __m256d rowA0, rowA1, rowA2, rowA3;
+       __m256i indicesA;
+       if (innerStrideA != 1) {
+         indicesA = _mm256_set_epi32(7 * innerStrideA, 6 * innerStrideA, 5 * innerStrideA, 4 * innerStrideA, 3 * innerStrideA, 2 * innerStrideA, 1 * innerStrideA, 0 * innerStrideA);
+         rowA0 = _mm256_i32gather_pd((A +0*lda), indicesA, sizeof(double));
+         rowA1 = _mm256_i32gather_pd((A +1*lda), indicesA, sizeof(double));
+         rowA2 = _mm256_i32gather_pd((A +2*lda), indicesA, sizeof(double));
+         rowA3 = _mm256_i32gather_pd((A +3*lda), indicesA, sizeof(double));
+       } else {
+         rowA0 = _mm256_loadu_pd((A +0*lda));
+         rowA1 = _mm256_loadu_pd((A +1*lda));
+         rowA2 = _mm256_loadu_pd((A +2*lda));
+         rowA3 = _mm256_loadu_pd((A +3*lda));
+       }
 
        //4x4 transpose micro kernel
        __m256d r4, r34, r3, r33;
@@ -107,26 +126,51 @@ struct micro_kernel<double, betaIsZero, conjA>
        //Load B
        if( !betaIsZero )
        {
-          __m256d rowB0 = _mm256_loadu_pd((B +0*ldb));
-          __m256d rowB1 = _mm256_loadu_pd((B +1*ldb));
-          __m256d rowB2 = _mm256_loadu_pd((B +2*ldb));
-          __m256d rowB3 = _mm256_loadu_pd((B +3*ldb));
+          __m256d rowB0, rowB1, rowB2, rowB3;
+          __m256i indicesB;
+         if (innerStrideB != 1) {
+           indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB);
+           rowB0 = _mm256_i32gather_pd((B +0*ldb), indicesB, sizeof(double));
+           rowB1 = _mm256_i32gather_pd((B +1*ldb), indicesB, sizeof(double));
+           rowB2 = _mm256_i32gather_pd((B +2*ldb), indicesB, sizeof(double));
+           rowB3 = _mm256_i32gather_pd((B +3*ldb), indicesB, sizeof(double));
+         } else {
+           rowB0 = _mm256_loadu_pd((B +0*ldb));
+           rowB1 = _mm256_loadu_pd((B +1*ldb));
+           rowB2 = _mm256_loadu_pd((B +2*ldb));
+           rowB3 = _mm256_loadu_pd((B +3*ldb));
+         }
 
           rowB0 = _mm256_add_pd( _mm256_mul_pd(rowB0, reg_beta), rowA0);
           rowB1 = _mm256_add_pd( _mm256_mul_pd(rowB1, reg_beta), rowA1);
           rowB2 = _mm256_add_pd( _mm256_mul_pd(rowB2, reg_beta), rowA2);
           rowB3 = _mm256_add_pd( _mm256_mul_pd(rowB3, reg_beta), rowA3);
           //Store B
-          _mm256_storeu_pd((B + 0 * ldb), rowB0);
-          _mm256_storeu_pd((B + 1 * ldb), rowB1);
-          _mm256_storeu_pd((B + 2 * ldb), rowB2);
-          _mm256_storeu_pd((B + 3 * ldb), rowB3);
+          if (innerStrideB != 1) {
+            _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowB0, sizeof(double));
+            _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowB1, sizeof(double));
+            _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowB2, sizeof(double));
+            _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowB3, sizeof(double));
+          } else {
+            _mm256_storeu_pd((B + 0 * ldb), rowB0);
+            _mm256_storeu_pd((B + 1 * ldb), rowB1);
+            _mm256_storeu_pd((B + 2 * ldb), rowB2);
+            _mm256_storeu_pd((B + 3 * ldb), rowB3);
+          }
        } else {
           //Store B
-          _mm256_storeu_pd((B + 0 * ldb), rowA0);
-          _mm256_storeu_pd((B + 1 * ldb), rowA1);
-          _mm256_storeu_pd((B + 2 * ldb), rowA2);
-          _mm256_storeu_pd((B + 3 * ldb), rowA3);
+          if (innerStrideB != 1) {
+            __m256i indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB);
+            _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowA0, sizeof(double));
+            _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowA1, sizeof(double));
+            _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowA2, sizeof(double));
+            _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowA3, sizeof(double));
+          } else {
+            _mm256_storeu_pd((B + 0 * ldb), rowA0);
+            _mm256_storeu_pd((B + 1 * ldb), rowA1);
+            _mm256_storeu_pd((B + 2 * ldb), rowA2);
+            _mm256_storeu_pd((B + 3 * ldb), rowA3);
+          }
        }
     }
 };
@@ -134,19 +178,36 @@ struct micro_kernel<double, betaIsZero, conjA>
 template <int betaIsZero, bool conjA>
 struct micro_kernel<float, betaIsZero, conjA>
 {
-    static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta)
+    static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideB, const float alpha ,const float beta)
     {
+#ifdef DEBUG
+       printf("In AVX micro_kernel<float, %d, %d>::execute\n", betaIsZero, conjA);
+#endif
        __m256 reg_alpha = _mm256_set1_ps(alpha); // do not alter the content of B
        __m256 reg_beta = _mm256_set1_ps(beta); // do not alter the content of B
        //Load A
-       __m256 rowA0 = _mm256_loadu_ps((A +0*lda));
-       __m256 rowA1 = _mm256_loadu_ps((A +1*lda));
-       __m256 rowA2 = _mm256_loadu_ps((A +2*lda));
-       __m256 rowA3 = _mm256_loadu_ps((A +3*lda));
-       __m256 rowA4 = _mm256_loadu_ps((A +4*lda));
-       __m256 rowA5 = _mm256_loadu_ps((A +5*lda));
-       __m256 rowA6 = _mm256_loadu_ps((A +6*lda));
-       __m256 rowA7 = _mm256_loadu_ps((A +7*lda));
+       __m256 rowA0, rowA1, rowA2, rowA3, rowA4, rowA5, rowA6, rowA7;
+       __m256i indicesA;
+       if (innerStrideA == 1) {
+         rowA0 = _mm256_loadu_ps((A +0*lda));
+         rowA1 = _mm256_loadu_ps((A +1*lda));
+         rowA2 = _mm256_loadu_ps((A +2*lda));
+         rowA3 = _mm256_loadu_ps((A +3*lda));
+         rowA4 = _mm256_loadu_ps((A +4*lda));
+         rowA5 = _mm256_loadu_ps((A +5*lda));
+         rowA6 = _mm256_loadu_ps((A +6*lda));
+         rowA7 = _mm256_loadu_ps((A +7*lda));
+       } else {
+         indicesA = _mm256_set_epi32(7 * innerStrideA, 6 * innerStrideA, 5 * innerStrideA, 4 * innerStrideA, 3 * innerStrideA, 2 * innerStrideA, 1 * innerStrideA, 0 * innerStrideA);
+         rowA0 = _mm256_i32gather_ps((A +0*lda), indicesA, sizeof(float));
+         rowA1 = _mm256_i32gather_ps((A +1*lda), indicesA, sizeof(float));
+         rowA2 = _mm256_i32gather_ps((A +2*lda), indicesA, sizeof(float));
+         rowA3 = _mm256_i32gather_ps((A +3*lda), indicesA, sizeof(float));
+         rowA4 = _mm256_i32gather_ps((A +4*lda), indicesA, sizeof(float));
+         rowA5 = _mm256_i32gather_ps((A +5*lda), indicesA, sizeof(float));
+         rowA6 = _mm256_i32gather_ps((A +6*lda), indicesA, sizeof(float));
+         rowA7 = _mm256_i32gather_ps((A +7*lda), indicesA, sizeof(float));
+       }
 
        //8x8 transpose micro kernel
        __m256 r121, r139, r120, r138, r71, r89, r70, r88, r11, r1, r55, r29, r10, r0, r54, r28;
@@ -188,14 +249,28 @@ struct micro_kernel<float, betaIsZero, conjA>
        //Load B
        if( !betaIsZero )
        {
-          __m256 rowB0 = _mm256_loadu_ps((B +0*ldb));
-          __m256 rowB1 = _mm256_loadu_ps((B +1*ldb));
-          __m256 rowB2 = _mm256_loadu_ps((B +2*ldb));
-          __m256 rowB3 = _mm256_loadu_ps((B +3*ldb));
-          __m256 rowB4 = _mm256_loadu_ps((B +4*ldb));
-          __m256 rowB5 = _mm256_loadu_ps((B +5*ldb));
-          __m256 rowB6 = _mm256_loadu_ps((B +6*ldb));
-          __m256 rowB7 = _mm256_loadu_ps((B +7*ldb));
+          __m256 rowB0, rowB1, rowB2, rowB3, rowB4, rowB5, rowB6, rowB7;
+          __m256i indicesB;
+          if (innerStrideB != 1) {
+            indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB);
+            rowB0 = _mm256_i32gather_ps((B +0*ldb), indicesB, sizeof(float));
+            rowB1 = _mm256_i32gather_ps((B +1*ldb), indicesB, sizeof(float));
+            rowB2 = _mm256_i32gather_ps((B +2*ldb), indicesB, sizeof(float));
+            rowB3 = _mm256_i32gather_ps((B +3*ldb), indicesB, sizeof(float));
+            rowB4 = _mm256_i32gather_ps((B +4*ldb), indicesB, sizeof(float));
+            rowB5 = _mm256_i32gather_ps((B +5*ldb), indicesB, sizeof(float));
+            rowB6 = _mm256_i32gather_ps((B +6*ldb), indicesB, sizeof(float));
+            rowB7 = _mm256_i32gather_ps((B +7*ldb), indicesB, sizeof(float));
+          } else {
+            rowB0 = _mm256_loadu_ps((B +0*ldb));
+            rowB1 = _mm256_loadu_ps((B +1*ldb));
+            rowB2 = _mm256_loadu_ps((B +2*ldb));
+            rowB3 = _mm256_loadu_ps((B +3*ldb));
+            rowB4 = _mm256_loadu_ps((B +4*ldb));
+            rowB5 = _mm256_loadu_ps((B +5*ldb));
+            rowB6 = _mm256_loadu_ps((B +6*ldb));
+            rowB7 = _mm256_loadu_ps((B +7*ldb));
+          }
 
           rowB0 = _mm256_add_ps( _mm256_mul_ps(rowB0, reg_beta), rowA0);
           rowB1 = _mm256_add_ps( _mm256_mul_ps(rowB1, reg_beta), rowA1);
@@ -206,23 +281,46 @@ struct micro_kernel<float, betaIsZero, conjA>
           rowB6 = _mm256_add_ps( _mm256_mul_ps(rowB6, reg_beta), rowA6);
           rowB7 = _mm256_add_ps( _mm256_mul_ps(rowB7, reg_beta), rowA7);
           //Store B
-          _mm256_storeu_ps((B + 0 * ldb), rowB0);
-          _mm256_storeu_ps((B + 1 * ldb), rowB1);
-          _mm256_storeu_ps((B + 2 * ldb), rowB2);
-          _mm256_storeu_ps((B + 3 * ldb), rowB3);
-          _mm256_storeu_ps((B + 4 * ldb), rowB4);
-          _mm256_storeu_ps((B + 5 * ldb), rowB5);
-          _mm256_storeu_ps((B + 6 * ldb), rowB6);
-          _mm256_storeu_ps((B + 7 * ldb), rowB7);
+          if (innerStrideB != 1) {
+            _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowB0, sizeof(float));
+            _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowB1, sizeof(float));
+            _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowB2, sizeof(float));
+            _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowB3, sizeof(float));
+            _mm256_i32scatter_epi32((B +4*ldb), indicesB, rowB4, sizeof(float));
+            _mm256_i32scatter_epi32((B +5*ldb), indicesB, rowB5, sizeof(float));
+            _mm256_i32scatter_epi32((B +6*ldb), indicesB, rowB6, sizeof(float));
+            _mm256_i32scatter_epi32((B +7*ldb), indicesB, rowB7, sizeof(float));
+          } else {
+            _mm256_storeu_ps((B + 0 * ldb), rowB0);
+            _mm256_storeu_ps((B + 1 * ldb), rowB1);
+            _mm256_storeu_ps((B + 2 * ldb), rowB2);
+            _mm256_storeu_ps((B + 3 * ldb), rowB3);
+            _mm256_storeu_ps((B + 4 * ldb), rowB4);
+            _mm256_storeu_ps((B + 5 * ldb), rowB5);
+            _mm256_storeu_ps((B + 6 * ldb), rowB6);
+            _mm256_storeu_ps((B + 7 * ldb), rowB7);
+          }
        } else {
-          _mm256_storeu_ps((B + 0 * ldb), rowA0);
-          _mm256_storeu_ps((B + 1 * ldb), rowA1);
-          _mm256_storeu_ps((B + 2 * ldb), rowA2);
-          _mm256_storeu_ps((B + 3 * ldb), rowA3);
-          _mm256_storeu_ps((B + 4 * ldb), rowA4);
-          _mm256_storeu_ps((B + 5 * ldb), rowA5);
-          _mm256_storeu_ps((B + 6 * ldb), rowA6);
-          _mm256_storeu_ps((B + 7 * ldb), rowA7);
+          if (innerStrideB != 1) {
+            __m256i indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB);
+            _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowA0, sizeof(float));
+            _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowA1, sizeof(float));
+            _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowA2, sizeof(float));
+            _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowA3, sizeof(float));
+            _mm256_i32scatter_epi32((B +4*ldb), indicesB, rowA4, sizeof(float));
+            _mm256_i32scatter_epi32((B +5*ldb), indicesB, rowA5, sizeof(float));
+            _mm256_i32scatter_epi32((B +6*ldb), indicesB, rowA6, sizeof(float));
+            _mm256_i32scatter_epi32((B +7*ldb), indicesB, rowA7, sizeof(float));
+          } else {
+            _mm256_storeu_ps((B + 0 * ldb), rowA0);
+            _mm256_storeu_ps((B + 1 * ldb), rowA1);
+            _mm256_storeu_ps((B + 2 * ldb), rowA2);
+            _mm256_storeu_ps((B + 3 * ldb), rowA3);
+            _mm256_storeu_ps((B + 4 * ldb), rowA4);
+            _mm256_storeu_ps((B + 5 * ldb), rowA5);
+            _mm256_storeu_ps((B + 6 * ldb), rowA6);
+            _mm256_storeu_ps((B + 7 * ldb), rowA7);
+          }
        }
     }
 };
@@ -246,16 +344,62 @@ static INLINE void prefetch(const floatType* A, const int lda) { }
 template <int betaIsZero, bool conjA>
 struct micro_kernel<float, betaIsZero>
 {
-    static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta)
+    static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideA, const float alpha ,const float beta)
     {
+#ifdef DEBUG
+       printf("In ARM micro_kernel<float, %d, %d>::execute\n", betaIsZero, conjA);
+#endif
        float32x4_t reg_alpha = vdupq_n_f32(alpha);
        float32x4_t reg_beta = vdupq_n_f32(beta);
 
        //Load A
-       float32x4_t rowA0 = vld1q_f32((A +0*lda));
-       float32x4_t rowA1 = vld1q_f32((A +1*lda));
-       float32x4_t rowA2 = vld1q_f32((A +2*lda));
-       float32x4_t rowA3 = vld1q_f32((A +3*lda));
+       float32x4_t rowA0, rowA1, rowA2, rowA3;
+       if (innerStrideA == 1) {
+         float32x4_t rowA0 = vld1q_f32((A +0*lda));
+         float32x4_t rowA1 = vld1q_f32((A +1*lda));
+         float32x4_t rowA2 = vld1q_f32((A +2*lda));
+         float32x4_t rowA3 = vld1q_f32((A +3*lda));
+       } else if (innerStrideA == 2) {
+         float32x4_t rowA0 = vld2q_f32((A +0*lda)).val[0];
+         float32x4_t rowA1 = vld2q_f32((A +1*lda)).val[0];
+         float32x4_t rowA2 = vld2q_f32((A +2*lda)).val[0];
+         float32x4_t rowA3 = vld2q_f32((A +3*lda)).val[0];
+       } else if (innerStrideA == 3) {
+         float32x4_t rowA0 = vld3q_f32((A +0*lda)).val[0];
+         float32x4_t rowA1 = vld3q_f32((A +1*lda)).val[0];
+         float32x4_t rowA2 = vld3q_f32((A +2*lda)).val[0];
+         float32x4_t rowA3 = vld3q_f32((A +3*lda)).val[0];
+       } else if (innerStrideA == 4) {
+         float32x4_t rowA0 = vld4q_f32((A +0*lda)).val[0];
+         float32x4_t rowA1 = vld4q_f32((A +1*lda)).val[0];
+         float32x4_t rowA2 = vld4q_f32((A +2*lda)).val[0];
+         float32x4_t rowA3 = vld4q_f32((A +3*lda)).val[0];
+       } else {
+         float32x4_t rowA0 = vdupq_n_f32(0);
+         float32x4_t rowA1 = vdupq_n_f32(0);
+         float32x4_t rowA2 = vdupq_n_f32(0);
+         float32x4_t rowA3 = vdupq_n_f32(0);
+
+         rowA0 = vld1q_lane_f32(A + 0 * lda + 0 * strideA, rowA0, 0);
+         rowA0 = vld1q_lane_f32(A + 0 * lda + 1 * strideA, rowA0, 1);
+         rowA0 = vld1q_lane_f32(A + 0 * lda + 2 * strideA, rowA0, 2);
+         rowA0 = vld1q_lane_f32(A + 0 * lda + 3 * strideA, rowA0, 3);
+
+         rowA1 = vld1q_lane_f32(A + 1 * lda + 0 * strideA, rowA1, 0);
+         rowA1 = vld1q_lane_f32(A + 1 * lda + 1 * strideA, rowA1, 1);
+         rowA1 = vld1q_lane_f32(A + 1 * lda + 2 * strideA, rowA1, 2);
+         rowA1 = vld1q_lane_f32(A + 1 * lda + 3 * strideA, rowA1, 3);
+
+         rowA2 = vld1q_lane_f32(A + 2 * lda + 0 * strideA, rowA2, 0);
+         rowA2 = vld1q_lane_f32(A + 2 * lda + 1 * strideA, rowA2, 1);
+         rowA2 = vld1q_lane_f32(A + 2 * lda + 2 * strideA, rowA2, 2);
+         rowA2 = vld1q_lane_f32(A + 2 * lda + 3 * strideA, rowA2, 3);
+
+         rowA3 = vld1q_lane_f32(A + 3 * lda + 0 * strideA, rowA3, 0);
+         rowA3 = vld1q_lane_f32(A + 3 * lda + 1 * strideA, rowA3, 1);
+         rowA3 = vld1q_lane_f32(A + 3 * lda + 2 * strideA, rowA3, 2);
+         rowA3 = vld1q_lane_f32(A + 3 * lda + 3 * strideA, rowA3, 3);
+       }
 
        //4x4 transpose micro kernel
        float32x4x2_t t0,t1,t2,t3;
@@ -272,27 +416,110 @@ struct micro_kernel<float, betaIsZero>
 
        //Load B
        if( !betaIsZero )
-       {
-          float32x4_t rowB0 = vld1q_f32((B +0*ldb));
-          float32x4_t rowB1 = vld1q_f32((B +1*ldb));
-          float32x4_t rowB2 = vld1q_f32((B +2*ldb));
-          float32x4_t rowB3 = vld1q_f32((B +3*ldb));
+       {       
+          float32x4_t rowB0, rowB1, rowB2, rowB3;
+          if (innerStrideB == 1) {
+            float32x4_t rowB0 = vld1q_f32((B +0*ldb));
+            float32x4_t rowB1 = vld1q_f32((B +1*ldb));
+            float32x4_t rowB2 = vld1q_f32((B +2*ldb));
+            float32x4_t rowB3 = vld1q_f32((B +3*ldb));
+          } else if (innerStrideB == 2) {
+            float32x4_t rowB0 = vld2q_f32((B +0*ldb)).val[0];
+            float32x4_t rowB1 = vld2q_f32((B +1*ldb)).val[0];
+            float32x4_t rowB2 = vld2q_f32((B +2*ldb)).val[0];
+            float32x4_t rowB3 = vld2q_f32((B +3*ldb)).val[0];
+          } else if (innerStrideB == 3) {
+            float32x4_t rowB0 = vld3q_f32((B +0*ldb)).val[0];
+            float32x4_t rowB1 = vld3q_f32((B +1*ldb)).val[0];
+            float32x4_t rowB2 = vld3q_f32((B +2*ldb)).val[0];
+            float32x4_t rowB3 = vld3q_f32((B +3*ldb)).val[0];
+          } else if (innerStrideB == 4) {
+            float32x4_t rowB0 = vld4q_f32((B +0*ldb)).val[0];
+            float32x4_t rowB1 = vld4q_f32((B +1*ldb)).val[0];
+            float32x4_t rowB2 = vld4q_f32((B +2*ldb)).val[0];
+            float32x4_t rowB3 = vld4q_f32((B +3*ldb)).val[0];
+          } else {
+            float32x4_t rowB0 = vdupq_n_f32(0);
+            float32x4_t rowB1 = vdupq_n_f32(0);
+            float32x4_t rowB2 = vdupq_n_f32(0);
+            float32x4_t rowB3 = vdupq_n_f32(0);
+
+            rowB0 = vld1q_lane_f32(B + 0 * innerStrideB, rowB0, 0);
+            rowB0 = vld1q_lane_f32(B + 1 * innerStrideB, rowB0, 1);
+            rowB0 = vld1q_lane_f32(B + 2 * innerStrideB, rowB0, 2);
+            rowB0 = vld1q_lane_f32(B + 3 * innerStrideB, rowB0, 3);
+
+            rowB1 = vld1q_lane_f32(B + 1 * ldb + 0 * innerStrideB, rowB1, 0);
+            rowB1 = vld1q_lane_f32(B + 1 * ldb + 1 * innerStrideB, rowB1, 1);
+            rowB1 = vld1q_lane_f32(B + 1 * ldb + 2 * innerStrideB, rowB1, 2);
+            rowB1 = vld1q_lane_f32(B + 1 * ldb + 3 * innerStrideB, rowB1, 3);
+
+            rowB2 = vld1q_lane_f32(B + 2 * ldb + 0 * innerStrideB, rowB2, 0);
+            rowB2 = vld1q_lane_f32(B + 2 * ldb + 1 * innerStrideB, rowB2, 1);
+            rowB2 = vld1q_lane_f32(B + 2 * ldb + 2 * innerStrideB, rowB2, 2);
+            rowB2 = vld1q_lane_f32(B + 2 * ldb + 3 * innerStrideB, rowB2, 3);
+
+            rowB3 = vld1q_lane_f32(B + 3 * ldb + 0 * innerStrideB, rowB3, 0);
+            rowB3 = vld1q_lane_f32(B + 3 * ldb + 1 * innerStrideB, rowB3, 1);
+            rowB3 = vld1q_lane_f32(B + 3 * ldb + 2 * innerStrideB, rowB3, 2);
+            rowB3 = vld1q_lane_f32(B + 3 * ldb + 3 * innerStrideB, rowB3, 3);
+          }
 
           rowB0 = vaddq_f32( vmulq_f32(rowB0, reg_beta), rowA0);
           rowB1 = vaddq_f32( vmulq_f32(rowB1, reg_beta), rowA1);
           rowB2 = vaddq_f32( vmulq_f32(rowB2, reg_beta), rowA2);
           rowB3 = vaddq_f32( vmulq_f32(rowB3, reg_beta), rowA3);
           //Store B
-          vst1q_f32((B + 0 * ldb), rowB0);
-          vst1q_f32((B + 1 * ldb), rowB1);
-          vst1q_f32((B + 2 * ldb), rowB2);
-          vst1q_f32((B + 3 * ldb), rowB3);
+          if (innerStrideB == 1) {
+            vst1q_f32((B + 0 * ldb), rowB0);
+            vst1q_f32((B + 1 * ldb), rowB1);
+            vst1q_f32((B + 2 * ldb), rowB2);
+            vst1q_f32((B + 3 * ldb), rowB3);
+          } else {
+            float tmp[4];
+            vst1q_f32(tmp, rowB0);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB] = tmp[i];
+            }
+            vst1q_f32(tmp, rowB1);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB + 1 * ldb] = tmp[i];
+            }
+            vst1q_f32(tmp, rowB2);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB + 2 * ldb] = tmp[i];
+            }
+            vst1q_f32(tmp, rowB3);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB + 3 * ldb] = tmp[i];
+            }
+          }
        } else {
           //Store B
-          vst1q_f32((B + 0 * ldb), rowA0);
-          vst1q_f32((B + 1 * ldb), rowA1);
-          vst1q_f32((B + 2 * ldb), rowA2);
-          vst1q_f32((B + 3 * ldb), rowA3);
+          if (innerStrideB == 1) {
+            vst1q_f32((B + 0 * ldb), rowA0);
+            vst1q_f32((B + 1 * ldb), rowA1);
+            vst1q_f32((B + 2 * ldb), rowA2);
+            vst1q_f32((B + 3 * ldb), rowA3);
+          } else {
+            float tmp[4];
+            vst1q_f32(tmp, rowB0);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB] = tmp[i];
+            }
+            vst1q_f32(tmp, rowB1);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB + 1 * ldb] = tmp[i];
+            }
+            vst1q_f32(tmp, rowB2);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB + 2 * ldb] = tmp[i];
+            }
+            vst1q_f32(tmp, rowB3);
+            for (int i = 0; i < 4; ++i) {
+                B[i * innerStrideB + 3 * ldb] = tmp[i];
+            }
+          }
        }
     }
 };
@@ -304,7 +531,7 @@ struct micro_kernel<float, betaIsZero>
 //template <int betaIsZero>
 //struct micro_kernel<float, betaIsZero>
 //{
-//    static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta)
+//    static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideA, const float alpha ,const float beta)
 //    {
 //       vector float reg_alpha = vec_splats(alpha);
 //
@@ -368,8 +595,8 @@ struct micro_kernel<float, betaIsZero>
 
 
 template<int betaIsZero, typename floatType, bool conjA>
-static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const size_t lda, int blockingA,  
-                                             floatType* __restrict__ B, const size_t ldb, int blockingB,
+static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const size_t lda, int blockingA, size_t innerStrideA,  
+                                             floatType* __restrict__ B, const size_t ldb, int blockingB, size_t innerStrideB,
                                              const floatType alpha ,const floatType beta)
 {
 #ifdef DEBUG
@@ -381,203 +608,354 @@ static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const si
       for(int j=0; j < blockingA; ++j)
          for(int i=0; i < blockingB; ++i){
 #ifdef DEBUG
-            printf("    A[+%zu] = %e -> B[+%zu] = %e\n", i * lda + j, A[i * lda + j], i + j * ldb, B[i + j * ldb]);
+            printf("    A[+%zu] = %e -> B[+%zu] = %e\n", (i * lda) + (j * innerStrideA), A[(i * lda) + (j * innerStrideA)], (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)]);
 #endif
             if( conjA )
-               B[i + j * ldb] = alpha * conj(A[i * lda + j]);
+               B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(i * lda) + (j * innerStrideA)]);
             else
-               B[i + j * ldb] = alpha * A[i * lda + j];}
+               B[(i * innerStrideB) + (j * ldb)] = alpha * A[(i * lda) + (j * innerStrideA)];}
    else
       for(int j=0; j < blockingA; ++j)
          for(int i=0; i < blockingB; ++i){
 #ifdef DEBUG
-            printf("    A[+%zu] = %e -> B[+%zu] = %e\n", i * lda + j, A[i * lda + j], i + (j * ldb), B[i + (j * ldb)]);
+            printf("    A[+%zu] = %e -> B[+%zu] = %e\n", (i * lda) + (j * innerStrideA), A[(i * lda) + (j * innerStrideA)], i + (j * ldb), B[(i * innerStrideB) + (j * ldb)]);
 #endif
             if( conjA )
-               B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb];
+               B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(i * lda) + (j * innerStrideA)]) + beta * B[(i * innerStrideB) + (j * ldb)];
             else
-               B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb];}
+               B[(i * innerStrideB) + (j * ldb)] = alpha * A[(i * lda) + (j * innerStrideA)] + beta * B[(i * innerStrideB) + (j * ldb)];}
 }
 
 template<int blockingA, int blockingB, int betaIsZero, typename floatType, bool useStreamingStores_, bool conjA>
-static INLINE void macro_kernel(const floatType* __restrict__ A, const floatType* __restrict__ Anext, const size_t lda, 
-                                   floatType* __restrict__ B, const floatType* __restrict__ Bnext, const size_t ldb,
+static INLINE void macro_kernel(const floatType* __restrict__ A, const floatType* __restrict__ Anext, const size_t lda, size_t innerStrideA, 
+                                   floatType* __restrict__ B, const floatType* __restrict__ Bnext, const size_t ldb, size_t innerStrideB,
                                    const floatType alpha ,const floatType beta)
 {
    constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType);
    constexpr int blocking_ = blocking_micro_ * 4;
+#ifdef DEBUG
+   printf("Macro kernel (%d): blockingA = %d with %zu, blockingB = %d with %zu\n", blocking_, blockingA, lda, blockingB, ldb);
+#endif
 
    const bool useStreamingStores = useStreamingStores_ && betaIsZero && (blockingB*sizeof(floatType))%64 == 0 && ((uint64_t)B)%32 == 0 && (ldb*sizeof(floatType))%32 == 0;
 
+#ifdef DEBUG
+   printf("Macro kernel using streaming stores? %d\n", useStreamingStores);
+#endif
+
    floatType *Btmp = B;
    size_t ldb_tmp = ldb;
    floatType buffer[blockingA * blockingB];// __attribute__((aligned(64)));
-   if( (useStreamingStores_ && useStreamingStores) ){
+   if( (useStreamingStores_ && useStreamingStores && innerStrideB == 1) ){
       Btmp = buffer;
       ldb_tmp = blockingB;
    }
 
    if( blockingA == blocking_ && blockingB == blocking_ )
    {
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (0 * ldb_tmp + 0), ldb_tmp);
-      prefetch<floatType>(Anext + (0 * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (0 * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB *0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB *0)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp);
-      prefetch<floatType>(Anext + (2*blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (3*blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 3*blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (2*blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB * 2*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB * 2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (3*blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB *3*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB *3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA *blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB *0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA *blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB *0)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2* blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB *2*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB *2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      prefetch<floatType>(Anext + (0 * lda + 2*blocking_micro_), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (0 * lda + 2*blocking_micro_), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (2*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp);
-      prefetch<floatType>(Anext + (2*blocking_micro_ * lda + 2*blocking_micro_), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (3*blocking_micro_ * lda + 2*blocking_micro_), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (2*blocking_micro_ * lda + 2*blocking_micro_), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (3*blocking_micro_ * lda + 2*blocking_micro_), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3* blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (3*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp  , alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2* blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
    }else if( blockingA == 2*blocking_micro_ && blockingB == blocking_ ) {
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (0 * ldb_tmp + 0), ldb_tmp);
-      prefetch<floatType>(Anext + (0 * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (0 * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA *0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA *0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp);
-      prefetch<floatType>(Anext + (2*blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (3*blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 3*blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (2*blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (3*blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp  , alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
    }else if( blockingA == blocking_ && blockingB == 2*blocking_micro_) {
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (0 * ldb_tmp + 0), ldb_tmp);
-      prefetch<floatType>(Anext + (0 * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (blocking_micro_ * lda + 0), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (0 * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (blocking_micro_ * lda + 0), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      prefetch<floatType>(Anext + (0 * lda + 2*blocking_micro_), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      prefetch<floatType>(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
-      if( !(useStreamingStores_ && useStreamingStores) )
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (0 * lda + 2*blocking_micro_), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+      if (innerStrideA == 1) prefetch<floatType>(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
+      if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 )
          prefetch<floatType>(Bnext + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp  , alpha , beta);
-      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp  , alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (0 * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta);
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+      micro_kernel<floatType,betaIsZero, conjA>::execute(A + (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta);
    } else {
       //invoke micro-transpose
-      if(blockingA > 0 && blockingB > 0 )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A, lda, Btmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 0 && blockingB > 0 ) {
+#ifdef DEBUG
+         printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A, lda, innerStrideA, Btmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 0 && blockingB > blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + blocking_micro_ * lda, lda, Btmp + blocking_micro_, ldb_tmp  , alpha , beta);
+      if(blockingA > 0 && blockingB > blocking_micro_ ) {
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 0 && blockingB > 2*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_, ldb_tmp  , alpha , beta);
+      if(blockingA > 0 && blockingB > 2*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 0 && blockingB > 3*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_, ldb_tmp  , alpha , beta);
+      if(blockingA > 0 && blockingB > 3*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > blocking_micro_ && blockingB > 0 )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + blocking_micro_, lda, Btmp + blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > blocking_micro_ && blockingB > 0 ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*blocking_micro_), lda, innerStrideA, Btmp + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > blocking_micro_ && blockingB > blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > blocking_micro_ && blockingB > blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > blocking_micro_ && blockingB > 2*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > blocking_micro_ && blockingB > 2*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > blocking_micro_ && blockingB > 3*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > blocking_micro_ && blockingB > 3*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 2*blocking_micro_ && blockingB > 0 )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 2*blocking_micro_, lda, Btmp + 2*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 2*blocking_micro_ && blockingB > 0 ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*2*blocking_micro_), lda, innerStrideA, Btmp + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 2*blocking_micro_ && blockingB > blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 2*blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 2*blocking_micro_ && blockingB > blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*2*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 2*blocking_micro_ && blockingB > 2*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 2*blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 2*blocking_micro_ && blockingB > 2*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*2*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 2*blocking_micro_ && blockingB > 3*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 2*blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 2*blocking_micro_ && blockingB > 3*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*2*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 3*blocking_micro_ && blockingB > 0 )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 3*blocking_micro_, lda, Btmp + 3*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 3*blocking_micro_ && blockingB > 0 ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*3*blocking_micro_), lda, innerStrideA, Btmp + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 3*blocking_micro_ && blockingB > blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 3*blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 3*blocking_micro_ && blockingB > blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*3*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 3*blocking_micro_ && blockingB > 2*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 3*blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 3*blocking_micro_ && blockingB > 2*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*3*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
 
       //invoke micro-transpose
-      if(blockingA > 3*blocking_micro_ && blockingB > 3*blocking_micro_ )
-         micro_kernel<floatType,betaIsZero, conjA>::execute(A + 3*blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp  , alpha , beta);
+      if(blockingA > 3*blocking_micro_ && blockingB > 3*blocking_micro_ ){
+#ifdef DEBUG
+      printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp);
+#endif
+         micro_kernel<floatType,betaIsZero, conjA>::execute(A + (innerStrideA*3*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);}
    }
 
    // write buffer to main-memory via non-temporal stores
-   if( (useStreamingStores_ && useStreamingStores) )
+   if( (useStreamingStores_ && useStreamingStores && innerStrideB == 1) )
       for( int i = 0; i < blockingA; i++){
          for( int j = 0; j < blockingB; j+=blocking_micro_)
-            streamingStore<floatType>(B + i * ldb + j, buffer + i * ldb_tmp + j);
+            streamingStore<floatType>(B + (i * ldb) + (j * innerStrideB), buffer + i * ldb_tmp + j);
       }
 }
 
 template<int betaIsZero, typename floatType, bool conjA>
-void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A, 
-                                  floatType* __restrict__ B, int sizeStride1B, const floatType alpha, const floatType beta, const ComputeNode* plan)
+void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A, size_t innerStrideA,
+                                  floatType* __restrict__ B, int sizeStride1B, size_t innerStrideB, const floatType alpha, const floatType beta, const ComputeNode* plan)
 {
    const int32_t end = plan->end;
    const size_t lda = plan->lda;
@@ -589,16 +967,16 @@ void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A,
 #ifdef DEBUG
       printf("----[SCALAR]Pointing to deeper level, Shifting A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start);
 #endif
-      if( lda == 1 && plan->indexA )
-         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], end - plan->start, &B[i*ldb], sizeStride1B, alpha, beta, plan->next);
-      else if( ldb == 1 && plan->indexB )
-         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], sizeStride1A, &B[i*ldb], end - plan->start, alpha, beta, plan->next);
+      if( plan->indexA )
+         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], end - plan->start, innerStrideA, &B[i*ldb], sizeStride1B, innerStrideB, alpha, beta, plan->next);
+      else if( plan->indexB )
+         transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], sizeStride1A, innerStrideA, &B[i*ldb], end - plan->start, innerStrideB, alpha, beta, plan->next);
       else
          for(; i < end; i++){
 #ifdef DEBUG
             printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start);
 #endif
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], sizeStride1A, &B[i*ldb], sizeStride1B, alpha, beta, plan->next);}
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], sizeStride1A, innerStrideA, &B[i*ldb], sizeStride1B, innerStrideB, alpha, beta, plan->next);}
    }else{
       // macro-kernel
       const size_t lda_macro = plan->next->lda;
@@ -606,25 +984,25 @@ void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A,
       int i = plan->start;
       const size_t scalarRemainder = plan->end - plan->start;
 #ifdef DEBUG
-      printf("----[SCALAR]Sending to macro-kernel, Shifting A[+%zu] lda - %zu, B[+%zu] ldb - %zu\n", (i+offDiffAB)*lda, lda, i*ldb, ldb);
+      printf("----[SCALAR]Sending to macro-kernel (Remainder: %zu), Shifting A[+%zu] lda - %zu, B[+%zu] ldb - %zu\n", scalarRemainder, (i+offDiffAB)*lda, lda, i*ldb, ldb);
 #endif
       if( scalarRemainder > 0 ){
-         if( lda == 1 && plan->indexA )
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta);
-         else if( ldb == 1 && plan->indexB )
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta);
+         if( plan->indexA )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, innerStrideA, &B[i*ldb], ldb_macro, sizeStride1B, innerStrideB, alpha, beta);
+         else if( plan->indexB )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, innerStrideA, &B[i*ldb], ldb_macro, scalarRemainder, innerStrideB, alpha, beta);
          else
             for(; i < end; i++){
 #ifdef DEBUG
                printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start);
 #endif
-               macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta);}
+               macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, innerStrideA, &B[i*ldb], ldb_macro, sizeStride1B, innerStrideB, alpha, beta);}
       }
    }
 }
 template<int blockingA, int blockingB, int betaIsZero, typename floatType, bool useStreamingStores, bool conjA>
-void transpose_int( const floatType* __restrict__ A, const floatType* __restrict__ Anext, 
-                     floatType* __restrict__ B, const floatType* __restrict__ Bnext, const floatType alpha, const floatType beta, 
+void transpose_int( const floatType* __restrict__ A, const floatType* __restrict__ Anext, size_t innerStrideA,
+                     floatType* __restrict__ B, const floatType* __restrict__ Bnext, size_t innerStrideB, const floatType alpha, const floatType beta, 
                      const ComputeNode* plan)
 {
    const int32_t end = plan->end - (plan->inc - 1);
@@ -648,11 +1026,11 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start);
 #endif
          if( i + inc < end ) {
-            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &A[(i+1+offDiffAB)*lda], &B[i*ldb], &B[(i+1)*ldb], alpha, beta, plan->next);
+            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &A[(i+1+offDiffAB)*lda], innerStrideA, &B[i*ldb], &B[(i+1)*ldb], innerStrideB, alpha, beta, plan->next);
          } else if( i == plan->start || i + inc >= end ) {
-            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &A[(i+offDiffAB)*lda], &B[i*ldb], &B[i*ldb], alpha, beta, plan->next);
+            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], &A[(i+offDiffAB)*lda], innerStrideA, &B[i*ldb], &B[i*ldb], innerStrideB, alpha, beta, plan->next);
          } else {
-            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+            transpose_int<blockingA, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next);
          }
       }
       // remainder
@@ -661,10 +1039,10 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
          printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
 #endif
-         if( lda == 1 && plan->indexA )
-            transpose_int<blocking_/2, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
-         else if( ldb == 1 && plan->indexB )
-            transpose_int<blockingA, blocking_/2, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+         if( plan->indexA )
+            transpose_int<blocking_/2, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next);
+         else if( plan->indexB )
+            transpose_int<blockingA, blocking_/2, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next);
          i+=blocking_/2;
       }
       if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){
@@ -672,10 +1050,10 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
          printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
 #endif
-         if( lda == 1 && plan->indexA )
-            transpose_int<blocking_/4, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
-         else if( ldb == 1 && plan->indexB )
-            transpose_int<blockingA, blocking_/4, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next);
+         if( plan->indexA )
+            transpose_int<blocking_/4, blockingB, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next);
+         else if( plan->indexB )
+            transpose_int<blockingA, blocking_/4, betaIsZero, floatType, useStreamingStores, conjA>( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next);
          i+=blocking_/4;
       }
       const size_t scalarRemainder = plan->end - i;
@@ -684,12 +1062,12 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
          printf("---- Passing to scalar. Blocking_ to %zu (blA = %d, blB = %d) where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, blockingA, blockingB, lda, ldb);
 #endif
-         if( lda == 1 && plan->indexA )
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], scalarRemainder, &B[i*ldb], blockingB, alpha, beta, plan->next);
-         else if ( ldb == 1 && plan->indexB )
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], blockingA, &B[i*ldb], scalarRemainder, alpha, beta, plan->next);
+         if( plan->indexA )
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], scalarRemainder, innerStrideA, &B[i*ldb], blockingB, innerStrideB, alpha, beta, plan->next);
+         else if ( plan->indexB )
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], blockingA, innerStrideA, &B[i*ldb], scalarRemainder, innerStrideB, alpha, beta, plan->next);
          else
-            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], blockingA, &B[i*ldb], blockingB, alpha, beta, plan->next);
+            transpose_int_scalar<betaIsZero, floatType, conjA>( &A[(i+offDiffAB)*lda], blockingA, innerStrideA, &B[i*ldb], blockingB, innerStrideB, alpha, beta, plan->next);
       }
    } else {
       const size_t lda_macro = plan->next->lda;
@@ -703,14 +1081,14 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
       {
 #ifdef DEBUG
          printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start);
-         printf("--------- Case A: %zu < %zu, Case B: %zu == %zu or %zu > %zu, else Case C\n", i + inc, end, i, plan->start, i + inc, end);
+         printf("--------- Case A: %d < %d, Case B: %d == %zu or %d > %d, else Case C\n", i + inc, end, i, plan->start, i + inc, end);
 #endif
          if( i + inc < end ) {
-            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], &A[(i+offDiffAB+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta);
+            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], &A[(i+offDiffAB+1)*lda], lda_macro, innerStrideA, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, innerStrideB, alpha, beta);
          } else if( i == plan->start || i + inc > end ) {
-            break; //TODO: in this case blocking may be incorrect - hopefully nothing lands here anyway.
+            break; // Things land here when the last two levels contain indexA and indexB separately. 
          } else {
-            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+            macro_kernel<blockingA, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta);
          }
       }
       // remainder
@@ -719,10 +1097,10 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
          printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
 #endif
-         if( lda == 1 && plan->indexA )
-            macro_kernel<blocking_/2, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
-         else if( ldb == 1 && plan->indexB )
-            macro_kernel<blockingA, blocking_/2, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+         if( plan->indexA )
+            macro_kernel<blocking_/2, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta);
+         else if( plan->indexB )
+            macro_kernel<blockingA, blocking_/2, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta);
          i+=blocking_/2;
       }
       if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){
@@ -730,10 +1108,10 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb);
          printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
 #endif
-         if( lda == 1 && plan->indexA )
-            macro_kernel<blocking_/4, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
-         else if( ldb == 1 && plan->indexB )
-            macro_kernel<blockingA, blocking_/4, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta);
+         if( plan->indexA )
+            macro_kernel<blocking_/4, blockingB, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta);
+         else if( plan->indexB )
+            macro_kernel<blockingA, blocking_/4, betaIsZero,floatType, useStreamingStores, conjA>(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta);
          i+=blocking_/4;
       }
       const size_t scalarRemainder = plan->end - i;
@@ -742,12 +1120,12 @@ void transpose_int( const floatType* __restrict__ A, const floatType* __restrict
          printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb);
          printf("---- Passing to scalar. Blocking_ to %zu where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, lda, ldb);
 #endif
-         if( lda == 1 && plan->indexA )
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, blockingB, alpha, beta);
-         else if( ldb == 1 && plan->indexB )
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta);
+         if( plan->indexA )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, innerStrideA, &B[i*ldb], ldb_macro, blockingB, innerStrideB, alpha, beta);
+         else if( plan->indexB )
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, blockingA, innerStrideA, &B[i*ldb], ldb_macro, scalarRemainder, innerStrideB, alpha, beta);
          else
-            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, blockingB, alpha, beta);
+            macro_kernel_scalar<betaIsZero,floatType, conjA>(&A[(i+offDiffAB)*lda], lda_macro, blockingA, innerStrideA, &B[i*ldb], ldb_macro, blockingB, innerStrideB, alpha, beta);
       }
    }
 }
@@ -813,6 +1191,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                  const int *outerSizeB, 
                  const int *offsetA, 
                  const int *offsetB, 
+                 const size_t innerStrideA,
+                 const size_t innerStrideB,
                  const int dim,
                  const floatType *A,
                  const floatType alpha,
@@ -827,6 +1207,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
          alpha_(alpha),
          beta_(beta),
          dim_(-1),
+         innerStrideA_(-1),
+         innerStrideB_(-1),
          numThreads_(numThreads), 
          masterPlan_(nullptr),
          selectionMethod_(selectionMethod),
@@ -865,7 +1247,10 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                threadIds_.push_back(i);
          }
 
-         verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, dim);
+         verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, innerStrideA, innerStrideB, dim);
+
+         innerStrideA_ = innerStrideA;
+         innerStrideB_ = innerStrideB;
 
          // initializes dim_, outerSizeA, outerSizeB, sizeA and perm 
          skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, dim);
@@ -895,6 +1280,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                                           outerSizeB_(other.outerSizeB_),
                                           offsetA_(other.offsetA_),
                                           offsetB_(other.offsetB_),
+                                          innerStrideA_(other.innerStrideA_),
+                                          innerStrideB_(other.innerStrideB_),
                                           lda_(other.lda_),
                                           ldb_(other.ldb_),
                                           threadIds_(other.threadIds_),
@@ -931,14 +1318,14 @@ void Transpose<floatType>::executeEstimate(const Plan *plan) noexcept
          auto rootNode = plan->getRootNode_const( taskId );
          if( std::abs(beta_) < getZeroThreshold<floatType>() ) {
             if( conjA_ )
-               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, true>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, true>( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode);
             else
-               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, false>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, false>( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode);
          } else {
             if( conjA_ )
-               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, true>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, true>( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode);
             else
-               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, false>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, false>( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode);
          }
       } else {
          auto rootNode = plan->getRootNode_const( taskId );
@@ -959,16 +1346,16 @@ void Transpose<floatType>::executeEstimate(const Plan *plan) noexcept
 
 
 template<int betaIsZero, typename floatType, bool useStreamingStores, bool spawnThreads, bool conjA>
-static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const int &offDiffAB_, const floatType alpha, const floatType beta, int numThreads)
+static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const int &offDiffAB_, const size_t lda, const size_t ldb, const floatType alpha, const floatType beta, int numThreads)
 {
    if( !betaIsZero )
    {
       HPTT_DUPLICATE(spawnThreads,
          for(int32_t i = myStart; i < myEnd; i++)
             if( conjA )
-               B[i] = alpha * conj(A[i + offDiffAB_]) + beta * B[i];
+               B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]) + beta * B[i * ldb];
             else
-               B[i] = alpha * A[i + offDiffAB_] + beta * B[i];
+               B[i * ldb] = alpha * A[(i + offDiffAB_) * lda] + beta * B[i * ldb];
       )
    } else {
       if( useStreamingStores)
@@ -976,24 +1363,24 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B,
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t i = myStart; i < myEnd; i++)
                if( conjA )
-                  B[i] = alpha * conj(A[i + offDiffAB_]);
+                  B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]);
                else
-                  B[i] = alpha * A[i + offDiffAB_];
+                  B[i * ldb] = alpha * A[(i + offDiffAB_) * lda];
          )
       else
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t i = myStart; i < myEnd; i++)
                if( conjA )
-                  B[i] = alpha * conj(A[i + offDiffAB_]);
+                  B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]);
                else
-                  B[i] = alpha * A[i + offDiffAB_];
+                  B[i * ldb] = alpha * A[(i + offDiffAB_) * lda];
          )
    }
 }
 
 template<int betaIsZero, typename floatType, bool useStreamingStores, bool spawnThreads, bool conjA>
-static void axpy_2D( const floatType* __restrict__ A, const int lda, 
-                        floatType* __restrict__ B, const int ldb, 
+static void axpy_2D( const floatType* __restrict__ A, const size_t (&lda)[2], 
+                        floatType* __restrict__ B, const size_t (&ldb)[2], 
                         const int n0, const int myStart, const int myEnd, const int (&offDiffAB_)[2], const int offsetB_, const floatType alpha, const floatType beta, int numThreads)
 {
    if( !betaIsZero )
@@ -1002,9 +1389,9 @@ static void axpy_2D( const floatType* __restrict__ A, const int lda,
          for(int32_t j = myStart; j < myEnd; j++)
             for(int32_t i = offsetB_; i < n0 + offsetB_; i++)
                if( conjA )
-                  B[i + j * ldb] = alpha * conj(A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda]) + beta * B[i + j * ldb];
+                  B[(i * ldb[0]) + j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]) + beta * B[(i * ldb[0]) + j * ldb[1]];
                else
-                  B[i + j * ldb] = alpha * A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda] + beta * B[i + j * ldb];
+                  B[(i * ldb[0]) + j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]] + beta * B[(i * ldb[0]) + j * ldb[1]];
       )
    } else {
       if( useStreamingStores)
@@ -1013,18 +1400,18 @@ static void axpy_2D( const floatType* __restrict__ A, const int lda,
 _Pragma("vector nontemporal")
             for(int32_t i = offsetB_; i < n0 + offsetB_; i++)
                if( conjA )
-                  B[i + j * ldb] = alpha * conj(A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda]);
+                  B[(i * ldb[0])+ j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]);
                else
-                  B[i + j * ldb] = alpha * A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda];
+                  B[(i * ldb[0])+ j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]];
          )
       else
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t j = myStart; j < myEnd; j++)
                for(int32_t i = offsetB_; i < n0 + offsetB_; i++)
                   if( conjA )
-                     B[i + j * ldb] = alpha * conj(A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda]);
+                     B[(i * ldb[0]) + j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]);
                   else
-                     B[i + j * ldb] = alpha * A[(i + offDiffAB_[0]) + (j + offDiffAB_[1]) * lda];
+                     B[(i * ldb[0]) + j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]];
          )
    }
 }
@@ -1087,9 +1474,9 @@ void Transpose<floatType>::execute_expert() noexcept
       getStartEnd<spawnThreads>(sizeA_[0], myStart, myEnd);
       const int offDiffAB_ = (int)offsetA_[0] - (int)offsetB_[0];
       if( conjA_ )
-         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, alpha_, beta_, numThreads_ );
+         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, lda_[0], ldb_[0], alpha_, beta_, numThreads_ );
       else
-         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, alpha_, beta_, numThreads_ );
+         axpy_1D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, lda_[0], ldb_[0], alpha_, beta_, numThreads_ );
       return;
    }
    else if( dim_ == 2 && perm_[0] == 0)
@@ -1097,9 +1484,9 @@ void Transpose<floatType>::execute_expert() noexcept
       getStartEnd<spawnThreads>(sizeA_[1], myStart, myEnd);
       const int offDiffAB_[2] = {((int)offsetA_[0] - (int)offsetB_[0]), ((int)offsetA_[1] - (int)offsetB_[1])};
       if( conjA_ )
-         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ );
+         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, true>( A_, {lda_[0], lda_[1]}, B_, {ldb_[0], ldb_[1]}, sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ );
       else
-         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ );
+         axpy_2D<betaIsZero, floatType, useStreamingStores, spawnThreads, false>( A_, {lda_[0], lda_[1]}, B_, {ldb_[0], ldb_[1]}, sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ );
       return;
    }
 
@@ -1112,9 +1499,9 @@ void Transpose<floatType>::execute_expert() noexcept
          if ( perm_[0] != 0 ) {
             auto rootNode = masterPlan_->getRootNode_const( taskId );
             if( conjA_ )
-               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, true>( A_, A_, B_, B_, alpha_, beta_, rootNode);
+               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, true>( A_, A_,innerStrideA_, B_, B_,innerStrideB_, alpha_, beta_, rootNode);
             else
-               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, false>( A_, A_, B_, B_, alpha_, beta_, rootNode);
+               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, false>( A_, A_,innerStrideA_, B_, B_,innerStrideB_, alpha_, beta_, rootNode);
          } else {
             auto rootNode = masterPlan_->getRootNode_const( taskId );
             if( conjA_ )
@@ -1517,7 +1904,7 @@ void Transpose<floatType>::getParallelismStrategies(std::vector<std::vector<int>
 }
 
 template<typename floatType>
-void Transpose<floatType>::verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const int dim) const
+void Transpose<floatType>::verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const size_t innerStrideA, const size_t innerStrideB, const int dim) const
 {
    if ( dim < 1 ) {
       fprintf(stderr,"[HPTT] ERROR: dimensionality too low.\n");
@@ -1568,12 +1955,22 @@ void Transpose<floatType>::verifyParameter(const int *size, const int* perm, con
             fprintf(stderr,"[HPTT] ERROR: offsetB invalid\n");
             exit(-1);
          }
+      
+   if ( innerStrideA < 0 ) {
+      fprintf(stderr,"[HPTT] ERROR: innerStrideA invalid\n");
+      exit(-1);
+   }
+
+   if ( innerStrideB < 0 ) {
+      fprintf(stderr,"[HPTT] ERROR: innerStrideB invalid\n");
+      exit(-1);
+   }
 }
 
 template<typename floatType>
 void Transpose<floatType>::computeLeadingDimensions()
 {
-   lda_[0] = 1;
+   lda_[0] = innerStrideA_;
    if( outerSizeA_[0] == -1 )
       for(int i=1;i < dim_ ; ++i)
          lda_[i] = lda_[i-1] * sizeA_[i-1];
@@ -1581,7 +1978,7 @@ void Transpose<floatType>::computeLeadingDimensions()
       for(int i=1;i < dim_ ; ++i)
          lda_[i] = outerSizeA_[i-1] * lda_[i-1];
 
-   ldb_[0] = 1;
+   ldb_[0] = innerStrideB_;
    if( outerSizeB_[0] == -1 )
       for(int i=1;i < dim_ ; ++i)
          ldb_[i] = ldb_[i-1] * sizeA_[perm_[i-1]];
@@ -1722,6 +2119,8 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
    printVector(sizeA_,"sizeA");
    printVector(outerSizeA_,"outerSizeA");
    printVector(outerSizeB_,"outerSizeB");
+   printf("innerStrideA: %lu\n",innerStrideA_);
+   printf("innerStrideB: %lu\n",innerStrideB_);
 #endif
 }
 
diff --git a/testframework/testframework.cpp b/testframework/testframework.cpp
index e8b0140..b170d1e 100644
--- a/testframework/testframework.cpp
+++ b/testframework/testframework.cpp
@@ -73,6 +73,7 @@ template<typename floatType>
 static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, 
       uint32_t *outerSizeA, uint32_t *outerSizeB,
       uint32_t *offsetA, uint32_t *offsetB,
+      int &innerStrideA, int &innerStrideB,
       floatType &beta, 
       int &numThreads, 
       std::string &perm_str, std::string &size_str, 
@@ -80,7 +81,7 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size,
       std::string &outerSizeB_str, std::string &offsetB_str, 
       const int total_size, bool subTensors)
 {
-   dim = (rand() % MAX_DIM) + 1;
+   dim = 8;//(rand() % MAX_DIM) + 1;
    uint32_t maxSizeDim = std::max(1.0, std::pow(total_size, 1.0/dim));
    std::vector<int> perm_(dim);
    for(int i=0;i < dim ; ++i){
@@ -111,6 +112,12 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size,
    else
       beta = 4.0;
 
+   // Provide a larger inner stride if the tensor is less than a integer factor of the total size
+   int ordinalSizeA = std::accumulate(outerSizeA, outerSizeA+dim, 1, std::multiplies<uint32_t>());
+   int ordinalSizeB = std::accumulate(outerSizeB, outerSizeB+dim, 1, std::multiplies<uint32_t>());
+   innerStrideA = (ordinalSizeA < (total_size / 4)) ? 2 : 1;
+   innerStrideB = (ordinalSizeB < (total_size / 4)) ? 2 : 1;
+
    for(int i=0;i < dim ; ++i){
       perm_str += std::to_string(perm[i]) + " ";
       size_str += std::to_string(size[i]) + " ";
@@ -127,6 +134,10 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size,
    printf("outerSizeB: %s\n", outerSizeB_str.c_str());
    printf("offsetA: %s\n", offsetA_str.c_str());
    printf("offsetB: %s\n", offsetB_str.c_str());
+   printf("ordinalSizeA: %d\n", ordinalSizeA);
+   printf("ordinalSizeB: %d\n", ordinalSizeB);
+   printf("innerStrideA: %d\n", innerStrideA);
+   printf("innerStrideB: %d\n", innerStrideB);
    printf("numThreads: %d\n",numThreads);
 }
 
@@ -136,7 +147,6 @@ void runTests(bool subTensors = false)
    int numThreads = 1;
    floatType alpha = 2.;
    floatType beta = 4.;
-   //beta = 0; 
 
    srand(time(NULL));
    int dim;
@@ -146,6 +156,8 @@ void runTests(bool subTensors = false)
    uint32_t outerSizeB[MAX_DIM];
    uint32_t offsetA[MAX_DIM];
    uint32_t offsetB[MAX_DIM];
+   int innerStrideA = 2;
+   int innerStrideB = 2;
    size_t total_size = 128*1024*1024;
 
    // Allocating memory for tensors
@@ -169,6 +181,9 @@ void runTests(bool subTensors = false)
       B_ref[i]  = B[i];
       B_hptt[i] = B[i];
    }
+   printf("Total size: %lu\n", total_size);
+   printf("Last element of A: %f\n", A[total_size-1]);
+   printf("Last element of B: %f\n", B[total_size-1]);
 
    for(int j=0; j < NUM_TESTS; ++j)
    {  
@@ -179,7 +194,15 @@ void runTests(bool subTensors = false)
       std::string offsetA_str = "";
       std::string offsetB_str = "";
       std::cout<<"Test "<<j<<std::endl;
-      getRandomTest(dim, perm, size, outerSizeA, outerSizeB, offsetA, offsetB, beta, numThreads, perm_str, size_str, outerSizeA_str, offsetA_str, outerSizeB_str, offsetB_str, total_size, subTensors);
+      getRandomTest(dim, perm, size, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, beta, numThreads, perm_str, size_str, outerSizeA_str, offsetA_str, outerSizeB_str, offsetB_str, total_size, subTensors);
+      //dim = 8;
+      //beta = 0.0;
+      //int presetPerm[8] = {6, 3, 7, 2, 0, 1, 4, 5};
+      //int presetSize[8] = {4, 5, 8, 1, 7, 4, 9, 9};
+      //int presetOuterSizeA[8] = {4, 5, 8, 1, 7, 4, 9, 9};
+      //int presetOuterSizeB[8] = {9, 1, 9, 8, 4, 5, 7, 4};
+      //int presetOffsetA[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+      //int presetOffsetB[8] = {0, 0, 0, 0, 0, 0, 0, 0};
       int perm_[dim];
       int size_[dim];
       int outerSizeA_[dim];
@@ -193,16 +216,21 @@ void runTests(bool subTensors = false)
          outerSizeB_[i] = (int)outerSizeB[i];
          offsetA_[i] = (int)offsetA[i];
          offsetB_[i] = (int)offsetB[i];
+         //perm[i] = (int)presetPerm[i];
+         //size[i] = (int)presetSize[i];
+         //outerSizeA[i] = (int)presetOuterSizeA[i];
+         //outerSizeB[i] = (int)presetOuterSizeB[i];
+         //offsetA[i] = (int)presetOffsetA[i];
+         //offsetB[i] = (int)presetOffsetB[i];
       }
 
       auto plan = hptt::create_plan( perm_, dim, 
-            alpha, A, size_, outerSizeA_, offsetA_,
-            beta, B_hptt, outerSizeB_, offsetB_,
+            alpha, A, size_, outerSizeA_, offsetA_, innerStrideA,
+            beta, B_hptt, outerSizeB_, offsetB_, innerStrideB,
             hptt::ESTIMATE, numThreads);
 
       restore(B, B_ref, total_size);
-      transpose_ref<floatType>(size, perm, dim, A, alpha, outerSizeA_, offsetA_, B_ref, beta, outerSizeB_, offsetB_, false);
-
+      transpose_ref<floatType>(size, perm, dim, A, alpha, outerSizeA_, offsetA_, innerStrideA, B_ref, beta, outerSizeB_, offsetB_, innerStrideB, false);
       restore(B, B_hptt, total_size);
       plan->execute();
 

From 933981b84383056d255f168ab22545a20a8e1d87 Mon Sep 17 00:00:00 2001
From: Nathan Harmer <njh80@cam.ac.uk>
Date: Fri, 28 Mar 2025 15:54:49 +0000
Subject: [PATCH 3/3] FIX: Testing more unique cases - Inner Strides may be any
 value between one and the number of dimenions (a small random number) and
 number of dimension any value between 1 and MAX_DIM again.

---
 testframework/testframework.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/testframework/testframework.cpp b/testframework/testframework.cpp
index b170d1e..2b116ed 100644
--- a/testframework/testframework.cpp
+++ b/testframework/testframework.cpp
@@ -81,7 +81,7 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size,
       std::string &outerSizeB_str, std::string &offsetB_str, 
       const int total_size, bool subTensors)
 {
-   dim = 8;//(rand() % MAX_DIM) + 1;
+   dim = (rand() % MAX_DIM) + 1;
    uint32_t maxSizeDim = std::max(1.0, std::pow(total_size, 1.0/dim));
    std::vector<int> perm_(dim);
    for(int i=0;i < dim ; ++i){
@@ -115,8 +115,8 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size,
    // Provide a larger inner stride if the tensor is less than a integer factor of the total size
    int ordinalSizeA = std::accumulate(outerSizeA, outerSizeA+dim, 1, std::multiplies<uint32_t>());
    int ordinalSizeB = std::accumulate(outerSizeB, outerSizeB+dim, 1, std::multiplies<uint32_t>());
-   innerStrideA = (ordinalSizeA < (total_size / 4)) ? 2 : 1;
-   innerStrideB = (ordinalSizeB < (total_size / 4)) ? 2 : 1;
+   innerStrideA = (ordinalSizeA < (total_size / (dim*4))) ? std::max((((double)rand())/RAND_MAX) * dim, 1.) : 1;
+   innerStrideB = (ordinalSizeB < (total_size / (dim*4))) ? std::max((((double)rand())/RAND_MAX) * dim, 1.) : 1;
 
    for(int i=0;i < dim ; ++i){
       perm_str += std::to_string(perm[i]) + " ";