diff --git a/CMakeLists.txt b/CMakeLists.txt
index 803f5dba41..0146130cee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,10 @@ include(cmake/CPM.cmake)
 
 find_package(Git)
 
+if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
 # By default we will build DEVEL. The different build types will pass different
 # flags to the compiler which may be strict or permissive on warnings, or
 # very verbose at run time and/or compile time.
@@ -337,7 +341,14 @@ set(CMAKE_CXX_FLAGS_STRICT
   "-Os"
   CACHE STRING "Flags used by the C++ compiler during strict jenkins builds.")
+# cl.exe does not recognise -O3 and GCC/Clang reject -Zi, so the release
+# optimisation flags have to be chosen per compiler rather than hard-coded.
+if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  set(QUDA_RELEASE_OPT "-Zi -O2")
+else()
+  set(QUDA_RELEASE_OPT "-O3")
+endif()
 set(CMAKE_CXX_FLAGS_RELEASE
-  "-O3 ${CXX_OPT}"
+  "${QUDA_RELEASE_OPT} ${CXX_OPT}"
   CACHE STRING "Flags used by the C++ compiler during release builds.")
 set(CMAKE_CXX_FLAGS_HOSTDEBUG
     "-g"
@@ -359,7 +370,7 @@ set(CMAKE_C_FLAGS_STRICT
   "-Os"
   CACHE STRING "Flags used by the C compiler during strict jenkins builds.")
 set(CMAKE_C_FLAGS_RELEASE
-  "-O3"
+  "${QUDA_RELEASE_OPT}"
   CACHE STRING "Flags used by the C compiler during release builds.")
 set(CMAKE_C_FLAGS_HOSTDEBUG
     "-g"
@@ -709,5 +720,7 @@ include(CTest)
 
 # add tests, utils, reference, and quda library
 add_subdirectory(lib)
-add_subdirectory(tests)
+if (NOT WIN32)
+  add_subdirectory(tests)
+endif()
 add_subdirectory(doc)
diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 25f5234390..0664a40c47 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1196,7 +1196,7 @@ namespace quda
       using Accessor = FloatNOrder<Float, Ns, Nc, N, spin_project, huge_alloc, disable_ghost>;
       using GhostNOrder = GhostNOrder<Float, Ns, Nc, N, spin_project, huge_alloc, disable_ghost>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       using Vector = typename VectorType<Float, N>::type;
       using AllocInt = typename AllocType<huge_alloc>::type;
       using norm_type = float;
@@ -1432,7 +1432,7 @@ namespace quda
       using Accessor = FloatNOrder<Float, Ns, Nc, N_, spin_project, huge_alloc, disable_ghost>;
       using GhostNOrder = GhostNOrder<Float, Ns, Nc, N_, spin_project, huge_alloc, disable_ghost>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       using Vector = int4;      // 128-bit packed type
       using AllocInt = typename AllocType<huge_alloc>::type;
       using norm_type = float;
diff --git a/include/communicator_quda.h b/include/communicator_quda.h
index aec02b8c2a..c4b0718904 100644
--- a/include/communicator_quda.h
+++ b/include/communicator_quda.h
@@ -1,6 +1,8 @@
 #pragma once
 
-#include <unistd.h> // for gethostname()
+#ifndef _MSC_VER
+#include <unistd.h> // for gethostname()
+#endif
 #include <cassert>
 #include <csignal>
 #include <limits>
diff --git a/include/complex_quda.h b/include/complex_quda.h
index 18da63def5..9dbc6c95a1 100644
--- a/include/complex_quda.h
+++ b/include/complex_quda.h
@@ -20,6 +20,11 @@
 
 #pragma once
 
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#include <math.h>
+#undef _USE_MATH_DEFINES
+#endif
 #include <cmath>
 #include <complex>
 #include <sstream>
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 38079c3cbb..2b8c067c99 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1852,7 +1852,7 @@ namespace quda {
     template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float,length> {
       using Accessor = QDPOrder<Float, length>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       Float *gauge[QUDA_MAX_DIM];
       const unsigned int volumeCB;
       QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
@@ -1898,7 +1898,7 @@ namespace quda {
     template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float,length> {
       using Accessor = QDPJITOrder<Float, length>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       Float *gauge[QUDA_MAX_DIM];
       const unsigned int volumeCB;
       QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
@@ -1948,7 +1948,7 @@ namespace quda {
   template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float,length> {
     using Accessor = MILCOrder<Float, length>;
     using real = typename mapper<Float>::type;
-    using complex = complex<real>;
+    using complex = quda::complex<real>;
     Float *gauge;
     const unsigned int volumeCB;
     const int geometry;
@@ -2009,7 +2009,7 @@ namespace quda {
   template <typename Float, int length> struct MILCSiteOrder : public LegacyOrder<Float,length> {
     using Accessor = MILCSiteOrder<Float, length>;
     using real = typename mapper<Float>::type;
-    using complex = complex<real>;
+    using complex = quda::complex<real>;
     Float *gauge;
     const unsigned int volumeCB;
     const int geometry;
@@ -2068,7 +2068,7 @@ namespace quda {
   template <typename Float, int length> struct CPSOrder : LegacyOrder<Float,length> {
     using Accessor = CPSOrder<Float, length>;
     using real = typename mapper<Float>::type;
-    using complex = complex<real>;
+    using complex = quda::complex<real>;
     Float *gauge;
     const unsigned int volumeCB;
     const real anisotropy;
@@ -2137,7 +2137,7 @@ namespace quda {
     template <typename Float, int length> struct BQCDOrder : LegacyOrder<Float,length> {
       using Accessor = BQCDOrder<Float, length>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       Float *gauge;
       const unsigned int volumeCB;
       unsigned int exVolumeCB; // extended checkerboard volume
@@ -2199,7 +2199,7 @@ namespace quda {
     template <typename Float, int length> struct TIFROrder : LegacyOrder<Float,length> {
       using Accessor = TIFROrder<Float, length>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       Float *gauge;
       const unsigned int volumeCB;
       static constexpr int Nc = 3;
@@ -2263,7 +2263,7 @@ namespace quda {
     template <typename Float, int length> struct TIFRPaddedOrder : LegacyOrder<Float,length> {
       using Accessor = TIFRPaddedOrder<Float, length>;
       using real = typename mapper<Float>::type;
-      using complex = complex<real>;
+      using complex = quda::complex<real>;
       Float *gauge;
       const unsigned int volumeCB;
       int exVolumeCB;
diff --git a/include/instantiate.h b/include/instantiate.h
index 8eee6ad269..6597fb3156 100644
--- a/include/instantiate.h
+++ b/include/instantiate.h
@@ -33,41 +33,32 @@ namespace quda
      @brief Helper function for returning if a given gauge field order is enabled
-     @tparam order The order requested
+     @param[in] order The order requested
    */
-  template <QudaGaugeFieldOrder order> constexpr bool is_enabled();
+  constexpr bool is_enabled(QudaGaugeFieldOrder order)
+  {
+    switch (order) {
 #ifdef BUILD_QDP_INTERFACE
-  template <> constexpr bool is_enabled<QUDA_QDP_GAUGE_ORDER>() { return true; }
-#else
-  template <> constexpr bool is_enabled<QUDA_QDP_GAUGE_ORDER>() { return false; }
+    case QUDA_QDP_GAUGE_ORDER: return true;
 #endif
 #ifdef BUILD_QDPJIT_INTERFACE
-  template <> constexpr bool is_enabled<QUDA_QDPJIT_GAUGE_ORDER>() { return true; }
-#else
-  template <> constexpr bool is_enabled<QUDA_QDPJIT_GAUGE_ORDER>() { return false; }
+    case QUDA_QDPJIT_GAUGE_ORDER: return true;
 #endif
 #ifdef BUILD_CPS_INTERFACE
-  template <> constexpr bool is_enabled<QUDA_CPS_WILSON_GAUGE_ORDER>() { return true; }
-#else
-  template <> constexpr bool is_enabled<QUDA_CPS_WILSON_GAUGE_ORDER>() { return false; }
+    case QUDA_CPS_WILSON_GAUGE_ORDER: return true;
 #endif
 #ifdef BUILD_MILC_INTERFACE
-  template <> constexpr bool is_enabled<QUDA_MILC_GAUGE_ORDER>() { return true; }
-  template <> constexpr bool is_enabled<QUDA_MILC_SITE_GAUGE_ORDER>() { return true; }
-#else
-  template <> constexpr bool is_enabled<QUDA_MILC_GAUGE_ORDER>() { return false; }
-  template <> constexpr bool is_enabled<QUDA_MILC_SITE_GAUGE_ORDER>() { return false; }
+    case QUDA_MILC_GAUGE_ORDER: return true;
+    case QUDA_MILC_SITE_GAUGE_ORDER: return true;
 #endif
 #ifdef BUILD_BQCD_INTERFACE
-  template <> constexpr bool is_enabled<QUDA_BQCD_GAUGE_ORDER>() { return true; }
-#else
-  template <> constexpr bool is_enabled<QUDA_BQCD_GAUGE_ORDER>() { return false; }
+    case QUDA_BQCD_GAUGE_ORDER: return true;
 #endif
 #ifdef BUILD_TIFR_INTERFACE
-  template <> constexpr bool is_enabled<QUDA_TIFR_GAUGE_ORDER>() { return true; }
-  template <> constexpr bool is_enabled<QUDA_TIFR_PADDED_GAUGE_ORDER>() { return true; }
-#else
-  template <> constexpr bool is_enabled<QUDA_TIFR_GAUGE_ORDER>() { return false; }
-  template <> constexpr bool is_enabled<QUDA_TIFR_PADDED_GAUGE_ORDER>() { return false; }
+    case QUDA_TIFR_GAUGE_ORDER: return true;
+    case QUDA_TIFR_PADDED_GAUGE_ORDER: return true;
 #endif
+    default: return false; // any order whose interface was not built
+    }
+  }
 
   /**
      @brief Helper function for returning if a given precision is enabled
@@ -89,13 +80,18 @@ namespace quda
-     @tparam reconstruct The reconstruct requested
+     @param[in] reconstruct The reconstruct requested
      @return True if enabled, false if not
   */
-  template <QudaReconstructType reconstruct> constexpr bool is_enabled();
-  template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_NO>() { return (QUDA_RECONSTRUCT & 4) ? true : false; }
-  template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_13>() { return (QUDA_RECONSTRUCT & 2) ? true : false; }
-  template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_12>() { return (QUDA_RECONSTRUCT & 2) ? true : false; }
-  template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_9>() { return (QUDA_RECONSTRUCT & 1) ? true : false; }
-  template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_8>() { return (QUDA_RECONSTRUCT & 1) ? true : false; }
-  template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_10>() { return true; }
+  constexpr bool is_enabled(QudaReconstructType reconstruct)
+  {
+    switch (reconstruct) {
+    case QUDA_RECONSTRUCT_NO: return (QUDA_RECONSTRUCT & 4) ? true : false;
+    case QUDA_RECONSTRUCT_13: return (QUDA_RECONSTRUCT & 2) ? true : false;
+    case QUDA_RECONSTRUCT_12: return (QUDA_RECONSTRUCT & 2) ? true : false;
+    case QUDA_RECONSTRUCT_9: return (QUDA_RECONSTRUCT & 1) ? true : false;
+    case QUDA_RECONSTRUCT_8: return (QUDA_RECONSTRUCT & 1) ? true : false;
+    case QUDA_RECONSTRUCT_10: return true; // not gated by the QUDA_RECONSTRUCT mask
+    default: return false;
+    }
+  }
 
   struct ReconstructFull {
     static constexpr std::array<QudaReconstructType, 6> recon
@@ -142,8 +138,8 @@ namespace quda
   void instantiateReconstruct(G &U, Args &&...args)
   {
     if (U.Reconstruct() == Recon::recon[i]) {
-      if constexpr (is_enabled<Recon::recon[i]>())
-        Apply<Float, nColor, Recon::recon[i]>(U, args...);
+      if constexpr (is_enabled(Recon::recon[i]))
+        Apply<Float, nColor, Recon::recon[i]> apply(U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable %d", QUDA_RECONSTRUCT, Recon::recon[i]);
     } else if constexpr (i > 0) {
@@ -471,13 +467,13 @@ namespace quda
   constexpr void instantiateGaugeStaggered(G &U, Args &&...args)
   {
     if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
-      if constexpr (is_enabled<QUDA_RECONSTRUCT_NO>())
+      if constexpr (is_enabled(QUDA_RECONSTRUCT_NO))
         // actual phase type doesn't matter because the phase is baked into the links
         Apply<store_t, nColor, QUDA_RECONSTRUCT_NO, QUDA_STAGGERED_PHASE_NO>(U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable %d", QUDA_RECONSTRUCT, QUDA_RECONSTRUCT_NO);
     } else if (U.Reconstruct() == QUDA_RECONSTRUCT_13) {
-      if constexpr (is_enabled<QUDA_RECONSTRUCT_13>()) {
+      if constexpr (is_enabled(QUDA_RECONSTRUCT_13)) {
         if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_NO)
           Apply<store_t, nColor, QUDA_RECONSTRUCT_13, QUDA_STAGGERED_PHASE_NO>(U, args...);
         else if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC)
@@ -488,7 +484,7 @@ namespace quda
         errorQuda("QUDA_RECONSTRUCT=%d does not enable %d", QUDA_RECONSTRUCT, QUDA_RECONSTRUCT_13);
       }
     } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
-      if constexpr (is_enabled<QUDA_RECONSTRUCT_12>()) {
+      if constexpr (is_enabled(QUDA_RECONSTRUCT_12)) {
         errorQuda("QUDA_RECONSTRUCT=%d has not been implemented for HISQ gauge routines yet.", QUDA_RECONSTRUCT_12);
       } else {
         errorQuda("QUDA_RECONSTRUCT=%d does not enable %d\n", QUDA_RECONSTRUCT, QUDA_RECONSTRUCT_12);
@@ -545,38 +541,43 @@ namespace quda
-     @tparam dslash_type The dslash_type requested
+     @param[in] dslash_type The dslash_type requested
      @return True if enabled, false if not
   */
-  template <QudaDslashType dslash_type> constexpr bool is_enabled() { return false; }
+  constexpr bool is_enabled(QudaDslashType dslash_type)
+  {
+    switch (dslash_type) {
 #ifdef GPU_WILSON_DIRAC
-  template <> constexpr bool is_enabled<QUDA_WILSON_DSLASH>() { return true; }
+    case QUDA_WILSON_DSLASH: return true;
 #endif
 #ifdef GPU_CLOVER_DIRAC
-  template <> constexpr bool is_enabled<QUDA_CLOVER_WILSON_DSLASH>() { return true; }
+    case QUDA_CLOVER_WILSON_DSLASH: return true;
 #endif
 #ifdef GPU_CLOVER_HASENBUSCH_TWIST
-  template <> constexpr bool is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>() { return true; }
+    case QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH: return true;
 #endif
 #ifdef GPU_DOMAIN_WALL_DIRAC
-  template <> constexpr bool is_enabled<QUDA_DOMAIN_WALL_DSLASH>() { return true; }
-  template <> constexpr bool is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>() { return true; }
-  template <> constexpr bool is_enabled<QUDA_MOBIUS_DWF_DSLASH>() { return true; }
-  template <> constexpr bool is_enabled<QUDA_MOBIUS_DWF_EOFA_DSLASH>() { return true; }
+    case QUDA_DOMAIN_WALL_DSLASH: return true;
+    case QUDA_DOMAIN_WALL_4D_DSLASH: return true;
+    case QUDA_MOBIUS_DWF_DSLASH: return true;
+    case QUDA_MOBIUS_DWF_EOFA_DSLASH: return true;
 #endif
 #ifdef GPU_STAGGERED_DIRAC
-  template <> constexpr bool is_enabled<QUDA_STAGGERED_DSLASH>() { return true; }
-  template <> constexpr bool is_enabled<QUDA_ASQTAD_DSLASH>() { return true; }
+    case QUDA_STAGGERED_DSLASH: return true;
+    case QUDA_ASQTAD_DSLASH: return true;
 #endif
 #ifdef GPU_TWISTED_MASS_DIRAC
-  template <> constexpr bool is_enabled<QUDA_TWISTED_MASS_DSLASH>() { return true; }
+    case QUDA_TWISTED_MASS_DSLASH: return true;
 #endif
 #ifdef GPU_TWISTED_CLOVER_DIRAC
-  template <> constexpr bool is_enabled<QUDA_TWISTED_CLOVER_DSLASH>() { return true; }
+    case QUDA_TWISTED_CLOVER_DSLASH: return true;
 #endif
 #ifdef GPU_LAPLACE
-  template <> constexpr bool is_enabled<QUDA_LAPLACE_DSLASH>() { return true; }
+    case QUDA_LAPLACE_DSLASH: return true;
 #endif
 #ifdef GPU_COVDEV
-  template <> constexpr bool is_enabled<QUDA_COVDEV_DSLASH>() { return true; }
+    case QUDA_COVDEV_DSLASH: return true;
 #endif
+    default: return false; // any dslash type that was not compiled in
+    }
+  }
 
 #ifdef GPU_DISTANCE_PRECONDITIONING
   constexpr bool is_enabled_distance_precondition() { return true; }
diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h
index eab0ead243..d577c097af 100644
--- a/include/instantiate_dslash.h
+++ b/include/instantiate_dslash.h
@@ -22,17 +22,17 @@ namespace quda
                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, Args &&...args)
   {
     if (U.Reconstruct() == Recon::recon[0]) {
-      if constexpr (is_enabled<QUDA_RECONSTRUCT_NO>())
+      if constexpr (is_enabled(QUDA_RECONSTRUCT_NO))
         Apply<Float, nColor, Recon::recon[0]>(out, in, x, U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-18", QUDA_RECONSTRUCT);
     } else if (U.Reconstruct() == Recon::recon[1]) {
-      if constexpr (is_enabled<QUDA_RECONSTRUCT_12>())
+      if constexpr (is_enabled(QUDA_RECONSTRUCT_12))
         Apply<Float, nColor, Recon::recon[1]>(out, in, x, U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12/13", QUDA_RECONSTRUCT);
     } else if (U.Reconstruct() == Recon::recon[2]) {
-      if constexpr (is_enabled<QUDA_RECONSTRUCT_8>())
+      if constexpr (is_enabled(QUDA_RECONSTRUCT_8))
         Apply<Float, nColor, Recon::recon[2]>(out, in, x, U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8/9", QUDA_RECONSTRUCT);
diff --git a/include/kernel_helper.h b/include/kernel_helper.h
index dcb33baba0..336c9fe297 100644
--- a/include/kernel_helper.h
+++ b/include/kernel_helper.h
@@ -2,6 +2,9 @@
 
 #include "comm_quda.h"
 
+#undef TRUE
+#undef FALSE
+
 namespace quda
 {
 
diff --git a/include/kernels/dslash_mdw_fused.cuh b/include/kernels/dslash_mdw_fused.cuh
index 67f98b30cc..a28180605d 100644
--- a/include/kernels/dslash_mdw_fused.cuh
+++ b/include/kernels/dslash_mdw_fused.cuh
@@ -165,7 +165,7 @@ namespace quda {
       bool ret = false;
 #pragma unroll
       for (int d = 0; d < 4; d++) {
-        ret = ret or (coordinate[d] >= dim[d] - halo_shift[d] or coordinate[d] < halo_shift[d]);
+        ret = ret || (coordinate[d] >= dim[d] - halo_shift[d] || coordinate[d] < halo_shift[d]);
       }
       return ret;
     }
diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh
index 7bea55d908..cf8d7bdbfe 100644
--- a/include/kernels/evec_project.cuh
+++ b/include/kernels/evec_project.cuh
@@ -11,8 +11,8 @@ namespace quda {
   
   using spinor_array = array<double, 8>;
 
-  constexpr unsigned long max_nx = 4;
-  constexpr unsigned long max_ny = 4;
+  constexpr unsigned long long max_nx = 4;
+  constexpr unsigned long long max_ny = 4;
 
   template <typename Float, int nColor_>
   struct EvecProjectionArg : public ReduceArg<spinor_array>
diff --git a/include/kernels/gauge_random.cuh b/include/kernels/gauge_random.cuh
index 92c2b1995e..82d4ffea40 100644
--- a/include/kernels/gauge_random.cuh
+++ b/include/kernels/gauge_random.cuh
@@ -82,12 +82,12 @@ namespace quda {
       getCoords(x, x_cb, arg.X, parity);
       for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
-      if (arg.group and arg.sigma == 0.0) {
+      if (arg.group && arg.sigma == 0.0) {
         // if sigma = 0 then we just set the output matrix to the identity and finish
         Link I;
         setIdentity(&I);
         for (int mu = 0; mu < 4; mu++) arg.U(mu, linkIndex(x, arg.E), parity) = I;
-      } else if (not arg.group and arg.sigma == 0.0) {
+      } else if (!arg.group && arg.sigma == 0.0) {
         // if sigma = 0 then we just set the output matrix to the zero and finish
         Link O = {};
         for (int mu = 0; mu < 4; mu++) arg.U(mu, linkIndex(x, arg.E), parity) = O;
diff --git a/include/multi_blas_helper.cuh b/include/multi_blas_helper.cuh
index 93dd15249c..58e105f956 100644
--- a/include/multi_blas_helper.cuh
+++ b/include/multi_blas_helper.cuh
@@ -235,7 +235,7 @@ namespace quda
       static_assert(coeff_nyw != 0, "coeff_nyw is zero");
 
       // additional limit since there's diminished benefit past a certain point
-      constexpr auto max_nyw = 128lu;
+      constexpr decltype(coeff_nyw) max_nyw = 128; // coeff_nyw's type, so std::min deduces on LP64 and LLP64
 
       return std::min(arg_nyw, std::min(coeff_nyw, max_nyw));
     }
@@ -290,7 +290,7 @@ namespace quda
       const auto coeff_nyw = Functor::coeff_mul ? max_array_size() / (NXZ * sizeof(typename Functor::coeff_t)) : arg_nyw;
 
       // additional limit since there's diminished benefit past a certain point
-      constexpr auto max_nyw = 128lu;
+      constexpr decltype(coeff_nyw) max_nyw = 128; // coeff_nyw's type, so std::min deduces on LP64 and LLP64
 
       return std::min(arg_nyw, std::min(coeff_nyw, max_nyw));
     }
diff --git a/include/polynomial.h b/include/polynomial.h
index aa51d372ed..69eeef551d 100644
--- a/include/polynomial.h
+++ b/include/polynomial.h
@@ -1,5 +1,10 @@
 #pragma once
 
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#include <math.h>
+#undef _USE_MATH_DEFINES
+#endif
 #include <cmath>
 #include <array>
 #include <vector>
diff --git a/include/quda.h b/include/quda.h
index 155791b60a..fc1580e12f 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -15,7 +15,23 @@
 #include <quda_constants.h>
 
 #ifndef __CUDACC_RTC__
+#ifdef _MSC_VER
+#ifdef __cplusplus
+#include <complex>
+extern "C" typedef struct {
+  double real, imag;
+  operator std::complex<double>() const { return std::complex<double>(real, imag); }
+  operator std::complex<float>() const { return std::complex<float>(real, imag); }
+} _Dcomplex;
+#else
+typedef struct {
+  double real, imag;
+} _Dcomplex;
+#endif
+#define double_complex _Dcomplex
+#else
 #define double_complex double _Complex
+#endif
 #else // keep NVRTC happy since it can't handle C types
 #define double_complex double2
 #endif
@@ -1856,15 +1872,17 @@ extern "C" {
    * @param[in] inv_param Meta-data structure
    * @param[in] X Lattice dimensions
    */
-  void laphSinkProject(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark,
+  void laphSinkProject(double_complex *host_sinks, void **host_quark, int n_quark, int tile_quark,
                        void **host_evec, int nevec, int tile_evec, QudaInvertParam *inv_param, const int X[4]);
 
 #ifdef __cplusplus
 }
 #endif
 
+#ifndef _INTERFACE_
 // remove NVRTC WAR
 #undef double_complex
+#endif
 
 /* #include <quda_new_interface.h> */
 
diff --git a/include/quda_matrix.h b/include/quda_matrix.h
index d3a306f68e..8290d09762 100644
--- a/include/quda_matrix.h
+++ b/include/quda_matrix.h
@@ -995,11 +995,11 @@ namespace quda {
       // if its argument (tmp) is zero and then return unity.
 
       Complex tmp = a3 + sg2h3;
-      if (tmp.real() == 0 and tmp.imag() == 0) {
+      if (tmp.real() == 0 && tmp.imag() == 0) {
         // Making sure q is a zero matrix
         bool iszero = true;
         for (int i = 0; i < 9; i++) {
-          if (q(i).real() != 0 or q(i).imag() != 0) {
+          if (q(i).real() != 0 || q(i).imag() != 0) {
             iszero = false;
             break;
           }
diff --git a/include/reliable_updates.h b/include/reliable_updates.h
index ec70fb4f3c..b87e8ce393 100644
--- a/include/reliable_updates.h
+++ b/include/reliable_updates.h
@@ -130,8 +130,8 @@ namespace quda
     {
       if (params.alternative_reliable) {
         // alternative reliable updates
-        updateX = ((d <= deps * sqrt(r2_old)) or (dfac * dinit > deps * r0Norm)) and (d_new > deps * rNorm)
-          and (d_new > dfac * dinit);
+        updateX = ((d <= deps * sqrt(r2_old)) || (dfac * dinit > deps * r0Norm)) && (d_new > deps * rNorm)
+          && (d_new > dfac * dinit);
         updateR = 0;
       } else {
         if (rNorm > maxrx) maxrx = rNorm;
@@ -205,19 +205,19 @@ namespace quda
     bool reliable_break(double r2, double stop, bool &L2breakdown, double L2breakdown_eps)
     {
       // break-out check if we have reached the limit of the precision
-      if (sqrt(r2) > r0Norm && updateX and not L2breakdown) { // reuse r0Norm for this
+      if (sqrt(r2) > r0Norm && updateX && ! L2breakdown) { // reuse r0Norm for this
         resIncrease++;
         resIncreaseTotal++;
         warningQuda("new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
                     sqrt(r2), r0Norm, resIncreaseTotal);
 
-        if ((params.use_heavy_quark_res and sqrt(r2) < L2breakdown_eps) or resIncrease > params.maxResIncrease
-            or resIncreaseTotal > params.maxResIncreaseTotal or r2 < stop) {
+        if ((params.use_heavy_quark_res && sqrt(r2) < L2breakdown_eps) || resIncrease > params.maxResIncrease
+            || resIncreaseTotal > params.maxResIncreaseTotal || r2 < stop) {
           if (params.use_heavy_quark_res) {
             L2breakdown = true;
             warningQuda("L2 breakdown %e, %e", sqrt(r2), L2breakdown_eps);
           } else {
-            if (resIncrease > params.maxResIncrease or resIncreaseTotal > params.maxResIncreaseTotal or r2 < stop) {
+            if (resIncrease > params.maxResIncrease || resIncreaseTotal > params.maxResIncreaseTotal || r2 < stop) {
               warningQuda("solver exiting due to too many true residual norm increases");
               return true;
             }
@@ -239,7 +239,7 @@ namespace quda
     bool reliable_heavy_quark_break(bool L2breakdown, double heavy_quark_res, double heavy_quark_res_old,
                                     bool &heavy_quark_restart)
     {
-      if (params.use_heavy_quark_res and L2breakdown) {
+      if (params.use_heavy_quark_res && L2breakdown) {
         hqresRestartTotal++; // count the number of heavy quark restarts we've done
         delta = 0;
         warningQuda("CG: Restarting without reliable updates for heavy-quark residual (total #inc %i)",
diff --git a/include/targets/cuda/block_reduction_kernel.h b/include/targets/cuda/block_reduction_kernel.h
index 27551430c8..2bb75affaa 100644
--- a/include/targets/cuda/block_reduction_kernel.h
+++ b/include/targets/cuda/block_reduction_kernel.h
@@ -129,13 +129,15 @@ namespace quda
      per thread (in the x dimension).  Not supported at present.
      @param[in] arg Kernel argument
    */
+#ifndef _MSC_VER // this overload is left out of MSVC builds only
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   __launch_bounds__(Arg::launch_bounds ?
                       Arg::block_size :
                       0) __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> BlockKernel2D()
   {
     static_assert(!grid_stride, "grid_stride not supported for BlockKernel");
     BlockKernel2D_impl<Functor, Arg>(device::get_arg<Arg>());
   }
+#endif
 
 } // namespace quda
diff --git a/include/targets/cuda/fast_intdiv.h b/include/targets/cuda/fast_intdiv.h
index 3dc1d61a22..a85631140d 100644
--- a/include/targets/cuda/fast_intdiv.h
+++ b/include/targets/cuda/fast_intdiv.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#ifdef __CUDACC_RTC__
+
 // declaration of class we wish to specialize
 template <bool> struct mul_hi;
 
@@ -12,4 +14,6 @@ template <> struct mul_hi<true> {
   }
 };
 
+#endif
+
 #include "../generic/fast_intdiv.h"
diff --git a/include/targets/cuda/kernel.h b/include/targets/cuda/kernel.h
index 313457d1d4..486639a132 100644
--- a/include/targets/cuda/kernel.h
+++ b/include/targets/cuda/kernel.h
@@ -64,11 +64,13 @@ namespace quda
      per thread.
      @param[in] arg Kernel argument
    */
+#ifndef _MSC_VER // this overload is left out of MSVC builds only
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> Kernel1D()
   {
     Kernel1D_impl<Functor, Arg, grid_stride>(device::get_arg<Arg>());
   }
+#endif
 
   /**
      @brief Kernel2D_impl is the implementation of the generic 2-d
@@ -130,11 +132,13 @@ namespace quda
      per thread (in the x dimension)
      @param[in] arg Kernel argument
    */
+#ifndef _MSC_VER // this overload is left out of MSVC builds only
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> Kernel2D()
   {
     Kernel2D_impl<Functor, Arg, grid_stride>(device::get_arg<Arg>());
   }
+#endif
 
   /**
      @brief Kernel3D_impl is the implementation of the generic 3-d
@@ -198,11 +202,13 @@ namespace quda
      per thread (in the x dimension)
      @param[in] arg Kernel argument
    */
+#ifndef _MSC_VER // this overload is left out of MSVC builds only
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> Kernel3D()
   {
     Kernel3D_impl<Functor, Arg, grid_stride>(device::get_arg<Arg>());
   }
+#endif
 
   /**
      @brief raw_kernel is used for CUDA-specific kernels where we want
diff --git a/include/targets/cuda/math_helper.cuh b/include/targets/cuda/math_helper.cuh
index bdc333297a..1c944d36ce 100644
--- a/include/targets/cuda/math_helper.cuh
+++ b/include/targets/cuda/math_helper.cuh
@@ -1,5 +1,9 @@
 #pragma once
 
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#include <math.h>
+#endif
 #include <cmath>
 #include <target_device.h>
 
@@ -38,7 +42,7 @@ namespace quda {
     template <typename T> __device__ inline void operator()(const T& a, T *s, T *c)
     {
       BUILTIN_ASSUME(fabs(a) <= 2.0 * M_PI);
-      sincos(a, s, c);
+      ::sincos(a, s, c);
     }
   };
 
@@ -76,7 +80,7 @@ namespace quda {
   };
 
   template <> struct sincospi_impl<true> {
-    template <typename T> __device__ inline void operator()(const T& a, T *s, T *c) { sincospi(a, s, c); }
+    template <typename T> __device__ inline void operator()(const T& a, T *s, T *c) { ::sincospi(a, s, c); }
   };
 
 
diff --git a/include/targets/cuda/reduction_kernel.h b/include/targets/cuda/reduction_kernel.h
index b2d23e7897..0adac8b0fa 100644
--- a/include/targets/cuda/reduction_kernel.h
+++ b/include/targets/cuda/reduction_kernel.h
@@ -73,11 +73,13 @@ namespace quda
      per thread (in the x dimension)
      @param[in] arg Kernel argument
    */
+#ifndef _MSC_VER // this overload is left out of MSVC builds only
   template <template <typename> class Functor, typename Arg, bool grid_stride = true>
   __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> Reduction2D()
   {
     Reduction2D_impl<Functor, Arg, grid_stride>(device::get_arg<Arg>());
   }
+#endif
 
   /**
      @brief MultiReduction_impl is the implementation of the generic
@@ -150,10 +152,12 @@ namespace quda
      per thread (in the x dimension)
      @param[in] arg Kernel argument
    */
+#ifndef _MSC_VER // this overload is left out of MSVC builds only
   template <template <typename> class Functor, typename Arg, bool grid_stride = true>
   __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> MultiReduction()
   {
     MultiReduction_impl<Functor, Arg, grid_stride>(device::get_arg<Arg>());
   }
+#endif
 
 } // namespace quda
diff --git a/include/timer.h b/include/timer.h
index 9cbb58792a..568b24a265 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -1,6 +1,34 @@
 #pragma once
 
+#ifdef _MSC_VER
+#define NOMINMAX
+#include <winsock.h>
+#undef NOMINMAX
+#include <stdint.h>
+inline int gettimeofday(struct timeval * tv, struct timezone * tz)
+{
+  // Minimal Windows stand-in for POSIX gettimeofday(): fills tv with the time elapsed since the Unix epoch.
+  // EPOCH is the number of 100 nanosecond intervals from January 1, 1601 (UTC), the FILETIME origin,
+  // until 00:00:00 January 1, 1970 (note: the correct value has 9 trailing zeros, some broken copies have 8)
+  static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL);
+  static const uint64_t TICKS_PER_SECOND = 10000000ULL; // number of 100 ns FILETIME ticks in one second
+
+  (void)tz; // time zone information is not provided by this shim
+
+  // sample the clock once, directly as a FILETIME, so both fields derive from the same reading (needs Windows 8+)
+  FILETIME file_time;
+  GetSystemTimePreciseAsFileTime(&file_time);
+  const uint64_t ticks = (((uint64_t)file_time.dwHighDateTime) << 32) | ((uint64_t)file_time.dwLowDateTime);
+  const uint64_t since_epoch = ticks - EPOCH;
+
+  tv->tv_sec = (long)(since_epoch / TICKS_PER_SECOND);
+  tv->tv_usec = (long)((since_epoch % TICKS_PER_SECOND) / 10); // 100 ns ticks -> microseconds
+  return 0;
+}
+#else
 #include <sys/time.h>
+#endif
+
 #include <stack>
 #include <quda_internal.h>
 #include <util_quda.h>
diff --git a/include/util_quda.h b/include/util_quda.h
index 3d68fb5a2e..98699feec9 100644
--- a/include/util_quda.h
+++ b/include/util_quda.h
@@ -70,6 +70,10 @@ const char *getOmpThreadStr();
 
 void errorQuda_(const char *func, const char *file, int line, ...);
 
+#ifdef _MSC_VER
+#define __PRETTY_FUNCTION__ __FUNCSIG__ // under MSVC, map __PRETTY_FUNCTION__ onto the compiler's __FUNCSIG__ macro
+#endif
+
 #define errorQuda(...)                                                                                                 \
   do {                                                                                                                 \
     fprintf(getOutputFile(), "%sERROR: ", getOutputPrefix());                                                          \
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 950ac83a09..dcb2a2084c 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -301,8 +301,13 @@ if(NOT DEFINED QUDA_MAX_MULTI_RHS)
 endif()
 
 # make one library
-target_sources(quda PRIVATE $<TARGET_OBJECTS:quda_cpp> $<$<TARGET_EXISTS:quda_pack>:$<TARGET_OBJECTS:quda_pack>>
-                            ${QUDA_CU_OBJS})
+target_sources(quda PRIVATE $<TARGET_OBJECTS:quda_cpp> $<$<TARGET_EXISTS:quda_pack>:$<TARGET_OBJECTS:quda_pack>>
+                            ${QUDA_CU_OBJS})
+
+# MSVC builds additionally add the module-definition file quda.def; the common source list above is stated only once
+if (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  target_sources(quda PRIVATE quda.def)
+endif()
 
 # for a develop build reduce the size by compressing the debug information
 include(CheckLinkerFlag)
@@ -346,6 +351,10 @@ target_link_libraries(quda_cpp PRIVATE $<BUILD_INTERFACE:Eigen>)
 target_compile_definitions(quda_cpp PRIVATE $<TARGET_PROPERTY:quda,COMPILE_DEFINITIONS>)
 target_include_directories(quda_cpp PRIVATE $<TARGET_PROPERTY:quda,INCLUDE_DIRECTORIES>)
 target_compile_options(quda_cpp PRIVATE $<TARGET_PROPERTY:quda,COMPILE_OPTIONS>)
+if (WIN32)
+  target_link_libraries(quda PUBLIC WS2_32 DbgHelp)
+  target_link_libraries(quda_cpp PUBLIC WS2_32 DbgHelp)
+endif()
 
 add_subdirectory(targets/generic)
 target_include_directories(quda PRIVATE ../include/targets/generic)
diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp
index 2c581886da..f16e647cc3 100644
--- a/lib/comm_common.cpp
+++ b/lib/comm_common.cpp
@@ -1,4 +1,10 @@
+#ifdef _MSC_VER
+#define NOMINMAX
+#include <winsock.h>
+#undef NOMINMAX
+#else
 #include <unistd.h> // for gethostname()
+#endif
 #include <assert.h>
 #include <limits>
 
diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh
index f9c40ba9fc..cd99337c12 100644
--- a/lib/copy_color_spinor.cuh
+++ b/lib/copy_color_spinor.cuh
@@ -112,13 +112,13 @@ namespace quda
       CopyColorSpinor<Ns, Nc, O, I, param_t>(out, in, param);
     } else if (out.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
       using O = PaddedSpaceSpinorColorOrder<FloatOut, Ns, Nc>;
-      if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>())
+      if constexpr (is_enabled(QUDA_TIFR_GAUGE_ORDER))
         CopyColorSpinor<Ns, Nc, O, I, param_t>(out, in, param);
       else
         errorQuda("TIFR interface has not been built");
     } else if (out.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER) {
       using O = QDPJITDiracOrder<FloatOut, Ns, Nc>;
-      if constexpr (is_enabled<QUDA_QDPJIT_GAUGE_ORDER>())
+      if constexpr (is_enabled(QUDA_QDPJIT_GAUGE_ORDER))
         CopyColorSpinor<Ns, Nc, O, I, param_t>(out, in, param);
       else
         errorQuda("QDPJIT interface has not been built");
@@ -143,13 +143,13 @@ namespace quda
       genericCopyColorSpinor<Ns, Nc, I>(param);
     } else if (in.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
       using ColorSpinor = PaddedSpaceSpinorColorOrder<FloatIn, Ns, Nc>;
-      if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>())
+      if constexpr (is_enabled(QUDA_TIFR_GAUGE_ORDER))
         genericCopyColorSpinor<Ns, Nc, ColorSpinor>(param);
       else
         errorQuda("TIFR interface has not been built");
     } else if (in.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER) {
       using ColorSpinor = QDPJITDiracOrder<FloatIn, Ns, Nc>;
-      if constexpr (is_enabled<QUDA_QDPJIT_GAUGE_ORDER>())
+      if constexpr (is_enabled(QUDA_QDPJIT_GAUGE_ORDER))
         genericCopyColorSpinor<Ns, Nc, ColorSpinor>(param);
       else
         errorQuda("QDPJIT interface has not been built");
diff --git a/lib/copy_gauge.in.cpp b/lib/copy_gauge.in.cpp
index 24bbf9ddbb..99e03df012 100644
--- a/lib/copy_gauge.in.cpp
+++ b/lib/copy_gauge.in.cpp
@@ -39,7 +39,7 @@ namespace quda {
 
   void checkMomOrder(const GaugeField &u) {
     if (u.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
-      if (u.Reconstruct() != QUDA_RECONSTRUCT_10 and u.Reconstruct() != QUDA_RECONSTRUCT_NO)
+      if (u.Reconstruct() != QUDA_RECONSTRUCT_10 && u.Reconstruct() != QUDA_RECONSTRUCT_NO)
 	errorQuda("Unsuported order %d and reconstruct %d combination", u.Order(), u.Reconstruct());
     } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER || u.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
       if (u.Reconstruct() != QUDA_RECONSTRUCT_NO)
diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu
index d4f3f1a4fe..d3f1eb3baf 100644
--- a/lib/copy_gauge_inc.cu
+++ b/lib/copy_gauge_inc.cu
@@ -299,7 +299,7 @@ namespace quda {
         // momentum only currently supported on MILC (10), TIFR (18) and Float2 (10) fields currently
 	if (out.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
 	  if (in.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
-	    if (in.Reconstruct() == QUDA_RECONSTRUCT_10 and out.Reconstruct() == QUDA_RECONSTRUCT_10) {
+	    if (in.Reconstruct() == QUDA_RECONSTRUCT_10 && out.Reconstruct() == QUDA_RECONSTRUCT_10) {
 	      typedef FloatNOrder<FloatIn,10,2,10> momIn;
 	      typedef FloatNOrder<FloatOut,10,2,10> momOut;
               CopyGaugeArg<FloatOut, FloatIn, 10, fine_grain(), momOut, momIn> arg(momOut(out, Out, 0),
diff --git a/lib/covariant_derivative.cu b/lib/covariant_derivative.cu
index 501448789f..ca85e45727 100644
--- a/lib/covariant_derivative.cu
+++ b/lib/covariant_derivative.cu
@@ -161,7 +161,7 @@ namespace quda
   void ApplyCovDev(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
                    int mu, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_COVDEV_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_COVDEV_DSLASH)) {
       instantiate<CovDevApply>(out, in, in, U, mu, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Covariant derivative kernels have not been built");
diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp
index 62f4f7639e..4ae1f93282 100644
--- a/lib/dirac_coarse.cpp
+++ b/lib/dirac_coarse.cpp
@@ -63,7 +63,7 @@ namespace quda {
     mapped(Y_d->MemType() == QUDA_MEMORY_MAPPED)
   {
 
-    constexpr QudaGaugeFieldOrder gOrder = QUDA_MILC_GAUGE_ORDER;
+    static constexpr QudaGaugeFieldOrder gOrder = QUDA_MILC_GAUGE_ORDER;
 
     auto create_gauge_copy = [](const GaugeField &X) -> auto
     {
diff --git a/lib/dslash5_domain_wall.cu b/lib/dslash5_domain_wall.cu
index aba5db87ca..68eec66565 100644
--- a/lib/dslash5_domain_wall.cu
+++ b/lib/dslash5_domain_wall.cu
@@ -179,7 +179,7 @@ namespace quda
                     cvector_ref<const ColorSpinorField> &x, double m_f, double m_5, const Complex *b_5,
                     const Complex *c_5, double a, bool dagger, Dslash5Type type)
   {
-    if (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       if (in.PCType() != QUDA_4D_PC) errorQuda("Only 4-d preconditioned fields are supported");
       checkLocation(out, in, x); // check all locations match
       instantiate_recurse3<Dslash5>(out, in, x, m_f, m_5, b_5, c_5, a, dagger, type);
diff --git a/lib/dslash5_mobius_eofa.cu b/lib/dslash5_mobius_eofa.cu
index dd1d39636a..b102b6172c 100644
--- a/lib/dslash5_mobius_eofa.cu
+++ b/lib/dslash5_mobius_eofa.cu
@@ -177,7 +177,7 @@ namespace quda
                        const Complex *c_5, double a, int eofa_pm, double inv, double kappa, const double *eofa_u,
                        const double *eofa_x, const double *eofa_y, double sherman_morrison, bool dagger, Dslash5Type type)
     {
-      if constexpr (is_enabled<QUDA_MOBIUS_DWF_EOFA_DSLASH>()) {
+      if constexpr (is_enabled(QUDA_MOBIUS_DWF_EOFA_DSLASH)) {
         checkLocation(out, in, x); // check all locations match
         instantiate_recurse3<Dslash5>(out, in, x, m_f, m_5, b_5, c_5, a, eofa_pm, inv, kappa, eofa_u, eofa_x, eofa_y,
                                       sherman_morrison, dagger, type);
diff --git a/lib/dslash_clover_helper.cu b/lib/dslash_clover_helper.cu
index 2fc342bfb4..a2c50557db 100644
--- a/lib/dslash_clover_helper.cu
+++ b/lib/dslash_clover_helper.cu
@@ -46,7 +46,7 @@ namespace quda {
   void ApplyClover(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    const CloverField &clover, bool inverse, int parity)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_WILSON_DSLASH)) {
       instantiate_recurse2<Clover>(out, in, clover, inverse, parity);
     } else {
       errorQuda("Clover dslash has not been built");
@@ -135,7 +135,7 @@ namespace quda {
                         const CloverField &clover, double kappa, double mu, double epsilon, int parity, int dagger,
                         QudaTwistGamma5Type twist)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_WILSON_DSLASH)) {
       instantiate_recurse2<TwistClover>(out, in, clover, kappa, mu, epsilon, parity, dagger, twist);
     } else {
       errorQuda("Twisted-clover operator has not been built");
diff --git a/lib/dslash_domain_wall_4d.cu b/lib/dslash_domain_wall_4d.cu
index 9ba7d1d689..0bcc093a79 100644
--- a/lib/dslash_domain_wall_4d.cu
+++ b/lib/dslash_domain_wall_4d.cu
@@ -60,7 +60,7 @@ namespace quda
                          cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
                          TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>() || is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH) || is_enabled(QUDA_TWISTED_CLOVER_DSLASH)) {
       instantiate<DomainWall4DApply>(out, in, x, U, a, m_5, b_5, c_5, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Domain-wall dslash has not been built");
diff --git a/lib/dslash_domain_wall_4d_m5inv.cu b/lib/dslash_domain_wall_4d_m5inv.cu
index e739dddadb..cc9193befc 100644
--- a/lib/dslash_domain_wall_4d_m5inv.cu
+++ b/lib/dslash_domain_wall_4d_m5inv.cu
@@ -15,7 +15,7 @@ namespace quda
                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_4d_m5inv_m5inv.cu b/lib/dslash_domain_wall_4d_m5inv_m5inv.cu
index 870c311fb2..ea7559ce81 100644
--- a/lib/dslash_domain_wall_4d_m5inv_m5inv.cu
+++ b/lib/dslash_domain_wall_4d_m5inv_m5inv.cu
@@ -15,7 +15,7 @@ namespace quda
                                    cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS_M5_INV_DAG>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_4d_m5inv_m5pre.cu b/lib/dslash_domain_wall_4d_m5inv_m5pre.cu
index fedfd154b9..cd55315702 100644
--- a/lib/dslash_domain_wall_4d_m5inv_m5pre.cu
+++ b/lib/dslash_domain_wall_4d_m5inv_m5pre.cu
@@ -15,7 +15,7 @@ namespace quda
                                    cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS_M5_PRE>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_4d_m5mob.cu b/lib/dslash_domain_wall_4d_m5mob.cu
index 76527b9321..188c6597fd 100644
--- a/lib/dslash_domain_wall_4d_m5mob.cu
+++ b/lib/dslash_domain_wall_4d_m5mob.cu
@@ -15,7 +15,7 @@ namespace quda
                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_4d_m5pre.cu b/lib/dslash_domain_wall_4d_m5pre.cu
index b9ea9dae29..182269b93b 100644
--- a/lib/dslash_domain_wall_4d_m5pre.cu
+++ b/lib/dslash_domain_wall_4d_m5pre.cu
@@ -15,7 +15,7 @@ namespace quda
                               cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS_PRE>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_4d_m5pre_m5inv.cu b/lib/dslash_domain_wall_4d_m5pre_m5inv.cu
index 33cb13c1b1..eb75138145 100644
--- a/lib/dslash_domain_wall_4d_m5pre_m5inv.cu
+++ b/lib/dslash_domain_wall_4d_m5pre_m5inv.cu
@@ -15,7 +15,7 @@ namespace quda
                                    cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_PRE_MOBIUS_M5_INV>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_4d_m5pre_m5mob.cu b/lib/dslash_domain_wall_4d_m5pre_m5mob.cu
index 7ceabeec6a..ba9c57eccd 100644
--- a/lib/dslash_domain_wall_4d_m5pre_m5mob.cu
+++ b/lib/dslash_domain_wall_4d_m5pre_m5mob.cu
@@ -15,7 +15,7 @@ namespace quda
                                    cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_4D_DSLASH)) {
       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS_PRE_M5_MOB>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
                                             dummy_list, profile);
diff --git a/lib/dslash_domain_wall_5d.cu b/lib/dslash_domain_wall_5d.cu
index 3dce0be391..8998fc7bb6 100644
--- a/lib/dslash_domain_wall_5d.cu
+++ b/lib/dslash_domain_wall_5d.cu
@@ -85,7 +85,7 @@ namespace quda
                          const GaugeField &U, double a, double m_f, cvector_ref<const ColorSpinorField> &x, int parity,
                          bool dagger, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_DOMAIN_WALL_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_DOMAIN_WALL_DSLASH)) {
       instantiate<DomainWall5DApply>(out, in, x, U, a, m_f, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu
index 9d53ffbd32..8052293e53 100644
--- a/lib/dslash_gamma_helper.cu
+++ b/lib/dslash_gamma_helper.cu
@@ -98,7 +98,7 @@ namespace quda {
   void ApplyTwistGamma(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, int d, double kappa,
                        double mu, double epsilon, int dagger, QudaTwistGamma5Type type)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_MASS_DSLASH)) {
       instantiate_recurse2<TwistGammaApply>(out, in, d, kappa, mu, epsilon, dagger, type);
     } else {
       errorQuda("Twisted mass operator has not been built");
@@ -138,7 +138,7 @@ namespace quda {
   // out(x) = tau_1*in
   void ApplyTau(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, int d)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_MASS_DSLASH)) {
       instantiate_recurse2<TauApply>(out, in, d);
     } else {
       errorQuda("Twisted mass operator has not been built");
diff --git a/lib/dslash_improved_staggered.cu b/lib/dslash_improved_staggered.cu
index 586c61b84a..5b33855fb3 100644
--- a/lib/dslash_improved_staggered.cu
+++ b/lib/dslash_improved_staggered.cu
@@ -172,7 +172,7 @@ namespace quda
                               const GaugeField &U, const GaugeField &L, double a, cvector_ref<const ColorSpinorField> &x,
                               int parity, bool dagger, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_ASQTAD_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_ASQTAD_DSLASH)) {
       for (int i = 0; i < 4; i++) {
         if (comm_dim_partitioned(i) && (U.X()[i] < 6)) {
           errorQuda("partitioned dimension with local size less than 6 is not supported in improved staggered dslash");
diff --git a/lib/dslash_ndeg_twisted_clover.cu b/lib/dslash_ndeg_twisted_clover.cu
index 30809f2eda..794fcbc9d7 100644
--- a/lib/dslash_ndeg_twisted_clover.cu
+++ b/lib/dslash_ndeg_twisted_clover.cu
@@ -95,7 +95,7 @@ namespace quda
                               cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
                               TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_CLOVER_DSLASH)) {
       instantiate<NdegTwistedCloverApply>(out, in, x, U, A, a, b, c, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Non-degenerate twisted-clover operator has not been built");
diff --git a/lib/dslash_ndeg_twisted_clover_preconditioned.cu b/lib/dslash_ndeg_twisted_clover_preconditioned.cu
index 71c161804b..e4a5235637 100644
--- a/lib/dslash_ndeg_twisted_clover_preconditioned.cu
+++ b/lib/dslash_ndeg_twisted_clover_preconditioned.cu
@@ -115,7 +115,7 @@ namespace quda
                                             bool xpay, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                             const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_CLOVER_DSLASH)) {
       instantiate<NdegTwistedCloverPreconditionedApply>(out, in, x, U, A, a, b, c, xpay, parity, dagger, comm_override,
                                                         profile);
     } else {
diff --git a/lib/dslash_ndeg_twisted_mass.cu b/lib/dslash_ndeg_twisted_mass.cu
index 6e363e1e91..90deda53da 100644
--- a/lib/dslash_ndeg_twisted_mass.cu
+++ b/lib/dslash_ndeg_twisted_mass.cu
@@ -70,7 +70,7 @@ namespace quda
                             const GaugeField &U, double a, double b, double c, cvector_ref<const ColorSpinorField> &x,
                             int parity, bool dagger, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_MASS_DSLASH)) {
       instantiate<NdegTwistedMassApply>(out, in, x, U, a, b, c, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Non-degenerate twisted-mass operator has not been built");
diff --git a/lib/dslash_ndeg_twisted_mass_preconditioned.cu b/lib/dslash_ndeg_twisted_mass_preconditioned.cu
index 8acd16a730..fc94ac1dee 100644
--- a/lib/dslash_ndeg_twisted_mass_preconditioned.cu
+++ b/lib/dslash_ndeg_twisted_mass_preconditioned.cu
@@ -123,7 +123,7 @@ namespace quda
                                           cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                           bool asymmetric, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_MASS_DSLASH)) {
       instantiate<NdegTwistedMassPreconditionedApply>(out, in, x, U, a, b, c, xpay, parity, dagger, asymmetric,
                                                       comm_override, profile);
     } else {
diff --git a/lib/dslash_policy.hpp b/lib/dslash_policy.hpp
index f7d024085f..7c66262e5f 100644
--- a/lib/dslash_policy.hpp
+++ b/lib/dslash_policy.hpp
@@ -1,3 +1,4 @@
+#include <array>
 #include <memory>
 #include <tune_quda.h>
 #include <index_helper.cuh>
@@ -481,11 +482,11 @@ namespace quda
       const int packIndex = device::get_default_stream_idx();
       constexpr MemoryLocation location = static_cast<MemoryLocation>(Shmem);
 
-      if (!((shmem & 2) and (shmem & 1))) {
+      if (!((shmem & 2) && (shmem & 1))) {
         issuePack(halo, in, dslash, 1 - dslashParam.parity, location, packIndex, shmem);
       }
 
-      dslash.setPack(((shmem & 2) or (shmem & 1)), location); // enable fused kernel packing
+      dslash.setPack(((shmem & 2) || (shmem & 1)), location); // enable fused kernel packing
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
 
diff --git a/lib/dslash_staggered.cu b/lib/dslash_staggered.cu
index 566c1cbcda..87f43ab507 100644
--- a/lib/dslash_staggered.cu
+++ b/lib/dslash_staggered.cu
@@ -59,7 +59,7 @@ namespace quda
 
       if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC
           || (U.LinkType() == QUDA_GENERAL_LINKS && U.Reconstruct() == QUDA_RECONSTRUCT_NO)) {
-        if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_MILC_GAUGE_ORDER)) {
           StaggeredArg<Float, nColor, nDim, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_MILC> arg(
             out, in, halo, U, U, a, x, parity, dagger, comm_override);
           Staggered<decltype(arg)> staggered(arg, out, in, halo);
@@ -69,7 +69,7 @@ namespace quda
           errorQuda("MILC interface has not been built so MILC phase staggered fermions not enabled");
         }
       } else if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
-        if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_TIFR_GAUGE_ORDER)) {
           StaggeredArg<Float, nColor, nDim, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_TIFR> arg(
             out, in, halo, U, U, a, x, parity, dagger, comm_override);
           Staggered<decltype(arg)> staggered(arg, out, in, halo);
@@ -86,7 +86,7 @@ namespace quda
                       double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                       const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       instantiate<StaggeredApply, ReconstructStaggered>(out, in, x, U, a, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Staggered operator has not been built");
diff --git a/lib/dslash_twisted_clover.cu b/lib/dslash_twisted_clover.cu
index b48e137f05..8afc9b7587 100644
--- a/lib/dslash_twisted_clover.cu
+++ b/lib/dslash_twisted_clover.cu
@@ -89,7 +89,7 @@ namespace quda
                           cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
                           TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_CLOVER_DSLASH)) {
       instantiate<TwistedCloverApply>(out, in, x, U, C, a, b, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Twisted-clover operator has not been built");
diff --git a/lib/dslash_twisted_clover_preconditioned.cu b/lib/dslash_twisted_clover_preconditioned.cu
index 91f7f147f8..944d91aeb0 100644
--- a/lib/dslash_twisted_clover_preconditioned.cu
+++ b/lib/dslash_twisted_clover_preconditioned.cu
@@ -137,7 +137,7 @@ namespace quda
                                         cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                         const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_CLOVER_DSLASH)) {
       instantiate<TwistedCloverPreconditionedApply>(out, in, x, U, C, a, b, xpay, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Twisted-clover operator has not been built");
diff --git a/lib/dslash_twisted_mass.cu b/lib/dslash_twisted_mass.cu
index cc3d2c256a..22a46c3f29 100644
--- a/lib/dslash_twisted_mass.cu
+++ b/lib/dslash_twisted_mass.cu
@@ -72,7 +72,7 @@ namespace quda
                         const GaugeField &U, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity,
                         bool dagger, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_MASS_DSLASH)) {
       instantiate<TwistedMassApply>(out, in, x, U, a, b, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Twisted-mass operator has not been built");
diff --git a/lib/dslash_twisted_mass_preconditioned.cu b/lib/dslash_twisted_mass_preconditioned.cu
index 43addab081..5980ff8f67 100644
--- a/lib/dslash_twisted_mass_preconditioned.cu
+++ b/lib/dslash_twisted_mass_preconditioned.cu
@@ -105,7 +105,7 @@ namespace quda
                                       cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, bool asymmetric,
                                       const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_TWISTED_MASS_DSLASH)) {
       instantiate<TwistedMassPreconditionedApply>(out, in, x, U, a, b, xpay, parity, dagger, asymmetric, comm_override,
                                                   profile);
     } else {
diff --git a/lib/dslash_wilson.cu b/lib/dslash_wilson.cu
index 0dc7f8ebb1..4c6f271d05 100644
--- a/lib/dslash_wilson.cu
+++ b/lib/dslash_wilson.cu
@@ -11,7 +11,7 @@ namespace quda
                    TimeProfile &profile)
   {
     if (in.Ndim() == 5) errorQuda("Unexpected nDim = 5");
-    if constexpr (is_enabled<QUDA_WILSON_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_WILSON_DSLASH)) {
       auto dummy = DistanceType<false>();
       instantiate<WilsonApply>(out, in, x, U, a, 0, -1, parity, dagger, comm_override, dummy, profile);
     } else {
diff --git a/lib/dslash_wilson_clover.cu b/lib/dslash_wilson_clover.cu
index 96413eeba8..ab2c32f2b3 100644
--- a/lib/dslash_wilson_clover.cu
+++ b/lib/dslash_wilson_clover.cu
@@ -14,7 +14,7 @@ namespace quda
                          const GaugeField &U, const CloverField &A, double a, cvector_ref<const ColorSpinorField> &x,
                          int parity, bool dagger, const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_WILSON_DSLASH)) {
       auto dummy = DistanceType<false>();
       instantiate<WilsonCloverApply>(out, in, x, U, A, a, 0, -1, parity, dagger, comm_override, dummy, profile);
     } else {
diff --git a/lib/dslash_wilson_clover_distance.cu b/lib/dslash_wilson_clover_distance.cu
index 8ff71313da..64638a62f8 100644
--- a/lib/dslash_wilson_clover_distance.cu
+++ b/lib/dslash_wilson_clover_distance.cu
@@ -17,7 +17,7 @@ namespace quda
                                  cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                  const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>() && is_enabled_distance_precondition()) {
+    if constexpr (is_enabled(QUDA_CLOVER_WILSON_DSLASH) && is_enabled_distance_precondition()) {
       auto dummy = DistanceType<true>();
       instantiate<WilsonCloverApply>(out, in, x, U, A, a, alpha0, t0, parity, dagger, comm_override, dummy, profile);
     } else {
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist.cu b/lib/dslash_wilson_clover_hasenbusch_twist.cu
index 3d09cc5af6..f85117fd9a 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist.cu
+++ b/lib/dslash_wilson_clover_hasenbusch_twist.cu
@@ -96,7 +96,7 @@ namespace quda
                                         cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                         const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH)) {
       instantiate<WilsonCloverHasenbuschTwistApply>(out, in, x, U, A, a, b, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Clover Hasensbuch Twist operator has not been built");
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
index e0938ebe7a..1059fae0db 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
@@ -144,7 +144,7 @@ namespace quda
                                                    cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                                    const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH)) {
       instantiate<WilsonCloverHasenbuschTwistPCNoClovInvApply>(out, in, x, U, A, a, b, parity, dagger, comm_override,
                                                                profile);
     } else {
@@ -290,7 +290,7 @@ namespace quda
                                                  cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                                  const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH)) {
       instantiate<WilsonCloverHasenbuschTwistPCClovInvApply>(out, in, x, U, A, a, b, parity, dagger, comm_override,
                                                              profile);
     } else {
diff --git a/lib/dslash_wilson_clover_preconditioned.cu b/lib/dslash_wilson_clover_preconditioned.cu
index 6102d9c2e2..7309371123 100644
--- a/lib/dslash_wilson_clover_preconditioned.cu
+++ b/lib/dslash_wilson_clover_preconditioned.cu
@@ -15,7 +15,7 @@ namespace quda
                                        cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                        const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_CLOVER_WILSON_DSLASH)) {
       auto dummy = DistanceType<false>();
       instantiate<WilsonCloverPreconditionedApply>(out, in, x, U, A, a, 0.0, -1, parity, dagger, comm_override, dummy,
                                                    profile);
diff --git a/lib/dslash_wilson_clover_preconditioned_distance.cu b/lib/dslash_wilson_clover_preconditioned_distance.cu
index eac8ed1ff9..d974cf3260 100644
--- a/lib/dslash_wilson_clover_preconditioned_distance.cu
+++ b/lib/dslash_wilson_clover_preconditioned_distance.cu
@@ -18,7 +18,7 @@ namespace quda
                                                cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                                const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_CLOVER_WILSON_DSLASH>() && is_enabled_distance_precondition()) {
+    if constexpr (is_enabled(QUDA_CLOVER_WILSON_DSLASH) && is_enabled_distance_precondition()) {
       auto dummy = DistanceType<true>();
       instantiate<WilsonCloverPreconditionedApply>(out, in, x, U, A, a, alpha0, t0, parity, dagger, comm_override,
                                                    dummy, profile);
diff --git a/lib/dslash_wilson_distance.cu b/lib/dslash_wilson_distance.cu
index 414924f992..a531d58c59 100644
--- a/lib/dslash_wilson_distance.cu
+++ b/lib/dslash_wilson_distance.cu
@@ -13,7 +13,7 @@ namespace quda
                            int parity, bool dagger, const int *comm_override, TimeProfile &profile)
   {
     if (in.Ndim() == 5) errorQuda("Unexpected nDim = 5");
-    if constexpr (is_enabled<QUDA_WILSON_DSLASH>() && is_enabled_distance_precondition()) {
+    if constexpr (is_enabled(QUDA_WILSON_DSLASH) && is_enabled_distance_precondition()) {
       auto dummy = DistanceType<true>();
       instantiate<WilsonApply>(out, in, x, U, a, alpha0, t0, parity, dagger, comm_override, dummy, profile);
     } else {
diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp
index 6e73da086f..2f972ad911 100644
--- a/lib/eig_block_trlm.cpp
+++ b/lib/eig_block_trlm.cpp
@@ -440,12 +440,12 @@ namespace quda
     // Use Sum of all beta values in the final block for
     // the convergence condition
     double beta_sum = 0;
-    for (int i = 0; i < block_data_length; i++) beta_sum += fabs(block_beta[n_kr * block_size - block_data_length + i]);
+    for (int i = 0; i < block_data_length; i++) beta_sum += abs(block_beta[n_kr * block_size - block_data_length + i]);
 
     for (int i = 0; i < blocks; i++) {
       for (int b = 0; b < block_size; b++) {
         idx = b * (block_size + 1);
-        residua[i * block_size + b + num_locked] = fabs(beta_sum * block_ritz_mat[dim * (i * block_size + b + 1) - 1]);
+        residua[i * block_size + b + num_locked] = abs(beta_sum * block_ritz_mat[dim * (i * block_size + b + 1) - 1]);
       }
     }
 
diff --git a/lib/extract_gauge_ghost.in.cu b/lib/extract_gauge_ghost.in.cu
index f0f4fe27c1..6206912c21 100644
--- a/lib/extract_gauge_ghost.in.cu
+++ b/lib/extract_gauge_ghost.in.cu
@@ -19,28 +19,28 @@ namespace quda {
           using G = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type;
           ExtractGhost<Float, nColor, G>(u, Ghost, extract, offset);
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_12>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_12)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type;
             ExtractGhost<Float, nColor, G>(u, Ghost, extract, offset);
           } else {
             errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_12", QUDA_RECONSTRUCT);
           }
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_8>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_8)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type;
             ExtractGhost<Float, nColor, G>(u, Ghost, extract, offset);
           } else {
             errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_8", QUDA_RECONSTRUCT);
           }
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_13>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_13)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type;
             ExtractGhost<Float, nColor, G>(u, Ghost, extract, offset);
           } else {
             errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_13", QUDA_RECONSTRUCT);
           }
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_9>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_9)) {
             if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) {
               using G = typename gauge_mapper<Float, QUDA_RECONSTRUCT_9, 18, QUDA_STAGGERED_PHASE_MILC>::type;
               ExtractGhost<Float, nColor, G>(u, Ghost, extract, offset);
@@ -56,7 +56,7 @@ namespace quda {
         }
       } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_QDP_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_QDP_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, QDPOrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("QDP interface has not been built");
@@ -64,7 +64,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_QDPJIT_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_QDPJIT_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, QDPJITOrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("QDPJIT interface has not been built");
@@ -72,7 +72,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_CPS_WILSON_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_CPS_WILSON_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, CPSOrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("CPS interface has not been built");
@@ -80,7 +80,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_MILC_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, MILCOrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("MILC interface has not been built");
@@ -88,7 +88,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_BQCD_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_BQCD_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, BQCDOrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("BQCD interface has not been built");
@@ -96,7 +96,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_TIFR_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, TIFROrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("TIFR interface has not been built");
@@ -104,7 +104,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_TIFR_GAUGE_ORDER)) {
           ExtractGhost<Float, nColor, TIFRPaddedOrder<Float,length>>(u, Ghost, extract, offset);
         } else {
           errorQuda("TIFR interface has not been built");
diff --git a/lib/extract_gauge_ghost_extended.cu b/lib/extract_gauge_ghost_extended.cu
index 37c4f7235a..6df28cf779 100644
--- a/lib/extract_gauge_ghost_extended.cu
+++ b/lib/extract_gauge_ghost_extended.cu
@@ -66,28 +66,28 @@ namespace quda {
           using G = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type;
           ExtractGhostEx<G>(u, dim, R, ghost, extract);
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_12>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_12)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type;
             ExtractGhostEx<G>(u, dim, R, ghost, extract);
           } else {
             errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_12", QUDA_RECONSTRUCT);
           }
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_8>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_8)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type;
             ExtractGhostEx<G>(u, dim, R, ghost, extract);
           } else {
             errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_8", QUDA_RECONSTRUCT);
           }
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_13>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_13)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type;
             ExtractGhostEx<G>(u, dim, R, ghost, extract);
           } else {
             errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_13", QUDA_RECONSTRUCT);
           }
         } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
-          if constexpr (is_enabled<QUDA_RECONSTRUCT_9>()) {
+          if constexpr (is_enabled(QUDA_RECONSTRUCT_9)) {
             using G = typename gauge_mapper<Float,QUDA_RECONSTRUCT_9>::type;
             ExtractGhostEx<G>(u, dim, R, ghost, extract);
           } else {
@@ -96,7 +96,7 @@ namespace quda {
         }
       } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_QDP_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_QDP_GAUGE_ORDER)) {
           ExtractGhostEx<QDPOrder<Float,length>>(u, dim, R, ghost, extract);
         } else {
           errorQuda("QDP interface has not been built");
@@ -104,7 +104,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_QDPJIT_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_QDPJIT_GAUGE_ORDER)) {
           ExtractGhostEx<QDPJITOrder<Float,length>>(u, dim, R, ghost, extract);
         } else {
           errorQuda("QDPJIT interface has not been built");
@@ -112,7 +112,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_CPS_WILSON_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_CPS_WILSON_GAUGE_ORDER)) {
           ExtractGhostEx<CPSOrder<Float,length>>(u, dim, R, ghost, extract);
         } else {
           errorQuda("CPS interface has not been built");
@@ -120,7 +120,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_MILC_GAUGE_ORDER)) {
           ExtractGhostEx<MILCOrder<Float,length>>(u, dim, R, ghost, extract);
         } else {
           errorQuda("MILC interface has not been built");
@@ -128,7 +128,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_BQCD_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_BQCD_GAUGE_ORDER)) {
           ExtractGhostEx<BQCDOrder<Float,length>>(u, dim, R, ghost, extract);
         } else {
           errorQuda("BQCD interface has not been built");
@@ -136,7 +136,7 @@ namespace quda {
 
       } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {
 
-        if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
+        if constexpr (is_enabled(QUDA_TIFR_GAUGE_ORDER)) {
           ExtractGhostEx<TIFROrder<Float,length>>(u, dim, R, ghost, extract);
         } else {
           errorQuda("TIFR interface has not been built");
diff --git a/lib/extract_gauge_ghost_mg.in.cu b/lib/extract_gauge_ghost_mg.in.cu
index 962b40bc05..c2b9d86454 100644
--- a/lib/extract_gauge_ghost_mg.in.cu
+++ b/lib/extract_gauge_ghost_mg.in.cu
@@ -19,7 +19,7 @@ namespace quda {
       ExtractGhost<storeFloat, Nc, G>(u, Ghost, extract, offset);
     } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {
       
-      if constexpr (is_enabled<QUDA_QDP_GAUGE_ORDER>()) {
+      if constexpr (is_enabled(QUDA_QDP_GAUGE_ORDER)) {
         using G = typename gauge::FieldOrder<Float,Nc,1,QUDA_QDP_GAUGE_ORDER,true,storeFloat>;
         ExtractGhost<storeFloat, Nc, G>(u, Ghost, extract, offset);
       } else {
diff --git a/lib/hisq_paths_force_quda.cu b/lib/hisq_paths_force_quda.cu
index 5bbdf8fa7e..51fb7bad31 100644
--- a/lib/hisq_paths_force_quda.cu
+++ b/lib/hisq_paths_force_quda.cu
@@ -546,7 +546,7 @@ namespace quda {
 
     void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff_array[6])
     {
-      if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+      if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
         getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
         checkNative(link, oprod, newOprod);
         checkLocation(newOprod, oprod, link);
@@ -640,7 +640,7 @@ namespace quda {
 
     void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oldOprod, const GaugeField &link, double coeff)
     {
-      if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+      if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
         getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
         checkNative(link, oldOprod, newOprod);
         checkLocation(newOprod, oldOprod, link);
@@ -713,7 +713,7 @@ namespace quda {
 
     void hisqCompleteForce(GaugeField &force, const GaugeField &link)
     {
-      if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+      if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
         getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
         checkNative(link, force);
         checkLocation(force, link);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index e5b678acfd..8729418617 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -1,9 +1,9 @@
+#define _INTERFACE_
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
-#include <sys/time.h>
 
 #include <quda.h>
 #include <quda_internal.h>
@@ -436,7 +436,7 @@ static void init_default_comms()
 }
 
 
-extern char* gitversion;
+extern const char* gitversion;
 
 /*
  * Set the device that QUDA uses.
@@ -1466,7 +1466,7 @@ namespace quda {
       diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC;
       diracParam.Ls = inv_param->Ls;
       // check we are safe to cast into a Complex (= std::complex<double>)
-      static_assert(sizeof(Complex) == sizeof(double _Complex),
+      static_assert(sizeof(Complex) == sizeof(double_complex),
                     "Irreconcilable difference between interface and internal complex number conventions");
 
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
@@ -1482,7 +1482,7 @@ namespace quda {
 	errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS);
       diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_DIRAC;
       diracParam.Ls = inv_param->Ls;
-      if (sizeof(Complex) != sizeof(double _Complex)) {
+      if (sizeof(Complex) != sizeof(double_complex)) {
         errorQuda("Irreconcilable difference between interface and internal complex number conventions");
       }
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
@@ -2353,9 +2353,9 @@ namespace quda
   bool canReuseResidentGauge(QudaInvertParam *param)
   {
     if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
-      return (gaugePrecise != nullptr) and param->cuda_prec == gaugePrecise->Precision();
+      return (gaugePrecise != nullptr) && param->cuda_prec == gaugePrecise->Precision();
     } else {
-      return (gaugeFatPrecise != nullptr) and param->cuda_prec == gaugeFatPrecise->Precision();
+      return (gaugeFatPrecise != nullptr) && param->cuda_prec == gaugeFatPrecise->Precision();
     }
   }
 
@@ -2521,7 +2521,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   popVerbosity();
 }
 
-void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param)
+void eigensolveQuda(void **host_evecs, double_complex *host_evals, QudaEigParam *eig_param)
 {
   if (!initialized) errorQuda("QUDA not initialized");
 
@@ -4898,7 +4898,7 @@ void polyakovLoopQuda(double ploop[2], int dir)
   ploop[1] = obsParam.ploop[1];
 }
 
-void computeGaugeLoopTraceQuda(double _Complex *traces, int **input_path_buf, int *path_length, double *loop_coeff,
+void computeGaugeLoopTraceQuda(double_complex *traces, int **input_path_buf, int *path_length, double *loop_coeff,
                                int num_paths, int max_length, double factor)
 {
   if (!gaugePrecise) errorQuda("Cannot compute gauge loop traces as there is no resident gauge field");
@@ -5611,7 +5611,7 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param)
   gaugeObservables(*gauge, *param);
 }
 
-static void check_param(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec,
+static void check_param(double_complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec,
                         int n_evec, int tile_evec, QudaInvertParam *inv_param, const int X[4])
 {
   if (host_sinks == nullptr) errorQuda("Invalid host_sink ptr");
@@ -5628,7 +5628,7 @@ static void check_param(double _Complex *host_sinks, void **host_quark, int n_qu
     if (X[i] < 1 || X[i] > 512) errorQuda("Invalid lattice dimension %d", i);
 }
 
-void laphSinkProject(double _Complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec,
+void laphSinkProject(double_complex *host_sinks, void **host_quark, int n_quark, int tile_quark, void **host_evec,
                      int n_evec, int tile_evec, QudaInvertParam *inv_param, const int X[4])
 {
   auto profile = pushProfile(profileSinkProject, inv_param);
diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp
index 86b6429532..c8c05d4e78 100644
--- a/lib/inv_ca_cg.cpp
+++ b/lib/inv_ca_cg.cpp
@@ -536,7 +536,7 @@ namespace quda
           warningQuda(
             "CA-CG: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
             sqrt(r2[0]), sqrt(r2_old[0]), resIncreaseTotal);
-          if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
+          if (resIncrease > maxResIncrease || resIncreaseTotal > maxResIncreaseTotal) {
             warningQuda("CA-CG: solver exiting due to too many true residual norm increases");
             break;
           }
diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp
index aa41c9409b..9456ce8f7f 100644
--- a/lib/inv_ca_gcr.cpp
+++ b/lib/inv_ca_gcr.cpp
@@ -344,7 +344,7 @@ namespace quda
           warningQuda(
             "CA-GCR: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
             sqrt(r2[0]), sqrt(r2_old[0]), resIncreaseTotal);
-          if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
+          if (resIncrease > maxResIncrease || resIncreaseTotal > maxResIncreaseTotal) {
             warningQuda("CA-GCR: solver exiting due to too many true residual norm increases");
             break;
           }
diff --git a/lib/inv_cg3_quda.cpp b/lib/inv_cg3_quda.cpp
index 75d6cc93d9..d61dacfa25 100644
--- a/lib/inv_cg3_quda.cpp
+++ b/lib/inv_cg3_quda.cpp
@@ -177,7 +177,7 @@ namespace quda {
         if (convergence(r2, heavy_quark_res, stop, stop_hq) && param.delta >= param.tol) update = true;
 
         // For heavy-quark inversion force a reliable update if we continue after
-        if (use_heavy_quark_res and L2breakdown and convergenceHQ(heavy_quark_res, stop_hq) and param.delta >= param.tol) {
+        if (use_heavy_quark_res && L2breakdown && convergenceHQ(heavy_quark_res, stop_hq) && param.delta >= param.tol) {
           update = true;
         }
 
@@ -216,7 +216,7 @@ namespace quda {
           warningQuda(
             "CG3: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
             sqrt(r2[0]), r0Norm, resIncreaseTotal);
-          if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
+          if (resIncrease > maxResIncrease || resIncreaseTotal > maxResIncreaseTotal) {
             if (use_heavy_quark_res) {
               L2breakdown = true;
             } else {
@@ -229,7 +229,7 @@ namespace quda {
         }
 
         // if L2 broke down we turn off reliable updates and restart the CG
-        if (use_heavy_quark_res and L2breakdown) {
+        if (use_heavy_quark_res && L2breakdown) {
           delta = 0;
           heavy_quark_check = 1;
           warningQuda("CG3: Restarting without reliable updates for heavy-quark residual");
@@ -267,7 +267,7 @@ namespace quda {
           warningQuda(
             "CG3: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
             sqrt(r2[0]), r0Norm, resIncreaseTotal);
-          if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
+          if (resIncrease > maxResIncrease || resIncreaseTotal > maxResIncreaseTotal) {
             warningQuda("CG3: solver exiting due to too many true residual norm increases");
             break;
           }
diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp
index 43df31e1b2..38652d6adf 100644
--- a/lib/inv_cg_quda.cpp
+++ b/lib/inv_cg_quda.cpp
@@ -173,7 +173,7 @@ namespace quda {
 
     vector<double> r2_old(r2.size(), 0.0);
     for (auto i = 0u; i < b.size(); i++) {
-      if (r2_old_init[i] != 0.0 and !p_init[i].empty()) {
+      if (r2_old_init[i] != 0.0 && !p_init[i].empty()) {
         // FIXME vectorize this
         r2_old[i] = r2_old_init[i];
         Complex rp = blas::cDotProduct(r_sloppy[i], x_update_batch[i].get_current_field()) / (r2[i]);
diff --git a/lib/inv_eigcg_quda.cpp b/lib/inv_eigcg_quda.cpp
index 9961f8ef8e..7cbe6daa00 100644
--- a/lib/inv_eigcg_quda.cpp
+++ b/lib/inv_eigcg_quda.cpp
@@ -497,7 +497,7 @@ namespace quda {
 
       PrintStats("eigCG", k, r2, b2, heavy_quark_res);
       // check convergence, if convergence is satisfied we only need to check that we had a reliable update for the heavy quarks recently
-      converged = convergence(r2, heavy_quark_res, args.global_stop, param.tol_hq) or convergence(r2, heavy_quark_res, local_stop, param.tol_hq);
+      converged = convergence(r2, heavy_quark_res, args.global_stop, param.tol_hq) || convergence(r2, heavy_quark_res, local_stop, param.tol_hq);
     }
 
     args.ResetArgs();//eigCG cycle finished, this cleans V2k as well
diff --git a/lib/inv_gcr_quda.cpp b/lib/inv_gcr_quda.cpp
index 8e9c1df32e..b44e4b654a 100644
--- a/lib/inv_gcr_quda.cpp
+++ b/lib/inv_gcr_quda.cpp
@@ -11,7 +11,13 @@
 #include <util_quda.h>
 #include <color_spinor_field.h>
 
+#ifdef _MSC_VER
+#define NOMINMAX
+#include <winsock.h>
+#undef NOMINMAX
+#else
 #include <sys/time.h>
+#endif
 
 namespace quda {
 
@@ -378,7 +384,7 @@ namespace quda {
           warningQuda(
             "GCR: new reliable residual norm %e is greater than previous reliable residual norm %e (total #inc %i)",
             sqrt(r2[0]), sqrt(r2_old[0]), resIncreaseTotal);
-          if (resIncrease > maxResIncrease or resIncreaseTotal > maxResIncreaseTotal) {
+          if (resIncrease > maxResIncrease || resIncreaseTotal > maxResIncreaseTotal) {
             warningQuda("GCR: solver exiting due to too many true residual norm increases");
             break;
           }
diff --git a/lib/inv_multi_cg_quda.cpp b/lib/inv_multi_cg_quda.cpp
index dba0e11f4e..392303a2dc 100644
--- a/lib/inv_multi_cg_quda.cpp
+++ b/lib/inv_multi_cg_quda.cpp
@@ -359,7 +359,7 @@ namespace quda {
           warningQuda("Shift %d, updated residual %e is greater than previous residual %e (total #inc %i)",
                       reliable_shift, sqrt(r2[reliable_shift]), r0Norm[reliable_shift], resIncreaseTotal[reliable_shift]);
 
-          if (resIncrease > maxResIncrease or resIncreaseTotal[reliable_shift] > maxResIncreaseTotal) {
+          if (resIncrease > maxResIncrease || resIncreaseTotal[reliable_shift] > maxResIncreaseTotal) {
             warningQuda("solver exiting due to too many true residual norm increases");
             break;
           }
@@ -409,7 +409,7 @@ namespace quda {
       num_offset_now -= converged;
 
       // exit early so that we can finish of shift 0 using CG and allowing for mixed precison refinement
-      if ( (mixed || zero_refinement) and param.compute_true_res and num_offset_now==1) {
+      if ( (mixed || zero_refinement) && param.compute_true_res && num_offset_now==1) {
         exit_early=true;
         num_offset_now--;
       }
@@ -449,7 +449,7 @@ namespace quda {
         // only calculate true residual if we need to:
         // 1.) For higher shifts if we did not use mixed precision
         // 2.) For shift 0 if we did not exit early  (we went to the full solution)
-        if ( (i > 0 and not mixed) or (i == 0 and not exit_early) ) {
+        if ( (i > 0 && ! mixed) || (i == 0 && ! exit_early) ) {
           mat(r, x[i]);
           if (r.Nspin() == 4) {
             blas::axpy(offset[i], x[i], r); // Offset it.
diff --git a/lib/laplace.cu b/lib/laplace.cu
index aab6dd2d8e..f8270b9f4b 100644
--- a/lib/laplace.cu
+++ b/lib/laplace.cu
@@ -178,7 +178,7 @@ namespace quda
                     int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                     const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_LAPLACE_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_LAPLACE_DSLASH)) {
       instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, dagger, comm_override, profile);
     } else {
       errorQuda("Laplace operator has not been enabled");
diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu
index a5feb80bb6..8d3151f852 100644
--- a/lib/llfat_quda.cu
+++ b/lib/llfat_quda.cu
@@ -169,7 +169,7 @@ namespace quda {
 
   void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       computeLongLink(lng, u, coeff[1]);
       getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
@@ -180,7 +180,7 @@ namespace quda {
 
   void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
 
       GaugeFieldParam gParam(u);
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index 23473eaa4b..3ea531e5c8 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -1,3 +1,4 @@
+#define _INTERFACE_
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>
@@ -742,7 +743,7 @@ void qudaGaugeLoopTracePhased(int precision, double *traces, int **input_path_bu
 
   QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam();
   obsParam.compute_gauge_loop_trace = QUDA_BOOLEAN_TRUE;
-  obsParam.traces = reinterpret_cast<double _Complex *>(traces);
+  obsParam.traces = reinterpret_cast<double_complex *>(traces);
   obsParam.input_path_buff = input_path_buf;
   obsParam.path_length = path_length;
   obsParam.loop_coeff = loop_coeff;
@@ -820,7 +821,7 @@ void qudaGaugeMeasurementsPhased(int precision, double plaq[3], double ploop[2],
   obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE;
   obsParam.compute_polyakov_loop = QUDA_BOOLEAN_TRUE;
   obsParam.compute_gauge_loop_trace = QUDA_BOOLEAN_TRUE;
-  obsParam.traces = reinterpret_cast<double _Complex *>(traces);
+  obsParam.traces = reinterpret_cast<double_complex *>(traces);
   obsParam.input_path_buff = input_path_buf;
   obsParam.path_length = path_length;
   obsParam.loop_coeff = loop_coeff;
diff --git a/lib/quda.def b/lib/quda.def
new file mode 100644
index 0000000000..98be04a3e4
--- /dev/null
+++ b/lib/quda.def
@@ -0,0 +1,88 @@
+LIBRARY quda
+; Functions exported by name from the quda library.
+; List each symbol exactly once: repeated entries trigger linker warning LNK4197.
+EXPORTS
+    setVerbosityQuda
+    initCommsGridQuda
+    initQudaDevice
+    initQudaMemory
+    initQuda
+    endQuda
+    updateR
+    newQudaGaugeParam
+    newQudaInvertParam
+    newQudaMultigridParam
+    newQudaEigParam
+    newQudaGaugeObservableParam
+    newQudaGaugeSmearParam
+    newQudaBLASParam
+    printQudaGaugeParam
+    printQudaInvertParam
+    printQudaMultigridParam
+    printQudaEigParam
+    printQudaGaugeObservableParam
+    printQudaBLASParam
+    loadGaugeQuda
+    freeGaugeQuda
+    freeUniqueGaugeQuda
+    freeGaugeSmearedQuda
+    freeGaugeTwoLinkQuda
+    saveGaugeQuda
+    loadCloverQuda
+    freeCloverQuda
+    eigensolveQuda
+    invertQuda
+    invertMultiSrcQuda
+    invertMultiShiftQuda
+    newMultigridQuda
+    destroyMultigridQuda
+    updateMultigridQuda
+    dumpMultigridQuda
+    dslashQuda
+    covDevQuda
+    shiftQuda
+    spinTasteQuda
+    dslashMultiSrcQuda
+    cloverQuda
+    MatQuda
+    MatDagMatQuda
+    computeKSLinkQuda
+    computeTwoLinkQuda
+    momResidentQuda
+    computeGaugeForceQuda
+    computeGaugePathQuda
+    computeGaugeLoopTraceQuda
+    updateGaugeFieldQuda
+    staggeredPhaseQuda
+    projectSU3Quda
+    momActionQuda
+    createGaugeFieldQuda
+    saveGaugeFieldQuda
+    destroyGaugeFieldQuda
+    createCloverQuda
+    computeCloverForceQuda
+    computeTMCloverForceQuda
+    computeStaggeredForceQuda
+    computeHISQForceQuda
+    gaussGaugeQuda
+    gaussMomQuda
+    plaqQuda
+    polyakovLoopQuda
+    copyExtendedResidentGaugeQuda
+    performWuppertalnStep
+    performGaugeSmearQuda
+    performWFlowQuda
+    performGFlowQuda
+    gaugeObservablesQuda
+    contractQuda
+    contractFTQuda
+    computeGaugeFixingOVRQuda
+    computeGaugeFixingFFTQuda
+    blasGEMMQuda
+    blasLUInvQuda
+    flushChronoQuda
+    newDeflationQuda
+    destroyDeflationQuda
+    setMPICommHandleQuda
+    performTwoLinkGaussianSmearNStep
+    laphSinkProject
diff --git a/lib/solve.cpp b/lib/solve.cpp
index c9ba9baf21..10fb049251 100644
--- a/lib/solve.cpp
+++ b/lib/solve.cpp
@@ -279,7 +279,7 @@ namespace quda
                   basis.size());
       }
 
-      if (not param.chrono_replace_last) {
+      if (!param.chrono_replace_last) {
         // if we have not filled the space yet just augment
         if ((int)basis.size() < param.chrono_max_dim) {
           ColorSpinorParam cs_param(out[0]);
diff --git a/lib/spin_taste.cu b/lib/spin_taste.cu
index 345bf57ecf..e2846b0c9d 100644
--- a/lib/spin_taste.cu
+++ b/lib/spin_taste.cu
@@ -72,7 +72,7 @@ namespace quda
 
   void applySpinTaste(ColorSpinorField &out, const ColorSpinorField &in, QudaSpinTasteGamma gamma)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       instantiate<SpinTastePhase_>(out, in, gamma);
     } else {
       errorQuda("Staggered operator has not been built");
diff --git a/lib/staggered_kd_apply_xinv.cu b/lib/staggered_kd_apply_xinv.cu
index f8e00604f3..fd54462245 100644
--- a/lib/staggered_kd_apply_xinv.cu
+++ b/lib/staggered_kd_apply_xinv.cu
@@ -65,7 +65,7 @@ namespace quda {
   void ApplyStaggeredKahlerDiracInverse(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                                         const GaugeField &Xinv, bool dagger)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>() && is_enabled_multigrid()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH) && is_enabled_multigrid()) {
       // Instantiate based on precision, number of colors
       instantiate_recurse2<StaggeredKDBlock>(out, in, Xinv, dagger);
     } else {
diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu
index 39569cece9..b8549f7696 100644
--- a/lib/staggered_oprod.cu
+++ b/lib/staggered_oprod.cu
@@ -113,7 +113,7 @@ namespace quda {
 
   void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       if (nFace == 1) {
         computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
diff --git a/lib/staggered_quark_smearing.cu b/lib/staggered_quark_smearing.cu
index a24944fed5..48b36fd38c 100644
--- a/lib/staggered_quark_smearing.cu
+++ b/lib/staggered_quark_smearing.cu
@@ -211,7 +211,7 @@ namespace quda
                             const GaugeField &U, int t0, bool is_tslice_kernel, int parity, int dir, bool dagger,
                             const int *comm_override, TimeProfile &profile)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       // Local lattice size should be bigger than or equal to 6 in every partitioned direction.
       for (int i = 0; i < 4; i++) {
         if (comm_dim_partitioned(i) && (U.X()[i] < 6)) {
diff --git a/lib/staggered_two_link_quda.cu b/lib/staggered_two_link_quda.cu
index 1fd9fa356c..2b66699fbd 100644
--- a/lib/staggered_two_link_quda.cu
+++ b/lib/staggered_two_link_quda.cu
@@ -38,7 +38,7 @@ namespace quda
 
   void computeTwoLink(GaugeField &twoLink, const GaugeField &link)
   {
-    if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+    if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
       getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(twoLink, link);
       checkLocation(twoLink, link);
diff --git a/lib/targets/cuda/device.cpp b/lib/targets/cuda/device.cpp
index 69955f4dc5..38779eba09 100644
--- a/lib/targets/cuda/device.cpp
+++ b/lib/targets/cuda/device.cpp
@@ -97,7 +97,7 @@ namespace quda
         }
       }
       // d) QUDA built for same major compute capability but lower minor
-      if (deviceProp.major == my_major and deviceProp.minor > my_minor) {
+      if (deviceProp.major == my_major && deviceProp.minor > my_minor) {
         warningQuda(
           "** Running on a device with compute capability %i.%i but QUDA was compiled for %i.%i. **\n -- This might "
           "result in a lower performance. Please consider adjusting QUDA_GPU_ARCH when running cmake.\n",
diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index 2b0d3c97ba..63cc2f2f85 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -2,8 +2,21 @@
 #include <cstdio>
 #include <string>
 #include <map>
+#ifdef _MSC_VER
+#define NOMINMAX
+#include <Windows.h>
+#include <DbgHelp.h>
+#undef NOMINMAX
+static int getpagesize() // Windows stand-in for POSIX getpagesize(); file-local so the DLL does not export it
+{
+  SYSTEM_INFO info;
+  GetSystemInfo(&info); // the page size is read back from info.dwPageSize
+  return static_cast<int>(info.dwPageSize);
+}
+#else
 #include <unistd.h>   // for getpagesize()
 #include <execinfo.h> // for backtrace
+#endif
 #include <quda_internal.h>
 #include <device.h>
 #include <shmem_helper.cuh>
@@ -82,12 +95,27 @@ namespace quda
   {
     void *array[10];
     size_t size;
+#ifdef _MSC_VER
+    HANDLE process = GetCurrentProcess();
+    SymInitialize(process, NULL, TRUE); // set up the DbgHelp symbol handler used by SymFromAddr below
+    size = CaptureStackBackTrace(0, 10, array, NULL); // capture at most 10 frames, the capacity of array
+    // SYMBOL_INFO is followed in memory by its name buffer; zero-fill so Name is always a terminated string
+    SYMBOL_INFO *symbol = (SYMBOL_INFO *)calloc(1, sizeof(SYMBOL_INFO) + 256 * sizeof(char));
+    if (symbol) {
+      symbol->MaxNameLen = 255;
+      symbol->SizeOfStruct = sizeof(SYMBOL_INFO); // must be the bare struct size, excluding the name buffer
+      for (size_t i = 0; i < size; i++) // frames whose symbol cannot be resolved print a placeholder
+        printfQuda("%s \n", SymFromAddr(process, (DWORD64)(array[i]), 0, symbol) ? symbol->Name : "<unknown>");
+      free(symbol);
+    }
+#else
     char **strings;
     size = backtrace(array, 10);
     strings = backtrace_symbols(array, size);
     printfQuda("Obtained %zd stack frames.\n", size);
     for (size_t i = 0; i < size; i++) printfQuda("%s\n", strings[i]);
     free(strings);
+#endif
   }
 
   static void print_alloc_header()
@@ -158,8 +186,13 @@ namespace quda
     // we need to manually align to page boundaries to allow us to bind a texture to mapped memory
     static int page_size = 2 * getpagesize();
     a.base_size = ((size + page_size - 1) / page_size) * page_size; // round up to the nearest multiple of page_size
+#ifdef _MSC_VER
+    ptr = _aligned_malloc(a.base_size, page_size);
+    if (!ptr) {
+#else
     int align = posix_memalign(&ptr, page_size, a.base_size);
     if (!ptr || align != 0) {
+#endif
 #endif
       errorQuda("Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line,
                 a.func.c_str());
@@ -490,7 +523,11 @@ namespace quda
       cudaError_t err = cudaHostUnregister(ptr);
       if (err != cudaSuccess) { errorQuda("Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func); }
       track_free(PINNED, ptr);
+#ifdef _MSC_VER
+      _aligned_free(ptr);
+#else
       free(ptr);
+#endif
     } else if (alloc[MAPPED].count(ptr)) {
 #ifdef HOST_ALLOC
       cudaError_t err = cudaFreeHost(ptr);
@@ -502,7 +539,11 @@ namespace quda
         errorQuda("Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func);
       }
       track_free(MAPPED, ptr);
+#ifdef _MSC_VER
+      _aligned_free(ptr);
+#else
       free(ptr);
+#endif
 #endif
     } else {
       printfQuda("ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func);
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index eb71bb9d53..2f14eecdb3 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -45,7 +45,7 @@ set(CMAKE_CUDA_FLAGS_STRICT
     "-O3"
     CACHE STRING "Flags used by the CUDA compiler during strict jenkins builds.")
 set(CMAKE_CUDA_FLAGS_RELEASE
-    "-O3 -Xcompiler \"${CXX_OPT}\""
+    "-Zi -O2 -Xcompiler \"${CXX_OPT}\""
     CACHE STRING "Flags used by the CUDA compiler during release builds.")
 set(CMAKE_CUDA_FLAGS_HOSTDEBUG
     "-g"
@@ -265,6 +265,9 @@ target_compile_options(
   quda
   PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:
           -Wreorder
+          $<$<CXX_COMPILER_ID:MSVC>:
+          -Xcompiler=/Zc:__cplusplus
+          >
           $<$<CXX_COMPILER_ID:Clang>:
           -Xcompiler=-Wno-unused-function
           -Xcompiler=-Wno-unknown-pragmas
diff --git a/lib/transform_reduce.cu b/lib/transform_reduce.cu
index 9a4ab393df..4da95cb97d 100644
--- a/lib/transform_reduce.cu
+++ b/lib/transform_reduce.cu
@@ -95,10 +95,10 @@ namespace quda
   template void transform_reduce<plus<double>, complex<signed char>, unsigned int, abs_<double, signed char>, identity>(
     QudaFieldLocation, std::vector<double> &, std::vector<complex<signed char> *> const &, unsigned int,
     abs_<double, signed char>, identity);
-  template double transform_reduce<plus<double>, complex<double>, unsigned long, abs_<double, double>>(
-    QudaFieldLocation, complex<double> const *, unsigned long, abs_<double, double>);
-  template double transform_reduce<plus<double>, complex<float>, unsigned long, abs_<double, float>>(
-    QudaFieldLocation, complex<float> const *, unsigned long, abs_<double, float>);
+  template double transform_reduce<plus<double>, complex<double>, unsigned long long, abs_<double, double>>(
+    QudaFieldLocation, complex<double> const *, unsigned long long, abs_<double, double>);
+  template double transform_reduce<plus<double>, complex<float>, unsigned long long, abs_<double, float>>(
+    QudaFieldLocation, complex<float> const *, unsigned long long, abs_<double, float>);
   // square
   template void transform_reduce<plus<double>, complex<double>, unsigned int, square_<double, double>, identity>(
     QudaFieldLocation, std::vector<double> &, std::vector<complex<double> *> const &, unsigned int,
@@ -115,10 +115,10 @@ namespace quda
   template void transform_reduce<plus<double>, complex<short>, unsigned int, square_<double, short>, identity>(
     QudaFieldLocation, std::vector<double> &, std::vector<complex<short> *> const &, unsigned int,
     square_<double, short>, identity);
-  template double transform_reduce<plus<double>, complex<double>, unsigned long, square_<double, double>>(
-    QudaFieldLocation, complex<double> const *, unsigned long, square_<double, double>);
-  template double transform_reduce<plus<double>, complex<float>, unsigned long, square_<double, float>>(
-    QudaFieldLocation, complex<float> const *, unsigned long, square_<double, float>);
+  template double transform_reduce<plus<double>, complex<double>, unsigned long long, square_<double, double>>(
+    QudaFieldLocation, complex<double> const *, unsigned long long, square_<double, double>);
+  template double transform_reduce<plus<double>, complex<float>, unsigned long long, square_<double, float>>(
+    QudaFieldLocation, complex<float> const *, unsigned long long, square_<double, float>);
   template double transform_reduce<plus<double>, complex<float>, unsigned int, square_<double, float>>(
     QudaFieldLocation, complex<float> const *, unsigned int, square_<double, float>);
   template double transform_reduce<plus<double>, complex<short>, unsigned int, square_<double, short>>(
@@ -140,16 +140,16 @@ namespace quda
   template void transform_reduce<maximum<float>, complex<short>, unsigned int, abs_max_<float, short>, identity>(
     QudaFieldLocation, std::vector<float> &, std::vector<complex<short> *> const &, unsigned int,
     abs_max_<float, short>, identity);
-  template double transform_reduce<maximum<double>, complex<double>, unsigned long, abs_max_<double, double>>(
-    QudaFieldLocation, complex<double> const *, unsigned long, abs_max_<double, double>);
+  template double transform_reduce<maximum<double>, complex<double>, unsigned long long, abs_max_<double, double>>(
+    QudaFieldLocation, complex<double> const *, unsigned long long, abs_max_<double, double>);
   template double transform_reduce<maximum<double>, complex<double>, unsigned int, abs_max_<double, double>>(
     QudaFieldLocation, complex<double> const *, unsigned int, abs_max_<double, double>);
-  template float transform_reduce<maximum<float>, complex<float>, unsigned long, abs_max_<float, float>>(
-    QudaFieldLocation, complex<float> const *, unsigned long, abs_max_<float, float>);
+  template float transform_reduce<maximum<float>, complex<float>, unsigned long long, abs_max_<float, float>>(
+    QudaFieldLocation, complex<float> const *, unsigned long long, abs_max_<float, float>);
   template float transform_reduce<maximum<float>, complex<float>, unsigned int, abs_max_<float, float>>(
     QudaFieldLocation, complex<float> const *, unsigned int, abs_max_<float, float>);
-  template float transform_reduce<maximum<float>, complex<short>, unsigned long, abs_max_<float, short>>(
-    QudaFieldLocation, complex<short> const *, unsigned long, abs_max_<float, short>);
+  template float transform_reduce<maximum<float>, complex<short>, unsigned long long, abs_max_<float, short>>(
+    QudaFieldLocation, complex<short> const *, unsigned long long, abs_max_<float, short>);
   template float transform_reduce<maximum<float>, complex<short>, unsigned int, abs_max_<float, short>>(
     QudaFieldLocation, complex<short> const*, unsigned int, abs_max_<float, short>);
   // abs_min
@@ -169,10 +169,10 @@ namespace quda
   transform_reduce<minimum<float>, complex<signed char>, unsigned int, abs_min_<float, signed char>, identity>(
     QudaFieldLocation, std::vector<float> &, std::vector<complex<signed char> *> const &, unsigned int,
     abs_min_<float, signed char>, identity);
-  template double transform_reduce<minimum<double>, complex<double>, unsigned long, abs_min_<double, double>>(
-    QudaFieldLocation, complex<double> const *, unsigned long, abs_min_<double, double>);
-  template float transform_reduce<minimum<float>, complex<float>, unsigned long, abs_min_<float, float>>(
-    QudaFieldLocation, complex<float> const *, unsigned long, abs_min_<float, float>);
+  template double transform_reduce<minimum<double>, complex<double>, unsigned long long, abs_min_<double, double>>(
+    QudaFieldLocation, complex<double> const *, unsigned long long, abs_min_<double, double>);
+  template float transform_reduce<minimum<float>, complex<float>, unsigned long long, abs_min_<float, float>>(
+    QudaFieldLocation, complex<float> const *, unsigned long long, abs_min_<float, float>);
 
   template void quda::transform_reduce<plus<double>, complex<double>, unsigned int, square_<double, double>, milc_mapper>(
     QudaFieldLocation, std::vector<double> &, std::vector<quda::complex<double> *> const &, unsigned int,
diff --git a/lib/tune.cpp b/lib/tune.cpp
index 3bcd7da964..0f7686ba5c 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -2,7 +2,22 @@
 #include <comm_quda.h>
 #include <quda.h>     // for QUDA_VERSION_STRING
 #include <timer.h>
+#ifdef _MSC_VER
+// Windows does not define the S_ISREG and S_ISDIR macros in stat.h, so we do.
+// We have to define _CRT_INTERNAL_NONSTDC_NAMES 1 before #including sys/stat.h
+// in order for Microsoft's stat.h to define names like S_IFMT, S_IFREG, and S_IFDIR,
+// rather than just defining  _S_IFMT, _S_IFREG, and _S_IFDIR as it normally does.
+#define _CRT_INTERNAL_NONSTDC_NAMES 1
+#include <sys/stat.h>
+#if !defined(S_ISREG) && defined(S_IFMT) && defined(S_IFREG)
+  #define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
+#endif
+#if !defined(S_ISDIR) && defined(S_IFMT) && defined(S_IFDIR)
+  #define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
+#endif
+#else
 #include <sys/stat.h> // for stat()
+#endif
 #include <fcntl.h>
 #include <cfloat> // for FLT_MAX
 #include <ctime>
@@ -10,7 +25,16 @@
 #include <typeinfo>
 #include <map>
 #include <list>
+#ifdef _MSC_VER
+#include <io.h>
+#include <synchapi.h>
+#define sleep(sec) Sleep((sec) * 1000) // seconds -> milliseconds; argument parenthesised so expressions scale as a whole
+#else
 #include <unistd.h>
+#define _open open // the code below uses the underscore-prefixed names; map them to the POSIX calls here
+#define _write write
+#define _close close
+#endif
 #include <uint_to_char.h>
 #include <target_device.h>
 
@@ -23,7 +47,7 @@
 #include <communicator_quda.h>
 
 //#define LAUNCH_TIMER
-extern char *gitversion;
+extern const char *gitversion;
 
 namespace quda
 {
@@ -457,7 +481,7 @@ namespace quda
       // Acquire lock.  Note that this is only robust if the filesystem supports flock() semantics, which is true for
       // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
       lock_path = resource_path + (error ? "/tunecache_error.lock" : "/tunecache.lock");
-      lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
+      lock_handle = _open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
       if (lock_handle == -1) {
         warningQuda("Unable to lock cache file.  Tuned launch parameters will not be cached to disk.  "
                     "If you are certain that no other instances of QUDA are accessing this filesystem, "
@@ -467,7 +491,7 @@ namespace quda
       }
       char msg[] = "If no instances of applications using QUDA are running,\n"
                    "this lock file shouldn't be here and is safe to delete.";
-      int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
+      int stat = _write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
       if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
 
       cache_path = resource_path + (error ? "/tunecache_error.tsv" : "/tunecache.tsv");
@@ -492,7 +516,7 @@ namespace quda
       cache_file.close();
 
       // Release lock.
-      close(lock_handle);
+      _close(lock_handle);
       remove(lock_path.c_str());
 
       initial_cache_size = tunecache.size();
@@ -543,7 +567,7 @@ namespace quda
       // Acquire lock.  Note that this is only robust if the filesystem supports flock() semantics, which is true for
       // NFS on recent versions of linux but not Lustre by default (unless the filesystem was mounted with "-o flock").
       lock_path = resource_path + "/profile.lock";
-      lock_handle = open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
+      lock_handle = _open(lock_path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0666);
       if (lock_handle == -1) {
         warningQuda("Unable to lock profile file.  Profile will not be saved to disk.  "
                     "If you are certain that no other instances of QUDA are accessing this filesystem, "
@@ -553,7 +577,7 @@ namespace quda
       }
       char msg[] = "If no instances of applications using QUDA are running,\n"
                    "this lock file shouldn't be here and is safe to delete.";
-      int stat = write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
+      int stat = _write(lock_handle, msg, sizeof(msg)); // check status to avoid compiler warning
       if (stat == -1) warningQuda("Unable to write to lock file for some bizarre reason");
 
       // profile counter for writing out unique profiles
@@ -660,7 +684,7 @@ namespace quda
       }
 
       // Release lock.
-      close(lock_handle);
+      _close(lock_handle);
       remove(lock_path.c_str());
     }
   }
diff --git a/lib/unitarize_force_quda.cu b/lib/unitarize_force_quda.cu
index 1b5dcbd7ff..8f2d97b6ce 100644
--- a/lib/unitarize_force_quda.cu
+++ b/lib/unitarize_force_quda.cu
@@ -55,7 +55,7 @@ namespace quda {
     void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &u,
 			int* fails)
     {
-      if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+      if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
         getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
         checkReconstruct(u, oldForce, newForce);
         checkPrecision(u, oldForce, newForce);
@@ -93,7 +93,7 @@ namespace quda {
 
     void unitarizeForceCPU(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &u)
     {
-      if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
+      if constexpr (is_enabled(QUDA_STAGGERED_DSLASH)) {
         if (checkLocation(newForce, oldForce, u) != QUDA_CPU_FIELD_LOCATION) errorQuda("Location must be CPU");
         int num_failures = 0;
         constexpr int nColor = 3;
diff --git a/lib/util_quda.cpp b/lib/util_quda.cpp
index e279c0f43e..77ed489adb 100644
--- a/lib/util_quda.cpp
+++ b/lib/util_quda.cpp
@@ -3,7 +3,7 @@
 #include <cstring>
 #include <stack>
 #include <sstream>
-#include <sys/time.h>
+// #include <sys/time.h>
 
 #include <enum_quda.h>
 #include <util_quda.h>