diff --git a/Config.txt b/Config.txt
index e450e62..615d689 100644
--- a/Config.txt
+++ b/Config.txt
@@ -6,6 +6,7 @@ tests\
 samples\
 	vulkan; framework\vulkan
 		vk_extensions
+			cooperative_matrix
 			graph_pipelines
 			image_processing
 			tile_shading
@@ -21,7 +22,7 @@ samples\
 framework\
 	base; framework\external\GameSampleAssets, framework\external\glm, framework\external\tinyobjloader, framework\external\tinygltf, framework\external\json, framework\external\eigen
 	generic; framework\base, framework\external\imgui, framework\external\implot, framework\external\portable-file-dialogs, framework\external\KTX-Software, Tools
-	vulkan; framework\generic, framework\external\volk, framework\external\SPIRV-Cross, framework\external\glslang, framework\external\slang, framework\external\VulkanMemoryAllocator, framework\external\Vulkan-Headers
+	vulkan; framework\generic, framework\external\volk, framework\external\SPIRV-Cross, framework\external\glslang, framework\external\half, framework\external\VulkanMemoryAllocator, framework\external\Vulkan-Headers
 	external\
 			VulkanMemoryAllocator @ https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator/archive/refs/tags/v3.0.1.tar.gz MD5:8571f3def0ff86f228e2864c907ba0b3
 			tinyobjloader @ https://github.com/tinyobjloader/tinyobjloader/archive/e39c1737bc61c8dce28be7932cfe839d408e7838.zip
@@ -35,9 +36,9 @@ framework\
 			volk @ https://github.com/zeux/volk/archive/1e0ec168f1726e6389b8647435a3018f0cef9428.zip
 			SPIRV-Cross @ https://github.com/KhronosGroup/SPIRV-Cross/archive/7affe74d77f93a622bb5002789d5332d32e512ee.zip
 			glslang @ https://github.com/KhronosGroup/glslang/archive/3a7f78758f8faa9a6e059b09e25fc64ede7fbfb0.zip
-			slang @ https://github.com/shader-slang/slang/archive/9c2024a7509baae921083d49a56e1321c51f00ec.zip
 			json @ https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz MD5:4b67aba51ddf17c798e80361f527f50e
 			eigen @ https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz MD5:4c527a9171d71a72a9d4186e65bea559
 			KTX-Software @ https://github.com/KhronosGroup/KTX-Software/archive/refs/tags/v4.1.0.tar.gz MD5:b35fc412cdb3a00aa92aadcdd1e5f004 PATCH:..\cmake\KTX-Software.diff; Tools
 			D3D12MemoryAllocator @ https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator/archive/7597f717c7b32b74d263009ecc15985b517585c7.zip
+			half @ https://github.com/ramenhut/half/archive/43473931db0fae8ecef4ff1492ad18061e3600ec.zip
 
diff --git a/framework/CMakeLists.txt b/framework/CMakeLists.txt
index 238fe67..1e54461 100644
--- a/framework/CMakeLists.txt
+++ b/framework/CMakeLists.txt
@@ -519,12 +519,22 @@ if(FRAMEWORK_ENABLE_VULKAN AND FRAMEWORK_framework_vulkan)
   endif()
 
   # Add vulkan framework dependency libraries
+  set(ENABLE_OPT OFF CACHE BOOL "Disable SPIRV-Tools optimizer" FORCE)
+  set(ENABLE_C_INTERFACE ON CACHE BOOL "" FORCE)
   add_subdirectory(external/volk)
-  #add_subdirectory(external/SPIRV-Cross)
-  #add_subdirectory(external/glslang)
+  add_subdirectory(external/SPIRV-Cross)
+  add_subdirectory(external/glslang)
   #add_subdirectory(external/slang)
 
   add_library(framework_vulkan STATIC ${CPP_VULKAN_SRC} ${EXTERNAL_VULKAN_SRC})
+
+  # Make sure framework_vulkan builds after glslang libs
+  add_dependencies(framework_vulkan
+    glslang
+    glslang-default-resource-limits
+    SPIRV
+  )
+
   target_include_directories(framework_vulkan   PUBLIC code)
   target_include_directories(framework_vulkan   PUBLIC external)
   target_include_directories(framework_vulkan   PUBLIC external/glm)  # so code can do #include "glm/mat3x3.hpp" etc
@@ -532,7 +542,7 @@ if(FRAMEWORK_ENABLE_VULKAN AND FRAMEWORK_framework_vulkan)
   target_include_directories(framework_vulkan   PUBLIC external/imgui)
   target_include_directories(framework_vulkan   PUBLIC external/implot)
 
-  target_link_libraries(framework_vulkan framework)
+  target_link_libraries(framework_vulkan PUBLIC framework)
 
   get_target_property(VulkanHeaders_INCLUDE_DIRS Vulkan::Headers INTERFACE_INCLUDE_DIRECTORIES)
   target_include_directories(framework_vulkan PUBLIC "${VulkanHeaders_INCLUDE_DIRS}")
@@ -545,18 +555,20 @@ if(FRAMEWORK_ENABLE_VULKAN AND FRAMEWORK_framework_vulkan)
   target_include_directories(framework_vulkan   PUBLIC external/slang/include)
 
   # Link vulkan framework library dependencies
-  target_link_libraries(framework_vulkan volk)
-  #target_link_libraries(framework_vulkan spirv-cross-core)
-  #target_link_libraries(framework_vulkan spirv-cross-cpp)
-  #target_link_libraries(framework_vulkan spirv-cross-glsl)
-  #target_link_libraries(framework_vulkan spirv-cross-hlsl)
-  #target_link_libraries(framework_vulkan spirv-cross-msl)
-  #target_link_libraries(framework_vulkan spirv-cross-reflect)
-  #target_link_libraries(framework_vulkan spirv-cross-util)
-  #target_link_libraries(framework_vulkan SPIRV)
-  #target_link_libraries(framework_vulkan glslang)
-  #target_link_libraries(framework_vulkan glslang-default-resource-limits)
-  #target_link_libraries(framework_vulkan slang)
+  target_link_libraries(
+    framework_vulkan
+    PUBLIC
+    glslang
+    glslang-default-resource-limits
+    spirv-cross-core
+    spirv-cross-cpp
+    spirv-cross-glsl
+    spirv-cross-hlsl
+    spirv-cross-msl
+    spirv-cross-reflect
+    spirv-cross-util
+    SPIRV
+  )
   
   target_compile_definitions(framework_vulkan PUBLIC VK_ENABLE_BETA_EXTENSIONS)
 
@@ -660,6 +672,21 @@ if(FRAMEWORK_LIB_OUTPUT)
 	    target_link_libraries(framework_dx12_shared PUBLIC framework_dx12)
     endif()
 
+
+  # Copy the glslang static libraries into the per-configuration library output
+  # folder. $<CONFIG> is used instead of CMAKE_BUILD_TYPE because the latter is
+  # empty under multi-config generators; the guard skips builds without Vulkan.
+  if(TARGET framework_vulkan)
+    add_custom_command(TARGET framework_vulkan POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory "${FRAMEWORK_LIB_OUTPUT}/$<CONFIG>"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different
+              $<TARGET_FILE:glslang>
+              $<TARGET_FILE:glslang-default-resource-limits>
+              $<TARGET_FILE:SPIRV>
+              "${FRAMEWORK_LIB_OUTPUT}/$<CONFIG>"
+      VERBATIM)
+  endif()
+
 endif()
 
 
diff --git a/framework/code/graphicsApi/renderTarget.hpp b/framework/code/graphicsApi/renderTarget.hpp
index 5d0e331..5ec33fc 100644
--- a/framework/code/graphicsApi/renderTarget.hpp
+++ b/framework/code/graphicsApi/renderTarget.hpp
@@ -13,12 +13,15 @@
 class GraphicsApiBase;
 template<typename T_GFXAPI> class RenderTarget;
 
+class RenderTargetBase;
+
 struct RenderTargetInitializeInfo
 {
     uint32_t                                Width = 0;
     uint32_t                                Height = 0;
     std::span<const TextureFormat>          LayerFormats = {};
     TextureFormat                           DepthFormat = TextureFormat::UNDEFINED;
+    RenderTargetBase*                       InheritedDepthAttachment = nullptr;
     const std::span<const TEXTURE_TYPE>     TextureTypes = {};
     const std::optional<const TEXTURE_TYPE> DepthTextureType = std::nullopt;
     std::span<const Msaa>                   Msaa = {};
diff --git a/framework/code/vulkan/extensionLib.cpp b/framework/code/vulkan/extensionLib.cpp
index eeb93aa..0c35d66 100644
--- a/framework/code/vulkan/extensionLib.cpp
+++ b/framework/code/vulkan/extensionLib.cpp
@@ -261,6 +261,20 @@ namespace ExtensionLib
     }
 #endif // VK_KHR_fragment_shading_rate
 
+#if VK_KHR_cooperative_matrix
+    void Ext_VK_KHR_cooperative_matrix::PrintFeatures() const  // Logs each VkPhysicalDeviceCooperativeMatrixFeaturesKHR flag held in AvailableFeatures
+    {
+        LOGI("VK_KHR_cooperative_matrix (VkPhysicalDeviceCooperativeMatrixFeaturesKHR): ");
+        LOGI("    cooperativeMatrix: %s", this->AvailableFeatures.cooperativeMatrix ? "True" : "False");
+        LOGI("    cooperativeMatrixRobustBufferAccess: %s", this->AvailableFeatures.cooperativeMatrixRobustBufferAccess ? "True" : "False");
+    }
+    void Ext_VK_KHR_cooperative_matrix::PrintProperties() const  // Logs the VkPhysicalDeviceCooperativeMatrixPropertiesKHR values held in Properties
+    {
+        LOGI("VK_KHR_cooperative_matrix (VkPhysicalDeviceCooperativeMatrixPropertiesKHR): ");
+        LOGI("    cooperativeMatrixSupportedStages: %d", this->Properties.cooperativeMatrixSupportedStages);
+    }
+#endif // VK_KHR_cooperative_matrix
+
 #if VK_KHR_create_renderpass2
     void Ext_VK_KHR_create_renderpass2::LookupFunctionPointers( VkInstance vkInstance )
     {
diff --git a/framework/code/vulkan/extensionLib.hpp b/framework/code/vulkan/extensionLib.hpp
index beda17d..37055e6 100644
--- a/framework/code/vulkan/extensionLib.hpp
+++ b/framework/code/vulkan/extensionLib.hpp
@@ -239,6 +239,7 @@ namespace ExtensionLib
 
 #endif // VK_KHR_create_renderpass2
 
+
 #if VK_KHR_draw_indirect_count
 
     struct Ext_VK_KHR_draw_indirect_count : public VulkanFunctionPointerExtensionHelper<VulkanExtensionType::eDevice>
@@ -253,6 +254,36 @@ namespace ExtensionLib
 
 #endif // VK_KHR_draw_indirect_count
 
+#if VK_KHR_cooperative_matrix
+
+    struct Ext_VK_KHR_cooperative_matrix : public VulkanFeaturesPropertiesAndFunctionPointerExtensionHelper<  // VK_KHR_cooperative_matrix helper: features, properties and one instance-level entry point
+        VkPhysicalDeviceCooperativeMatrixFeaturesKHR, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR,
+        VkPhysicalDeviceCooperativeMatrixPropertiesKHR, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR>
+    {
+        static constexpr auto Name = VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME;  // name string handed to the base helper by the constructor
+        explicit Ext_VK_KHR_cooperative_matrix(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) : VulkanFeaturesPropertiesAndFunctionPointerExtensionHelper(Name, status)
+        {
+        }
+
+        void PopulateRequestedFeatures() override  // requests exactly the feature flags reported in AvailableFeatures
+        {
+            RequestedFeatures.sType = AvailableFeatures.sType;
+            RequestedFeatures.cooperativeMatrix = AvailableFeatures.cooperativeMatrix;
+            RequestedFeatures.cooperativeMatrixRobustBufferAccess = AvailableFeatures.cooperativeMatrixRobustBufferAccess;
+        }
+        void LookupFunctionPointers(VkInstance vkInstance) override  // resolves the physical-device query function by name through vkGetInstanceProcAddr
+        {
+            m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vkInstance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR");
+        }
+        void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {}  // intentionally empty: no device-level function pointers are looked up
+        void PrintFeatures() const override;
+        void PrintProperties() const override;
+
+        PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = nullptr;  // assigned by LookupFunctionPointers(VkInstance); nullptr until then
+    };
+
+#endif // VK_KHR_cooperative_matrix
+
 #if VK_KHR_depth_stencil_resolve
 
     struct Ext_VK_KHR_depth_stencil_resolve : public VulkanFunctionPointerExtensionHelper<VulkanExtensionType::eDevice>
diff --git a/framework/code/vulkan/renderTarget.cpp b/framework/code/vulkan/renderTarget.cpp
index 89a7c75..dee2f38 100644
--- a/framework/code/vulkan/renderTarget.cpp
+++ b/framework/code/vulkan/renderTarget.cpp
@@ -53,6 +53,8 @@ RenderTarget<Vulkan>& RenderTarget<Vulkan>::operator=( RenderTarget<Vulkan>&& sr
         m_ClearColorValues = std::move( src.m_ClearColorValues );
         m_ResolveAttachments = std::move( src.m_ResolveAttachments );
         m_DepthAttachment = std::move( src.m_DepthAttachment );
+        m_InheritedDepthAttachment = std::move( src.m_InheritedDepthAttachment);
+        src.m_InheritedDepthAttachment = nullptr;
         m_FrameBuffer = std::move( src.m_FrameBuffer );
         m_FrameBufferDepthOnly = std::move( src.m_FrameBufferDepthOnly );
 
@@ -107,15 +109,30 @@ bool RenderTarget<Vulkan>::Initialize( Vulkan* pVulkan, const RenderTargetInitia
 
     m_pLayerFormats.assign( info.LayerFormats.begin(), info.LayerFormats.end() );
 
-    if (!InitializeDepth(depthTextureType))
-        return false;
+    if (info.InheritedDepthAttachment)
+    {
+        auto* inheritedDepthAttachment = apiCast<Vulkan>(info.InheritedDepthAttachment);
+        if (!inheritedDepthAttachment->m_DepthAttachment)
+        {
+            return false;
+        }
+
+        m_InheritedDepthAttachment = &inheritedDepthAttachment->m_DepthAttachment;
+        m_DepthFormat = m_InheritedDepthAttachment->Format;
+    }
+    else
+    {
+        if (!InitializeDepth(depthTextureType))
+            return false;
+    }
+
     if (!InitializeColor(colorTextureTypes))
         return false;
     if (!InitializeResolve( info.ResolveTextureFormats ))
         return false;
-    if (renderPass && *renderPass && !CreateFrameBuffer( *renderPass, m_ColorAttachments, &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBuffer ))
+    if (renderPass && *renderPass && !CreateFrameBuffer( *renderPass, m_ColorAttachments, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBuffer ))
         return false;
-    if (renderPassDepthOnly && *renderPassDepthOnly && m_DepthAttachment && !CreateFrameBuffer( *renderPassDepthOnly, {}, &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBufferDepthOnly ))
+    if (renderPassDepthOnly && *renderPassDepthOnly && (m_InheritedDepthAttachment || m_DepthAttachment) && !CreateFrameBuffer( *renderPassDepthOnly, {}, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBufferDepthOnly ))
         return false;
 
     return true;
@@ -154,7 +171,7 @@ bool RenderTarget<Vulkan>::Initialize(Vulkan* pVulkan, uint32_t uiWidth, uint32_
 bool RenderTarget<Vulkan>::InitializeFrameBuffer( Vulkan* pVulkan, const RenderPass<Vulkan>& renderPass )
 //-----------------------------------------------------------------------------
 {
-    bool success = CreateFrameBuffer( renderPass, m_ColorAttachments, &m_DepthAttachment, m_ResolveAttachments, nullptr, &m_FrameBuffer );
+    bool success = CreateFrameBuffer( renderPass, m_ColorAttachments, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, m_ResolveAttachments, nullptr, &m_FrameBuffer );
     return success;
 }
 
@@ -162,7 +179,7 @@ bool RenderTarget<Vulkan>::InitializeFrameBuffer( Vulkan* pVulkan, const RenderP
 bool RenderTarget<Vulkan>::InitializeFrameBufferDepthOnly( Vulkan* pVulkan, const RenderPass<Vulkan>& renderPassDepthOnly )
 //-----------------------------------------------------------------------------
 {
-    bool success = CreateFrameBuffer( renderPassDepthOnly, {}, &m_DepthAttachment, {}, nullptr, &m_FrameBufferDepthOnly );
+    bool success = CreateFrameBuffer( renderPassDepthOnly, {}, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, {}, nullptr, &m_FrameBufferDepthOnly );
     return success;
 }
 
@@ -321,6 +338,7 @@ void RenderTarget<Vulkan>::Release()
     m_Msaa.clear();
 
     m_DepthAttachment.Release(m_pVulkan);
+    m_InheritedDepthAttachment = nullptr;
     m_DepthFormat = TextureFormat::UNDEFINED;
 
     m_FrameBufferDepthOnly = {};
diff --git a/framework/code/vulkan/renderTarget.hpp b/framework/code/vulkan/renderTarget.hpp
index 7ea397f..8f51a9f 100644
--- a/framework/code/vulkan/renderTarget.hpp
+++ b/framework/code/vulkan/renderTarget.hpp
@@ -225,6 +225,7 @@ class RenderTarget<Vulkan> final : public RenderTargetBase
 
     // The Depth Attachment
     TextureVulkan               m_DepthAttachment;
+    TextureVulkan*              m_InheritedDepthAttachment = nullptr; // Note: Not owning
 
     // The Frame Buffer
     Framebuffer<Vulkan>         m_FrameBuffer;
diff --git a/framework/code/vulkan/vulkan.cpp b/framework/code/vulkan/vulkan.cpp
index 260b1ef..3a9e7d3 100644
--- a/framework/code/vulkan/vulkan.cpp
+++ b/framework/code/vulkan/vulkan.cpp
@@ -787,6 +787,9 @@ bool Vulkan::RegisterKnownExtensions()
     m_ExtKhrSynchronization2 = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_synchronization2>();
     m_ExtKhrDrawIndirectCount = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_draw_indirect_count>();
     m_ExtRenderPass2 = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_create_renderpass2>();
+    m_ExtBufferDeviceAddress = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_buffer_device_address>();
+    m_Ext8BitStorage = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_8bit_storage>();
+    m_ExtCooperativeMatrix = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_cooperative_matrix>();
     m_ExtFragmentShadingRate = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_fragment_shading_rate>();
     m_ExtMeshShader = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_mesh_shader>();
     m_ExtDynamicRendering = m_DeviceExtensions.GetExtension<ExtensionLib::Ext_VK_KHR_dynamic_rendering>();
@@ -1234,6 +1237,7 @@ bool Vulkan::GetDataGraphProcessingEngine()
     // isn't fully supported publicly by the driver
 #if defined(OS_ANDROID)
     {
+#if 0
         auto* Ext_VK_ARM_tensors = static_cast<ExtensionLib::Ext_VK_ARM_tensors*>(m_DeviceExtensions.GetExtension(VK_ARM_TENSORS_EXTENSION_NAME));
         auto* Ext_VK_ARM_data_graph = static_cast<ExtensionLib::Ext_VK_ARM_data_graph*>(m_DeviceExtensions.GetExtension(VK_ARM_DATA_GRAPH_EXTENSION_NAME));
         auto fpGetDeviceProcAddr = (PFN_vkGetDeviceProcAddr)vkGetInstanceProcAddr(GetVulkanInstance(), "vkGetDeviceProcAddr");
@@ -1244,14 +1248,27 @@ bool Vulkan::GetDataGraphProcessingEngine()
         {
             LOGI("Forcing registering and enabling Graph Pipelines extensions for Android");
 
-            Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded;
-            Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr);
-            Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanInstance);
+            try
+            {
+                Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded;
+                Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr);
+                Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanInstance);
+
+                Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded;
+                Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr);
+                Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanInstance);
+
+                LOGI("Forcing registering and enabling Graph Pipelines extensions for Android - Done");
+            }
+            catch (...)
+            {
+                Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded;
+                Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded;
 
-            Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded;
-            Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr);
-            Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanInstance);
+                LOGI("Forcing registering and enabling Graph Pipelines extensions for Android - Failed, disabling EXT");
+            }
         }
+#endif
     }
 #endif
 
diff --git a/framework/code/vulkan/vulkan.hpp b/framework/code/vulkan/vulkan.hpp
index b463a87..3d74fa2 100644
--- a/framework/code/vulkan/vulkan.hpp
+++ b/framework/code/vulkan/vulkan.hpp
@@ -67,6 +67,9 @@ namespace ExtensionLib {
     struct Ext_VK_EXT_hdr_metadata;
     struct Ext_VK_KHR_fragment_shading_rate;
     struct Ext_VK_KHR_create_renderpass2;
+    struct Ext_VK_KHR_buffer_device_address;
+    struct Ext_VK_KHR_8bit_storage;
+    struct Ext_VK_KHR_cooperative_matrix;
     struct Ext_VK_KHR_synchronization2;
     struct Ext_VK_QCOM_tile_properties;
     struct Ext_VK_QCOM_tile_shading;
@@ -1077,6 +1080,9 @@ class Vulkan : public ::GraphicsApiBase
     const ExtensionLib::Ext_VK_EXT_hdr_metadata*             m_ExtHdrMetadata = nullptr;
     const ExtensionLib::Ext_VK_KHR_fragment_shading_rate*    m_ExtFragmentShadingRate = nullptr;
     const ExtensionLib::Ext_VK_KHR_create_renderpass2*       m_ExtRenderPass2 = nullptr;
+    const ExtensionLib::Ext_VK_KHR_buffer_device_address*    m_ExtBufferDeviceAddress = nullptr;
+    const ExtensionLib::Ext_VK_KHR_8bit_storage*             m_Ext8BitStorage = nullptr;
+    const ExtensionLib::Ext_VK_KHR_cooperative_matrix*       m_ExtCooperativeMatrix = nullptr;
     const ExtensionLib::Ext_VK_KHR_synchronization2*         m_ExtKhrSynchronization2 = nullptr;
     const ExtensionLib::Ext_VK_QCOM_tile_properties*         m_ExtQcomTileProperties = nullptr;
     const ExtensionLib::Ext_VK_QCOM_tile_shading*            m_ExtQcomTileShading = nullptr;
diff --git a/project/android/cmake/FrameworkApplicationHelper.cmake b/project/android/cmake/FrameworkApplicationHelper.cmake
index c820f96..3ecb364 100644
--- a/project/android/cmake/FrameworkApplicationHelper.cmake
+++ b/project/android/cmake/FrameworkApplicationHelper.cmake
@@ -55,6 +55,20 @@ target_link_libraries( ${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/../../../../..
 #endif()
 target_link_libraries( ${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libframework.a )
 target_link_libraries( ${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libframework_base.a )
+
+# Link the glslang static archives by full path, one target_link_libraries call
+# per archive and in the order listed. They are looked up in the same
+# android/framework/<build type> folder as the framework archives linked above.
+foreach(GLSLANG_ARCHIVE IN ITEMS
+        libglslang.a
+        libglslang-default-resource-limits.a
+        libSPIRV.a)
+    target_link_libraries(
+        ${TARGET_NAME}
+        ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/${GLSLANG_ARCHIVE}
+    )
+endforeach()
+
 target_include_directories(${TARGET_NAME} PUBLIC ../../framework/code)
 target_include_directories(${TARGET_NAME} PUBLIC ../../framework/external)
 target_include_directories(${TARGET_NAME} PUBLIC ../../framework/external/glm)      # so code can do #include "glm/mat3x3.hpp" etc
diff --git a/samples/README.md b/samples/README.md
new file mode 100644
index 0000000..17be11b
--- /dev/null
+++ b/samples/README.md
@@ -0,0 +1,100 @@
+# Samples
+
+Unless noted all samples run on Windows and Android.
+
+## [empty](empty)
+
+Empty app.  Minimal app linked against Framework. 
+
+## [hello-gltf](hello-gltf)
+
+Scene (gltf) loading app. Implements a working scene with camera movement and minimal lighting.
+
+## [AODemo](AODemo)
+
+Vulkan implementation of Neural Network Ambient Occlusion.
+
+## [FrameworkTest](FrameworkTest)
+
+Simple test project that initializes the Vulkan Framework and displays a textured sphere.
+
+## [MLClothApp](MLClothApp)
+
+Sample project using machine learning to lower cloth simulation cost.
+
+## [deferredLpac](deferredLpac)
+
+App that renders a (reasonably) complex scene using forward rendering and compute shaders.
+
+Where LPAC (Low Priority Asynchronous Compute) is available the Compute jobs will be done on a low priority queue during shadow pass z-buffer write.
+
+## [DspOffload](dspOffload)
+
+App illustrating how the Hexagon DSP can be used to run graphics tasks and write results to GPU accessible Android Hardware Buffers.
+
+## [forward](forward)
+
+App illustrating a reasonably complex forward rendered scene.
+
+## [hdrSwapchain](hdrSwapchain)
+
+Demonstrates the use of different swapchain image formats and colorspaces.  Has a gui dropdown that allows for switching buffer formats on the fly.
+
+Also demonstrates Qualcomm Vulkan render-pass transform extension [VK_QCOM_render_pass_transform](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_QCOM_render_pass_transform.html)
+
+## [rayQueryShadows](rayQueryShadows)
+
+Uses Vulkan Ray Tracing extension (VK_KHR_ray_tracing) to implement shadows using Ray Queries.
+
+Currently Windows only.
+
+## [rotatedCopy](rotatedCopy)
+
+Uses VK_QCOM_rotated_copy_commands (and VK_KHR_copy_commands2) extension to blit from a (lower resolution) intermediate render target to the device framebuffer rotated to match the device's native orientation (and thus avoiding the Android SurfaceFlinger doing an additional rotation/composition step).
+
+## [shaderResolve](shaderResolve)
+
+Uses VK_QCOM_render_pass_shader_resolve extension to implement MSAA and order-independent transparency in a deferred renderer.
+
+## [shaderResolveTonemap](shaderResolveTonemap)
+
+Uses VK_QCOM_render_pass_shader_resolve to perform a filmic tonemapping operator (on a simple forward rendered scene) as part of the MSAA resolve.
+
+Optionally runs the tonemap/resolve as a subpass of the main scene pass.  Has onscreen UI controls to modify MSAA sample counts and to enable/disable the shader resolve and use of subpasses (for measuring GPU subpass/shader-resolve efficiency).
+
+## [atmospherics](atmospherics)
+
+Atmospheric lighting.
+
+# Configuration
+
+Each sample can be configured by adding an 'app_config.txt' file in the root of the relevant sample (ie samples/forward/app_config.txt).
+
+On Android the app_config.txt needs to be pushed to the device, into /sdcard/Android/data/ANDROID_APP_ID/files/. (many samples have a batch file to do this, eg 07_InstallConfig.bat).
+
+If this file is missing or empty the sample application should run with 'reasonable' defaults.
+
+Samples share a set of common settings and can define additional settings specific to the sample's functionality.
+
+## Common config settings
+
+gFramesToRender = x
+
+Render a specific number of frames before exiting the app.  x should be an integer.  0 (default) will render 'forever'.
+
+# File handling
+
+## Windows
+
+Executables are compiled to project\windows\solution\samples\APPLICATION\Debug\APPLICATION.exe
+
+Executables should be run from the samples\APPLICATION folder and data files (textures, models, shaders) are loaded from the Media subfolder.  The Visual Studio solution is pre-configured to run the exe from the correct folder.
+
+## Android
+
+Apk application bundles are compiled to build\android\APPLICATION\outputs\apk\debug\APPLICATION-debug.apk
+
+So long as the sample's Media files were prepared (02_PrepareMedia.bat) before building the apk, the apk is stand-alone and contains the application executable and Media files.
+
+If desired any files in the Media folder can be 'overridden' by copying the relevant file to /sdcard/Android/data/ANDROID_APP_ID/files/. with the expected folder path.  Eg you can copy a shader file from Media\Shaders\. to /sdcard/Android/data/ANDROID_APP_ID/files/Media/Shaders/. and see your new shader code when the application is re-launched.
+
diff --git a/samples/cooperative_matrix/CMakeLists.txt b/samples/cooperative_matrix/CMakeLists.txt
new file mode 100644
index 0000000..3fa97e6
--- /dev/null
+++ b/samples/cooperative_matrix/CMakeLists.txt
@@ -0,0 +1,97 @@
+
+cmake_minimum_required (VERSION 3.21)
+
+project (cooperative_matrix C CXX)
+set(CMAKE_CXX_STANDARD 20)
+
+#
+# Source files included in this application.
+#
+set(CPP_SRC
+    code/main/application.cpp
+    code/main/application.hpp
+    code/main/cooperative_matrix_tester.cpp
+    code/main/cooperative_matrix_tester.hpp
+    code/main/runtime_shader.cpp
+    code/main/runtime_shader.hpp
+)
+
+#
+# Setup the module path to include the 'project directory' (project/windows or project/android)
+#
+if(NOT DEFINED PROJECT_ROOT_DIR)
+    # Windows can use CMAKE_SOURCE_DIR; Android's gradle passes -DPROJECT_ROOT_DIR=${project.rootDir}
+    set(PROJECT_ROOT_DIR ${CMAKE_SOURCE_DIR})
+endif()
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_ROOT_DIR}/cmake)
+
+#
+# Do all the build steps for a Framework application.
+# (Defines ${TARGET_NAME} inside the helper; do NOT modify the helper that links .a files.)
+#
+include(FrameworkApplicationHelper)
+
+# ------------------------------------------------------------------------------
+# Half-precision (float16) helper library, sourced from ../../framework/external/half.
+# It is compiled here as a static library and linked into ${TARGET_NAME}.
+# ------------------------------------------------------------------------------
+
+set(HALF_EXTERNAL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../framework/external)
+set(HALF_SOURCE_FILE   ${HALF_EXTERNAL_ROOT}/half/half.cpp)
+set(HALF_HEADER_FILE   ${HALF_EXTERNAL_ROOT}/half/half.h)
+
+# Static library built from the single half.cpp translation unit
+add_library(half STATIC ${HALF_SOURCE_FILE})
+
+# PUBLIC include path: targets linking 'half' also get framework/external on their include path
+target_include_directories(half PUBLIC ${HALF_EXTERNAL_ROOT})
+
+# Group the half sources under an "external" folder in IDE project views
+source_group(TREE "${HALF_EXTERNAL_ROOT}" PREFIX "external" FILES
+    ${HALF_SOURCE_FILE}
+    ${HALF_HEADER_FILE}
+)
+
+# Keyword-less signature kept: CMake rejects mixing plain and keyword forms on one target
+target_link_libraries(${TARGET_NAME} half)
+
+#
+# Setup asset source and target folders
+#
+
+# cmake will use our GameSampleAssets (default for no parameter) as root directory for any asset request
+# (see FrameworkApplicationHelper.cmake for more info)
+inject_root_asset_path()
+
+# Register local variables for asset request, while also defining them in the C++ code for easy access
+# Here we use the default destination paths, all defined at FrameworkApplicationHelper.cmake
+register_local_asset_path(SHADER_DESTINATION  "${DEFAULT_LOCAL_SHADER_DESTINATION}")
+register_local_asset_path(MESH_DESTINATION    "${DEFAULT_LOCAL_MESH_DESTINATION}")
+register_local_asset_path(TEXTURE_DESTINATION "${DEFAULT_LOCAL_TEXTURE_DESTINATION}")
+
+#
+# Add in the contents of 'shaders' directory
+#
+include(AddShadersDir)
+
+# Search and include all project shaders
+scan_for_shaders()
+
+#
+# Copy required models to local folders
+#
+include(ModelPackager)
+
+# Scene GLTF
+add_gltf(scenes/SteamPunkSauna/SteamPunkSauna.gltf)
+
+#
+# Convert and copy textures to local folders
+#
+include(TexturePackager)
+
+# Scene Textures
+add_textures_from_path(scenes/SteamPunkSauna UASTC)
+
+# Supporting Textures
+add_textures_from_path(textures)
diff --git a/samples/cooperative_matrix/README.md b/samples/cooperative_matrix/README.md
new file mode 100644
index 0000000..9b1226c
--- /dev/null
+++ b/samples/cooperative_matrix/README.md
@@ -0,0 +1,66 @@
+# Hello-GLTF Sample
+
+![Screenshot](img/screenshot.png)
+
+## Overview
+
+Hello GLTF sample demonstrates the most basic usage of the Framework to produce a native Vulkan application and it is designed to be small and simple and meant as a starting point for developers to expand its functionality.
+
+It is recommended that this sample is used as a starting point for other applications using this Framework. To do so, this folder can be copied to a desired location and the respective changes to point the source and include files to the Framework `src` and `include` folders are addressed. For simplicity, the folder of the new sample can be created alongside the `hello-gltf` sample within the `samples` folder. This way, no additional modifications are required in any configuration file to build it out of the box.
+
+## Building
+
+### Dependencies
+
+The following dependencies must be installed and the appropriate locations should be referenced in the `PATH` environment variable.
+
+* Android SDK
+* Android NDK
+* Gradle
+* CMake
+* Android Studio
+
+### Pre-Build
+
+Compile the underlying shaders to .spv by running the batch file below:
+
+```
+01_CompileShaders.bat
+```
+
+And convert the needed textures and shaders to the correct format using the batch file below:
+
+```
+02_PrepareMedia.bat
+```
+
+Note: The sample assumes there are user provided asset files at the following path: **'Media\Meshes\Museum.gltf'** and **'Media\Meshes\Museum.bin'**.
+Texture dependencies from this asset should be added to **'Media\Textures\'** and are required to have the *.ktx* extension.
+There are 3 extra required supporting textures that should also go to the same texture path listed above: **white_d.ktx**, **black_d.ktx** and **normal_default.ktx**.
+The framework team is working to build a centralized asset repository that should minimize these requirements in the near future.
+
+### Build
+
+Once the dependencies are installed and shaders compiled, building this sample .apk/.exe is as simple as running any of the batch files from the framework root directory, according to your target system:
+
+```
+01_BuildAndroid.bat
+02_BuildWindows.bat
+```
+
+### Deploy (android-only)
+
+To deploy the media files and the .apk to a connected device, run the batch files below:
+
+```
+02_CopyMediaToDevice.bat
+03_Install_APK.bat
+```
+
+If desired, you can keep track of any logging by running one of the logcat batch files (which you can find on the current directory).
+
+## Android Studio
+
+This sample can also be easily imported to Android Studio and be used within the Android Studio ecosystem including building, deploying, and native code debugging.
+
+To do this, open Android Studio and go to `File->New->Import Project...` and select the `project\android` folder as the source for the import. This will load up the gradle configuration and once finalized, the sample can be used within Android Studio.
diff --git a/samples/cooperative_matrix/code/main/application.cpp b/samples/cooperative_matrix/code/main/application.cpp
new file mode 100644
index 0000000..b4327fa
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/application.cpp
@@ -0,0 +1,733 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// Sample app exercising VK_KHR_cooperative_matrix through the CooperativeMatrixRunner (its UI is drawn in the HUD)
+///
+
+#include "application.hpp"
+#include "main/applicationEntrypoint.hpp"
+#include "camera/cameraController.hpp"
+#include "camera/cameraControllerTouch.hpp"
+#include "camera/cameraData.hpp"
+#include "camera/cameraGltfLoader.hpp"
+#include "gui/imguiVulkan.hpp"
+#include "material/vulkan/computable.hpp"
+#include "material/vulkan/drawable.hpp"
+#include "material/drawableLoader.hpp"
+#include "material/vulkan/materialManager.hpp"
+#include "material/vulkan/shaderModule.hpp"
+#include "material/vulkan/shaderManager.hpp"
+#include "material/vulkan/specializationConstantsLayout.hpp"
+#include "mesh/meshHelper.hpp"
+#include "mesh/meshLoader.hpp"
+#include "system/math_common.hpp"
+#include "texture/vulkan/textureManager.hpp"
+#include "vulkan/extensionHelpers.hpp"
+#include "vulkan/extensionLib.hpp"
+#include "imgui.h"
+
+#include <random>
+#include <iostream>
+#include <filesystem>
+
+namespace
+{
+    static constexpr std::array<const char*, NUM_RENDER_PASSES> sRenderPassNames = { "RP_HUD", "RP_BLIT" };   // debug names, indexed by RENDER_PASS value
+    // File-local settings. The camera values are consumed by Application::InitializeCamera().
+    glm::vec3 gCameraStartPos = glm::vec3(26.48f, 20.0f, -5.21f);   // initial camera position
+    glm::vec3 gCameraStartRot = glm::vec3(0.0f, 110.0f, 0.0f);      // initial rotation; scaled by TO_RADIANS before use, so presumably Euler angles in degrees
+
+    float   gFOV = PI_DIV_4;                        // handed to m_Camera.SetFov()
+    float   gNearPlane = 1.0f;                      // near clip plane, handed to m_Camera.SetClipPlanes()
+    float   gFarPlane = 1800.0f;                    // far clip plane, handed to m_Camera.SetClipPlanes()
+    float   gNormalAmount = 0.3f;                   // NOTE(review): not referenced anywhere else in this file
+    float   gNormalMirrorReflectAmount = 0.05f;     // NOTE(review): not referenced anywhere else in this file
+}
+
+///
+/// @brief Framework entrypoint: constructs this sample's Application object.
+/// @return Newly allocated Application, returned through its FrameworkApplicationBase base pointer.
+/// Ownership of the returned object passes to the calling (framework) function.
+///
+FrameworkApplicationBase* Application_ConstructApplication()
+{
+    return new Application;
+}
+
+Application::Application()
+    : ApplicationHelperBase()   // nothing else to do here: the real setup happens in Initialize()
+{}
+
+// Intentionally empty: explicit teardown is done in Destroy().
+Application::~Application()
+{}
+
+//-----------------------------------------------------------------------------
+void Application::PreInitializeSetVulkanConfiguration(Vulkan::AppConfiguration& config)
+//-----------------------------------------------------------------------------
+{   // Registers the device extensions this sample needs, on top of whatever the base class registers.
+    ApplicationHelperBase::PreInitializeSetVulkanConfiguration(config);   // base-class configuration first
+    config.RequiredExtension<ExtensionLib::Ext_VK_KHR_get_physical_device_properties2>();
+    config.RequiredExtension<ExtensionLib::Ext_VK_KHR_synchronization2>();
+    config.RequiredExtension<ExtensionLib::Ext_VK_KHR_create_renderpass2>();
+    config.RequiredExtension<ExtensionLib::Ext_VK_KHR_buffer_device_address>();
+    config.RequiredExtension<ExtensionLib::Ext_VK_KHR_8bit_storage>();
+    config.OptionalExtension<ExtensionLib::Ext_VK_KHR_cooperative_matrix>();   // optional: Initialize() only creates the runner when this one was loaded
+}
+
+//-----------------------------------------------------------------------------
+bool Application::Initialize(uintptr_t windowHandle, uintptr_t hInstance)
+//-----------------------------------------------------------------------------
+{
+    // Brings the sample up in a fixed order and returns false at the first step that fails.
+    // windowHandle is forwarded to the base class and to InitGui(); hInstance only to the base class.
+    if (!ApplicationHelperBase::Initialize( windowHandle, hInstance ))
+    {
+        return false;
+    }
+
+    // The extension is only requested as optional, so the runner exists only when it was loaded.
+    if (GetVulkan()->HasLoadedVulkanDeviceExtension(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME))
+    {
+        GetVulkan()->WaitUntilIdle();
+        m_cooperative_matrix_runner = std::make_unique<CooperativeMatrixRunner>(*GetVulkan());
+        LOGI("Initializing cooperative matrix runner");
+        if (!m_cooperative_matrix_runner->InitializeRunner())
+        {
+            LOGE("Cooperative matrix runner failed to initialize");
+            return false;
+        }
+        LOGI("Cooperative matrix runner initialized!");
+    }
+    else
+    {
+        LOGI("%s was not loaded: cooperative matrix tests are disabled", VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME);
+    }
+
+    if (!InitializeCamera())        // camera and its input controller
+    {
+        return false;
+    }
+    if (!LoadShaders())             // shader manager, material manager and the "Blit" shader
+    {
+        return false;
+    }
+    if (!CreateRenderTargets())     // off-screen render target of the HUD pass
+    {
+        return false;
+    }
+    if (!InitAllRenderPasses())     // needs the render targets created just above
+    {
+        return false;
+    }
+    if (!InitGui(windowHandle))     // needs the HUD render pass
+    {
+        return false;
+    }
+    if (!LoadMeshObjects())         // blit quad; needs the "Blit" shader and the blit render context
+    {
+        return false;
+    }
+    if (!InitCommandBuffers())      // primary and secondary command buffers of every pass
+    {
+        return false;
+    }
+    if (!InitLocalSemaphores())     // one pass-complete semaphore per pass
+    {
+        return false;
+    }
+    if (!BuildCmdBuffers())         // records the secondary command buffers; needs everything above
+    {
+        return false;
+    }
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+void Application::Destroy()
+//-----------------------------------------------------------------------------
+{
+    // Releases everything this class created, then hands over to ApplicationHelperBase::Destroy().
+    Vulkan* const pVulkan = GetVulkan();
+    // Created from *GetVulkan() in Initialize(): release it here with the other GPU-facing members.
+    m_cooperative_matrix_runner.reset();
+
+    // Cmd buffers + render targets
+    for (PassData& passData : m_RenderPassData)
+    {
+        for (auto& cmdBuffer : passData.PassCmdBuffer)
+        {
+            cmdBuffer.Release();
+        }
+        for (auto& cmdBuffer : passData.ObjectsCmdBuffer)
+        {
+            cmdBuffer.Release();
+        }
+        passData.RenderTarget.Release();
+    }
+
+    // Render passes / Context / Semaphores
+    for (PassData& passData : m_RenderPassData)
+    {
+        vkDestroySemaphore(pVulkan->m_VulkanDevice, passData.PassCompleteSemaphore, nullptr);
+        passData.PassCompleteSemaphore = VK_NULL_HANDLE;   // do not keep a destroyed handle around
+        passData.RenderContext.clear();
+    }
+
+    // Drawables, then the internal managers
+    m_BlitQuadDrawable.reset();
+    m_ShaderManager.reset();
+    m_MaterialManager.reset();
+    m_CameraController.reset();
+    m_AssetManager.reset();
+
+    ApplicationHelperBase::Destroy();
+}
+
+//-----------------------------------------------------------------------------
+bool Application::InitializeCamera()
+//-----------------------------------------------------------------------------
+{
+    LOGI("******************************");
+    LOGI("Initializing Camera...");
+    LOGI("******************************");
+
+    // Place the camera at its start pose and configure the projection from the file-local settings.
+    const glm::quat startOrientation(gCameraStartRot * TO_RADIANS);
+    const float     aspectRatio = float(gRenderWidth) / float(gRenderHeight);
+    m_Camera.SetPosition(gCameraStartPos, startOrientation);
+    m_Camera.SetAspect(aspectRatio);
+    m_Camera.SetFov(gFOV);
+    m_Camera.SetClipPlanes(gNearPlane, gFarPlane);
+
+    // Touch controller on Android, the default controller everywhere else.
+#if defined(OS_ANDROID)
+    using ControllerType = CameraControllerTouch;
+#else
+    using ControllerType = CameraController;
+#endif
+    // The controller is only handed over to m_CameraController once it initialized successfully.
+    auto pController = std::make_unique<ControllerType>();
+    if (!pController->Initialize(gRenderWidth, gRenderHeight))
+    {
+        return false;
+    }
+    m_CameraController = std::move(pController);
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::LoadShaders()
+//-----------------------------------------------------------------------------
+{
+    // Both managers are (re)created before any shader is loaded.
+    m_ShaderManager = std::make_unique<ShaderManager>(*GetVulkan());
+    m_ShaderManager->RegisterRenderPassNames(sRenderPassNames);
+    m_MaterialManager = std::make_unique<MaterialManager>(*GetVulkan());
+
+    LOGI("******************************");
+    LOGI("Loading Shaders...");
+    LOGI("******************************");
+
+    // (shader id, description file) pairs; loading stops at the first entry that fails.
+    struct ShaderEntry { std::string id; std::string filename; };
+    const ShaderEntry shadersToLoad[] = { { "Blit", "Blit.json" } };
+    for (const ShaderEntry& entry : shadersToLoad)
+    {
+        if (m_ShaderManager->AddShader(*m_AssetManager, entry.id, entry.filename, SHADER_DESTINATION_PATH))
+        {
+            continue;
+        }
+        LOGE("Error Loading shader %s from %s", entry.id.c_str(), entry.filename.c_str());
+        LOGI("Please verify if you have all required assets on the sample media folder");
+        LOGI("If you are running on Android, don't forget to run the `02_CopyMediaToDevice.bat` script to copy all media files into the device memory");
+        return false;
+    }
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::CreateRenderTargets()
+//-----------------------------------------------------------------------------
+{
+    // Creates the off-screen render target of the HUD pass. The blit pass renders straight into
+    // the swapchain image, so it needs no render target of its own.
+    // Returns false (after logging) when the HUD target cannot be created.
+    Vulkan* const pVulkan = GetVulkan();
+
+    LOGI("**************************");
+    LOGI("Creating Render Targets...");
+    LOGI("**************************");
+
+    // Single colour attachment for the HUD.
+    const TextureFormat HudColorType[]  = { TextureFormat::R8G8B8A8_SRGB };
+
+    // Notice no depth on the HUD RT
+    if (!m_RenderPassData[RP_HUD].RenderTarget.Initialize(pVulkan, gSurfaceWidth, gSurfaceHeight, HudColorType, TextureFormat::UNDEFINED, Msaa::Samples1, "HUD RT"))
+    {
+        LOGE("Unable to create hud render target");
+        return false;
+    }
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::InitAllRenderPasses()
+//-----------------------------------------------------------------------------
+{
+    // Creates one render pass per RENDER_PASS entry, plus a framebuffer for every pass that does not target the swapchain.
+    Vulkan* const pVulkan = GetVulkan();
+    //                                       ColorInputUsage |               ClearDepthRenderPass | ColorOutputUsage |                     DepthOutputUsage |              ClearColor
+    m_RenderPassData[RP_HUD].PassSetup   = { RenderPassInputUsage::Clear,    false,                 RenderPassOutputUsage::StoreReadOnly,  RenderPassOutputUsage::Discard, {}};
+    m_RenderPassData[RP_BLIT].PassSetup  = { RenderPassInputUsage::DontCare, true,                  RenderPassOutputUsage::Present,        RenderPassOutputUsage::Discard, {}};
+
+    TextureFormat surfaceFormat = pVulkan->m_SurfaceFormat;
+    auto swapChainColorFormat = std::span<const TextureFormat>({ &surfaceFormat, 1 });   // one-element view over the local copy above
+    auto swapChainDepthFormat = pVulkan->m_SwapchainDepth.format;
+
+    LOGI("******************************");
+    LOGI("Initializing Render Passes... ");
+    LOGI("******************************");
+
+    for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++)
+    {
+        bool isSwapChainRenderPass = whichPass == RP_BLIT;
+        // The blit pass uses the swapchain formats; every other pass uses the formats of its own RenderTarget.
+        std::span<const TextureFormat> colorFormats = isSwapChainRenderPass ? swapChainColorFormat : m_RenderPassData[whichPass].RenderTarget.m_pLayerFormats;
+        TextureFormat                  depthFormat = isSwapChainRenderPass ? swapChainDepthFormat : m_RenderPassData[whichPass].RenderTarget.m_DepthFormat;
+
+        const auto& passSetup = m_RenderPassData[whichPass].PassSetup;
+        auto& passData = m_RenderPassData[whichPass];
+
+        RenderPass renderPass;
+        if (!pVulkan->CreateRenderPass(
+            { colorFormats },
+            depthFormat,
+            Msaa::Samples1,
+            passSetup.ColorInputUsage,
+            passSetup.ColorOutputUsage,
+            passSetup.ClearDepthRenderPass,
+            passSetup.DepthOutputUsage,
+            renderPass))
+        {
+            return false;
+        }
+        // Swapchain framebuffers are taken from pVulkan->m_SwapchainBuffers at record time, so only off-screen passes build one here.
+        Framebuffer<Vulkan> framebuffer;
+        if (!isSwapChainRenderPass)
+        {
+            framebuffer.Initialize(*pVulkan,
+                renderPass,
+                passData.RenderTarget.m_ColorAttachments,
+                &passData.RenderTarget.m_DepthAttachment,
+                sRenderPassNames[whichPass]);
+        }
+        // NOTE(review): the result of framebuffer.Initialize() above is not checked — confirm whether it can fail.
+        passData.RenderContext.push_back({ std::move(renderPass), {}/*pipeline*/, std::move(framebuffer), sRenderPassNames[whichPass] });
+    }
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::InitGui(uintptr_t windowHandle)
+//-----------------------------------------------------------------------------
+{
+    // The GUI gets a copy of the HUD render pass and is sized to the HUD render target.
+    auto& hudPass = m_RenderPassData[RP_HUD];
+    m_Gui = std::make_unique<GuiImguiGfx>(*GetVulkan(), hudPass.RenderContext[0].GetRenderPass().Copy());
+
+    const auto& hudTarget = hudPass.RenderTarget;
+    const bool guiReady = m_Gui->Initialize(windowHandle, TextureFormat::R8G8B8A8_UNORM, hudTarget.m_Width, hudTarget.m_Height);
+    // The GUI's own verdict is the result of this step.
+    return guiReady;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::LoadMeshObjects()
+//-----------------------------------------------------------------------------
+{
+    // Builds the screen-space quad drawable that the blit pass draws; its material samples the HUD colour attachment.
+    Vulkan* const pVulkan = GetVulkan();
+    LOGI("***********************");
+    LOGI("Initializing Meshes... ");
+    LOGI("***********************");
+    // "Blit" is the id under which LoadShaders() registered the shader.
+    const auto* pBlitQuadShader = m_ShaderManager->GetShader("Blit");
+    if (!pBlitQuadShader)
+    {
+        return false;
+    }
+
+    LOGI("*********************");
+    LOGI("Creating Quad mesh...");
+    LOGI("*********************");
+    // The quad is built with the vertex formats declared by the blit shader description.
+    Mesh blitQuadMesh;
+    if (!MeshHelper::CreateMesh<Vulkan>(
+        pVulkan->GetMemoryManager(), 
+        MeshObjectIntermediate::CreateScreenSpaceMesh(), 
+        0, 
+        pBlitQuadShader->m_shaderDescription->m_vertexFormats, 
+        &blitQuadMesh))
+    {
+        return false;
+    }
+
+    // Blit Material: the first callback resolves texture slots by name, the second resolves buffers by name (none are provided here).
+    auto blitQuadShaderMaterial = m_MaterialManager->CreateMaterial(*pBlitQuadShader, 2,   // NOTE(review): the meaning of the literal 2 is not visible in this file — confirm
+        [this](const std::string& texName) -> const MaterialManager::tPerFrameTexInfo
+    {
+        if (texName == "Overlay")
+        {
+            return { &m_RenderPassData[RP_HUD].RenderTarget.m_ColorAttachments[0] };   // first colour attachment of the HUD render target
+        }
+        return {};
+    },
+        [this](const std::string& bufferName) -> PerFrameBuffer
+    {
+        return {};
+    }
+    );
+    // Material and mesh are both moved into the drawable, which is initialized against the blit pass' render context.
+    m_BlitQuadDrawable = std::make_unique<Drawable>(*pVulkan, std::move(blitQuadShaderMaterial));
+    if (!m_BlitQuadDrawable->Init(m_RenderPassData[RP_BLIT].RenderContext[0], std::move(blitQuadMesh)))
+    {
+        return false;
+    }
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::InitCommandBuffers()
+//-----------------------------------------------------------------------------
+{
+    // Creates one primary ("pass") and one secondary ("objects") command buffer per buffer slot
+    // of every render pass. Returns false as soon as one of them fails to initialize.
+    LOGI("*******************************");
+    LOGI("Initializing Command Buffers...");
+    LOGI("*******************************");
+
+    Vulkan* const pVulkan = GetVulkan();
+    auto GetPassName = [](uint32_t whichPass) -> const char*
+    {
+        if (whichPass >= sRenderPassNames.size())
+        {
+            LOGE("GetPassName() called with unknown pass (%u)!", whichPass);
+            return "RP_UNKNOWN";
+        }
+        return sRenderPassNames[whichPass];
+    };
+
+    // HUD buffers are sized by NUM_VULKAN_BUFFERS, blit buffers by the swapchain image count.
+    m_RenderPassData[RP_HUD].PassCmdBuffer.resize(NUM_VULKAN_BUFFERS);
+    m_RenderPassData[RP_HUD].ObjectsCmdBuffer.resize(NUM_VULKAN_BUFFERS);
+    m_RenderPassData[RP_BLIT].PassCmdBuffer.resize(pVulkan->m_SwapchainImageCount);
+    m_RenderPassData[RP_BLIT].ObjectsCmdBuffer.resize(pVulkan->m_SwapchainImageCount);
+    char szName[256];
+    for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++)
+    {
+        auto& passData = m_RenderPassData[whichPass];
+        // Names report the pass' real buffer count, which is not NUM_VULKAN_BUFFERS for every pass.
+        const uint32_t numBuffers = static_cast<uint32_t>(passData.PassCmdBuffer.size());
+        for (uint32_t whichBuffer = 0; whichBuffer < numBuffers; whichBuffer++)
+        {
+            // The Pass Command Buffer => Primary (snprintf cannot write past the end of szName)
+            snprintf(szName, sizeof(szName), "Primary (%s; Buffer %u of %u)", GetPassName(whichPass), whichBuffer + 1, numBuffers);
+            if (!passData.PassCmdBuffer[whichBuffer].Initialize(pVulkan, szName, CommandListBase::Type::Primary))
+            {
+                return false;
+            }
+            // Model => Secondary
+            snprintf(szName, sizeof(szName), "Model (%s; Buffer %u of %u)", GetPassName(whichPass), whichBuffer + 1, numBuffers);
+            if (!passData.ObjectsCmdBuffer[whichBuffer].Initialize(pVulkan, szName, CommandListBase::Type::Secondary))
+            {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::InitLocalSemaphores()
+//-----------------------------------------------------------------------------
+{
+    LOGI("********************************");
+    LOGI("Initializing Local Semaphores...");
+    LOGI("********************************");
+
+    // One "pass complete" semaphore per render pass; only sType is set, all other fields stay zero.
+    VkSemaphoreCreateInfo createInfo = {};
+    createInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    for (PassData& passData : m_RenderPassData)
+    {
+        const VkResult result = vkCreateSemaphore(GetVulkan()->m_VulkanDevice, &createInfo, nullptr, &passData.PassCompleteSemaphore);
+        if (!CheckVkError("vkCreateSemaphore()", result))
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+bool Application::BuildCmdBuffers()
+//-----------------------------------------------------------------------------
+{
+    // Records the secondary ("objects") command buffers of every pass: viewport and scissor state
+    // for all of them, plus the blit quad for the blit pass. Called from Initialize().
+    // Returns false when a command buffer cannot be begun or ended.
+
+    LOGI("***************************");
+    LOGI("Building Command Buffers...");
+    LOGI("****************************");
+
+    Vulkan* const pVulkan = GetVulkan();
+
+    // Step 1: open every objects command buffer and record its viewport and scissor.
+    for (uint32_t passIdx = 0; passIdx < NUM_RENDER_PASSES; passIdx++)
+    {
+        PassData&  pass             = m_RenderPassData[passIdx];
+        const bool targetsSwapchain = (passIdx == RP_BLIT);
+
+        for (uint32_t bufferIdx = 0; bufferIdx < pass.ObjectsCmdBuffer.size(); bufferIdx++)
+        {
+            // The blit pass covers the whole surface; any other pass covers its own render target.
+            const uint32_t width  = targetsSwapchain ? pVulkan->m_SurfaceWidth  : pass.RenderTarget.GetWidth();
+            const uint32_t height = targetsSwapchain ? pVulkan->m_SurfaceHeight : pass.RenderTarget.GetHeight();
+
+            // Full-target viewport; x, y and minDepth stay at their zero-initialized values.
+            VkViewport viewport = {};
+            viewport.width    = (float)width;
+            viewport.height   = (float)height;
+            viewport.maxDepth = 1.0f;
+
+            // Scissor of the same size; the offset stays at the zero-initialized origin.
+            VkRect2D scissor = {};
+            scissor.extent.width  = width;
+            scissor.extent.height = height;
+
+            // The blit pass records against the swapchain framebuffer with the same index; other passes use their own.
+            const VkRenderPass  passHandle  = pass.RenderContext[0].GetRenderPass().mRenderPass;
+            const VkFramebuffer framebuffer = targetsSwapchain
+                ? pVulkan->m_SwapchainBuffers[bufferIdx].framebuffer
+                : pass.RenderContext[0].GetFramebuffer()->m_FrameBuffer;
+
+            auto& cmdBuffer = pass.ObjectsCmdBuffer[bufferIdx];
+            if (!cmdBuffer.Begin(framebuffer, passHandle, targetsSwapchain))
+            {
+                return false;
+            }
+            vkCmdSetViewport(cmdBuffer.m_VkCommandBuffer, 0, 1, &viewport);
+            vkCmdSetScissor(cmdBuffer.m_VkCommandBuffer, 0, 1, &scissor);
+        }
+    }
+
+    // Step 2: add the blit quad to the blit pass' command buffers.
+    auto& blitCmdBuffers = m_RenderPassData[RP_BLIT].ObjectsCmdBuffer;
+    AddDrawableToCmdBuffers(*m_BlitQuadDrawable, blitCmdBuffers.data(), 1, static_cast<uint32_t>(blitCmdBuffers.size()));
+
+    // Step 3: close every command buffer opened in step 1.
+    for (PassData& pass : m_RenderPassData)
+    {
+        for (auto& cmdBuffer : pass.ObjectsCmdBuffer)
+        {
+            if (!cmdBuffer.End())
+            {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+void Application::UpdateGui()
+//-----------------------------------------------------------------------------
+{
+    // Builds this frame's HUD window: FPS, camera position and, when a runner exists, the
+    // cooperative matrix runner's own UI. Does nothing when no GUI was created.
+    if (!m_Gui)
+    {
+        return;
+    }
+
+    m_Gui->Update();
+
+    if (ImGui::Begin("FPS", (bool*)nullptr, ImGuiWindowFlags_NoTitleBar))
+    {
+        ImGui::Text("FPS: %.1f", m_CurrentFPS);
+        ImGui::Text("Camera [%f, %f, %f]", m_Camera.Position().x, m_Camera.Position().y, m_Camera.Position().z);
+        if (m_cooperative_matrix_runner)
+        {
+            m_cooperative_matrix_runner->RenderUI();
+        }
+    }
+    ImGui::End();   // End() has to be called no matter what Begin() returned
+}
+
+//-----------------------------------------------------------------------------
+void Application::Render(float fltDiffTime)
+//-----------------------------------------------------------------------------
+{
+    // Per-frame entry point: runs pending runner tests, records and submits the HUD and blit passes, then presents.
+    Vulkan* const pVulkan = GetVulkan();
+    if (m_cooperative_matrix_runner)
+    {
+        pVulkan->WaitUntilIdle();   // NOTE(review): waits for the device on every frame while a runner exists — confirm this is intended
+        m_cooperative_matrix_runner->TriggerPendingTests();
+    }
+
+    // Obtain the next swap chain image for the next frame.
+    auto currentVulkanBuffer = pVulkan->SetNextBackBuffer();
+    uint32_t whichBuffer     = currentVulkanBuffer.idx;
+
+    // ********************************
+    // Application Draw() - Begin
+    // ********************************
+
+    UpdateGui();
+
+    // Update camera (the frame time is scaled by 10 before it reaches the controller)
+    m_Camera.UpdateController(fltDiffTime * 10.0f, *m_CameraController);
+    m_Camera.UpdateMatrices();
+
+    // Semaphore chain: the first submit waits on the back buffer's semaphore; every submit below then replaces pWaitSemaphores with its own completion semaphore.
+    std::span<const VkSemaphore> pWaitSemaphores = { &currentVulkanBuffer.semaphore, 1 };
+
+    const VkPipelineStageFlags DefaultGfxWaitDstStageMasks[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
+
+    // RP_HUD (skipped entirely when there is no GUI or the GUI recorded nothing)
+    VkCommandBuffer guiCommandBuffer = VK_NULL_HANDLE;
+    if (m_Gui)
+    {
+        // Render gui (has its own command buffer, optionally returns vk_null_handle if not rendering anything)
+        guiCommandBuffer = GetGui()->Render(whichBuffer, m_RenderPassData[RP_HUD].RenderContext[0].GetFramebuffer()->m_FrameBuffer);
+        if (guiCommandBuffer != VK_NULL_HANDLE)
+        {
+            BeginRenderPass(whichBuffer, RP_HUD, currentVulkanBuffer.swapchainPresentIdx);
+            vkCmdExecuteCommands(m_RenderPassData[RP_HUD].PassCmdBuffer[whichBuffer].m_VkCommandBuffer, 1, &guiCommandBuffer);
+            EndRenderPass(whichBuffer, RP_HUD);
+
+            // Submit the commands to the queue.
+            SubmitRenderPass(whichBuffer, RP_HUD, pWaitSemaphores, DefaultGfxWaitDstStageMasks, { &m_RenderPassData[RP_HUD].PassCompleteSemaphore,1 });
+            pWaitSemaphores = { &m_RenderPassData[RP_HUD].PassCompleteSemaphore,1 };   // the blit pass now waits on the HUD pass
+        }
+    }
+
+    // Blit Results to the screen
+    {
+        BeginRenderPass(whichBuffer, RP_BLIT, currentVulkanBuffer.swapchainPresentIdx);
+        AddPassCommandBuffer(whichBuffer, RP_BLIT);
+        EndRenderPass(whichBuffer, RP_BLIT);
+
+        // Submit the commands to the queue (this last submit also receives the back buffer's fence).
+        SubmitRenderPass(whichBuffer, RP_BLIT, pWaitSemaphores, DefaultGfxWaitDstStageMasks, { &m_RenderPassData[RP_BLIT].PassCompleteSemaphore,1 }, currentVulkanBuffer.fence);
+        pWaitSemaphores = { &m_RenderPassData[RP_BLIT].PassCompleteSemaphore,1 };   // presentation waits on the blit pass
+    }
+
+    // Queue is loaded up, tell the driver to start processing
+    pVulkan->PresentQueue(pWaitSemaphores, currentVulkanBuffer.swapchainPresentIdx);
+
+    // ********************************
+    // Application Draw() - End
+    // ********************************
+}
+
+//-----------------------------------------------------------------------------
+void Application::BeginRenderPass(uint32_t whichBuffer, RENDER_PASS whichPass, uint32_t WhichSwapchainImage)
+//-----------------------------------------------------------------------------
+{
+    // Resets and begins the pass' primary command buffer, then begins the render pass on it.
+    //   whichBuffer         - index into the pass' PassCmdBuffer array
+    //   whichPass           - render pass to begin
+    //   WhichSwapchainImage - swapchain image whose framebuffer is used; only read for RP_BLIT
+    Vulkan* const pVulkan = GetVulkan();
+    auto& renderPassData         = m_RenderPassData[whichPass];
+    auto& passCmdBuffer          = renderPassData.PassCmdBuffer[whichBuffer];
+    bool  bisSwapChainRenderPass = whichPass == RP_BLIT;
+    // Failures below are only logged: there is no return value to report them through.
+    if (!passCmdBuffer.Reset())
+    {
+        LOGE("Pass (%d) command buffer Reset() failed !", whichPass);
+    }
+    if (!passCmdBuffer.Begin())
+    {
+        LOGE("Pass (%d) command buffer Begin() failed !", whichPass);
+    }
+
+    // VkFramebuffer is a non-dispatchable handle (a plain 64-bit integer on 32-bit targets): use VK_NULL_HANDLE, not nullptr.
+    VkFramebuffer framebuffer = VK_NULL_HANDLE;
+    switch (whichPass)
+    {
+    case RP_HUD:
+        framebuffer = renderPassData.RenderContext[0].GetFramebuffer()->m_FrameBuffer;
+        break;
+    case RP_BLIT:
+        framebuffer = pVulkan->m_SwapchainBuffers[WhichSwapchainImage].framebuffer;
+        break;
+    default:
+        break;
+    }
+    assert(framebuffer != VK_NULL_HANDLE);
+
+    VkRect2D passArea = {};   // offset stays at the zero-initialized origin
+    passArea.extent.width  = bisSwapChainRenderPass ? pVulkan->m_SurfaceWidth  : renderPassData.RenderTarget.m_Width;
+    passArea.extent.height = bisSwapChainRenderPass ? pVulkan->m_SurfaceHeight : renderPassData.RenderTarget.m_Height;
+
+    TextureFormat                  swapChainColorFormat  = pVulkan->m_SurfaceFormat;
+    auto                           swapChainColorFormats = std::span<const TextureFormat>({ &swapChainColorFormat, 1 });
+    TextureFormat                  swapChainDepthFormat  = pVulkan->m_SwapchainDepth.format;
+    std::span<const TextureFormat> colorFormats          = bisSwapChainRenderPass ? swapChainColorFormats : renderPassData.RenderTarget.m_pLayerFormats;
+    TextureFormat                  depthFormat           = bisSwapChainRenderPass ? swapChainDepthFormat : renderPassData.RenderTarget.m_DepthFormat;
+    VkClearColorValue clearColor = { renderPassData.PassSetup.ClearColor[0], renderPassData.PassSetup.ClearColor[1], renderPassData.PassSetup.ClearColor[2], renderPassData.PassSetup.ClearColor[3] };
+
+    passCmdBuffer.BeginRenderPass(
+        passArea,
+        0.0f,
+        1.0f,
+        { &clearColor , 1 },
+        (uint32_t)colorFormats.size(),
+        depthFormat != TextureFormat::UNDEFINED,
+        renderPassData.RenderContext[0].GetRenderPass().mRenderPass,
+        bisSwapChainRenderPass,
+        framebuffer,
+        VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS);
+}
+
+
+//-----------------------------------------------------------------------------
+void Application::AddPassCommandBuffer(uint32_t whichBuffer, RENDER_PASS whichPass)
+//-----------------------------------------------------------------------------
+{
+    // Executes the recorded objects command buffer inside the pass buffer, unless it holds no draw calls.
+    auto& objects = m_RenderPassData[whichPass].ObjectsCmdBuffer[whichBuffer];
+    if (!objects.m_NumDrawCalls) { return; }
+    vkCmdExecuteCommands(m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].m_VkCommandBuffer, 1, &objects.m_VkCommandBuffer);
+}
+
+//-----------------------------------------------------------------------------
+void Application::EndRenderPass(uint32_t whichBuffer, RENDER_PASS whichPass)
+//-----------------------------------------------------------------------------
+{   // Ends the render pass on the pass' primary command buffer (the counterpart of BeginRenderPass()).
+    m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].EndRenderPass();
+}
+
+//-----------------------------------------------------------------------------
+void Application::SubmitRenderPass(uint32_t whichBuffer, RENDER_PASS whichPass, const std::span<const VkSemaphore> WaitSemaphores, const std::span<const VkPipelineStageFlags> WaitDstStageMasks, std::span<VkSemaphore> SignalSemaphores, VkFence CompletionFence)
+//-----------------------------------------------------------------------------
+{   // Ends recording of the pass' primary command buffer and submits it with the given wait/signal semaphores and fence; an End() failure is logged, as in BeginRenderPass().
+    if (!m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].End()) { LOGE("Pass (%d) command buffer End() failed !", whichPass); }
+    m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].QueueSubmit(WaitSemaphores, WaitDstStageMasks, SignalSemaphores, CompletionFence);
+}
diff --git a/samples/cooperative_matrix/code/main/application.hpp b/samples/cooperative_matrix/code/main/application.hpp
new file mode 100644
index 0000000..daa2400
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/application.hpp
@@ -0,0 +1,112 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// Application class and per-pass data of the cooperative matrix sample
+///
+#pragma once
+
+#include "main/applicationHelperBase.hpp"
+#include "memory/vulkan/uniform.hpp"
+#include "vulkan/commandBuffer.hpp"
+#include "cooperative_matrix_tester.hpp"
+#include "vulkan/renderPass.hpp"
+#include <unordered_map>
+
+enum RENDER_PASS         // Render passes of the sample; the values index m_RenderPassData and sRenderPassNames
+{
+    RP_HUD = 0,          // GUI pass, rendered into its own off-screen render target
+    RP_BLIT,             // final pass, rendered directly into the swapchain image
+    NUM_RENDER_PASSES    // number of passes (used as array size), not a pass itself
+};
+
+// **********************
+// Render Pass
+// **********************
+struct PassSetupInfo   // Per-pass settings; filled positionally in InitAllRenderPasses(), so the member order below matters
+{
+    RenderPassInputUsage    ColorInputUsage;        // forwarded to Vulkan::CreateRenderPass()
+    bool                    ClearDepthRenderPass;   // forwarded to Vulkan::CreateRenderPass()
+    RenderPassOutputUsage   ColorOutputUsage;       // forwarded to Vulkan::CreateRenderPass()
+    RenderPassOutputUsage   DepthOutputUsage;       // forwarded to Vulkan::CreateRenderPass()
+    glm::vec4               ClearColor;             // copied component by component into the VkClearColorValue used by BeginRenderPass()
+};
+
+struct PassData
+{
+    // Pass internal data
+    PassSetupInfo PassSetup;
+    std::vector<RenderContext<Vulkan>>  RenderContext;  // context per framebuffer (some passes might all point to the same framebuffers); this sample only ever uses element 0
+
+    // Secondary command buffers holding the objects recorded for this pass
+    std::vector< CommandListVulkan> ObjectsCmdBuffer;
+
+    // Primary command buffers used to begin, execute and submit the render pass
+    std::vector< CommandListVulkan> PassCmdBuffer;
+
+    // Signalled when the submitted render pass has completed
+    VkSemaphore PassCompleteSemaphore = VK_NULL_HANDLE;
+
+    // Render target used by the underlying render pass
+    // note: the blit pass uses the backbuffer directly instead of this RT
+    RenderTarget<Vulkan> RenderTarget;
+};
+
+// **********************
+// Application
+// **********************
+class Application : public ApplicationHelperBase   // Cooperative matrix sample: a HUD pass and a blit-to-swapchain pass around an optional CooperativeMatrixRunner
+{
+public:
+    Application();
+    ~Application() override;
+
+    // ApplicationHelperBase
+    void PreInitializeSetVulkanConfiguration(Vulkan::AppConfiguration& config) override;
+    bool Initialize(uintptr_t windowHandle, uintptr_t hInstance) override;
+    void Destroy() override;
+    void Render(float fltDiffTime) override;
+
+private:
+    // Application - Initialization
+    // (called by Initialize() in this order; each step returns false on failure)
+    bool InitializeCamera();
+    bool LoadShaders();
+    bool CreateRenderTargets();
+    bool InitAllRenderPasses();
+    bool InitGui(uintptr_t windowHandle);
+    bool LoadMeshObjects();
+    bool InitCommandBuffers();
+    bool InitLocalSemaphores();
+    bool BuildCmdBuffers();
+
+private:
+    // Application - Frame
+    // (per-frame helpers called from Render(); CompletionFence defaults to "no fence")
+    void BeginRenderPass(uint32_t WhichBuffer, RENDER_PASS WhichPass, uint32_t WhichSwapchainImage);
+    void AddPassCommandBuffer(uint32_t WhichBuffer, RENDER_PASS WhichPass);
+    void EndRenderPass(uint32_t WhichBuffer, RENDER_PASS WhichPass);
+    void SubmitRenderPass(uint32_t WhichBuffer, RENDER_PASS WhichPass, const std::span<const VkSemaphore> WaitSemaphores, const std::span<const VkPipelineStageFlags> WaitDstStageMasks, std::span<VkSemaphore> SignalSemaphores, VkFence CompletionFence = VK_NULL_HANDLE);
+    void UpdateGui();
+
+private:
+
+    // Render passes
+    std::array< PassData, NUM_RENDER_PASSES> m_RenderPassData;
+
+    // Drawables
+    std::unique_ptr<Drawable> m_BlitQuadDrawable;
+
+    // Shaders
+    std::unique_ptr<ShaderManager> m_ShaderManager;
+
+    // Materials
+    std::unique_ptr<MaterialManager> m_MaterialManager;
+    // Cooperative matrix test runner; only created when VK_KHR_cooperative_matrix was loaded (null otherwise)
+    std::unique_ptr<CooperativeMatrixRunner> m_cooperative_matrix_runner;
+};
diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp b/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp
new file mode 100644
index 0000000..e99ac0b
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp
@@ -0,0 +1,347 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// GLSL compute shader sources for the cooperative matrix sample, embedded as raw string literals
+///
+#pragma once
+
+#include <string>
+
+const char* Test01_MxM_Basic = R"(
+#version 450 core
+#pragma use_vulkan_memory_model
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_scalar_block_layout : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_EXT_buffer_reference : enable
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int32   : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8    : enable
+
+// These specialized constants are set inside the host
+layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0
+layout(constant_id = 1) const uint lsy = 2;  // local_size_y set inside the host and map to constant_id = 1
+layout(constant_id = 2) const uint lsz = 2;  // local_size_z set inside the host and map to constant_id = 2
+layout(constant_id = 3) const uint TOTAL_M = 1;
+layout(constant_id = 4) const uint TOTAL_N = 1;
+layout(constant_id = 5) const uint TOTAL_K = 1;
+layout(constant_id = 6) const uint TILE_M = 1;
+layout(constant_id = 7) const uint TILE_N = 1;
+layout(constant_id = 8) const uint TILE_K = 1;
+layout(constant_id = 9) const bool layoutA_Mfirst = false;
+layout(constant_id = 10) const bool layoutB_Kfirst = false;
+layout(constant_id = 11) const bool layoutC_Mfirst = false;
+layout(constant_id = 12) const bool layoutR_Mfirst = false;
+layout(constant_id = 13) const uint strideAinElements = 1;
+layout(constant_id = 14) const uint strideBinElements = 1;
+layout(constant_id = 15) const uint strideCinElements = 1;
+layout(constant_id = 16) const uint strideRinElements = 1;
+
+// #defines set on compiler GLSL to SPIR-V command line:
+// A_TYPE = e.g. float or float16_t
+// R_TYPE = e.g. float or float16_t
+
+layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA;
+layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB;
+layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC;
+layout(set=0, binding=3)  buffer Output { R_TYPE x[]; } outputO;
+
+//layout(set=0, binding=0, std430) uniform Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params;
+
+// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; 
+
+// Very simple shader, similar to our OpenCL MxM https://github.qualcomm.com/grtrt/TCU/blob/main/Kernels/MxM_Baseline_Sections.cl
+// 
+void main()
+{
+    //int la = (layoutA_Mfirst ? 1 : 0);
+    //int lb = (layoutB_Kfirst ? 1 : 0);
+    // Example of how to use printf, for details https://confluence.qualcomm.com/confluence/display/GCEA/Use+printf%28...%29+inside+a+Vulkan+shader+using+GLSL
+ //if ( (gl_GlobalInvocationID.x == 0) && (gl_GlobalInvocationID.y == 0) && (gl_GlobalInvocationID.z == 0))
+ //        debugPrintfEXT("\nMxM_Basic.comp with:\nTOTAL_M(%d), TOTAL_N(%d), TOTAL_K(%d)\nTILE_M(%d), TILE_N(%d), TILE_K(%d)\nlayoutA_Mfirst(%d), layoutB_Kfirst(%d)\nWGSize(%d, %d, %d), numWG(%d, %d, %d)\n", 
+ //        TOTAL_M, TOTAL_N, TOTAL_K, TILE_M, TILE_N, TILE_K, la, lb,
+ //        gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z,
+ //        gl_NumWorkGroups.x, gl_NumWorkGroups.y, gl_NumWorkGroups.z);
+ //    //debugPrintfEXT("\nRunning GLSL shader MxM_Basic.comp at GlobalInvocationID(0,0,0), WorkGroupSize(%d, %d, %d)\n", gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
+
+//    if ((gl_GlobalInvocationID.x == 0) && (gl_GlobalInvocationID.y == 0) && (gl_GlobalInvocationID.z == 0))
+//        debugPrintfEXT("\nMxM_Basic.comp with gl_SubgroupSize(%d)\n", gl_SubgroupSize);
+
+    const uint32_t block_id_m = gl_GlobalInvocationID.y;
+    const uint32_t block_id_n = gl_GlobalInvocationID.z;
+    if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return;
+
+    const uint32_t row = block_id_m * TILE_M;
+    const uint32_t col = block_id_n * TILE_N;
+    
+    // Initialize result matR to zero, not using matC in this shader
+    coopmat<R_TYPE, gl_ScopeSubgroup, TILE_M, TILE_N, gl_MatrixUseAccumulator> matR;
+    matR = coopmat<R_TYPE, gl_ScopeSubgroup, TILE_M, TILE_N, gl_MatrixUseAccumulator>(0.0);
+    
+    for (uint32_t step = 0; step < TOTAL_K; step += TILE_K)
+    {
+        // On each iteration, load a row of cooperative matrices from matrix A,
+        // load a column of cooperative matrices from matrix B, and multiply all
+        // pairs of those matrices.
+        uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step;
+        uint32_t subMatrixBStartInElements = layoutB_Kfirst ? col * strideBinElements + step : col + step * strideBinElements;
+
+        //debugPrintfEXT("\nstep(%d), subMatrixAStartInElements(%d), strideAinElements(%d), subMatrixBStartInElements(%d), strideBinElements(%d)\nat GlobalID(%d, %d, %d), LocalID(%d, %d, %d), WGID(%d, %d, %d)\n",
+        //    step, subMatrixAStartInElements, strideAinElements, subMatrixBStartInElements, strideBinElements,
+        //    gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z,
+        //    gl_LocalInvocationID.x, gl_LocalInvocationID.y, gl_LocalInvocationID.z,
+        //    gl_WorkGroupID.x, gl_WorkGroupID.y, gl_WorkGroupID.z);
+         
+
+        coopmat<A_TYPE, gl_ScopeSubgroup, TILE_M, TILE_K, gl_MatrixUseA> matA;
+        coopMatLoad(matA, inputA.x, subMatrixAStartInElements, strideAinElements, int(layoutA_Mfirst));
+
+        coopmat<A_TYPE, gl_ScopeSubgroup, TILE_K, TILE_N, gl_MatrixUseB> matB;
+        coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst));
+
+        //for (int i = (gl_LocalInvocationID.x > 63 ? 20 : 0); i < 100; i++) // diable unroll, test gpu_freq, should around 1%
+        matR = coopMatMulAdd(matA, matB, matR);
+    }
+
+    // Store results
+    uint32_t subMatrixRStartInElements = layoutR_Mfirst ? col * strideRinElements + row : row * strideRinElements + col;
+
+    coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst));
+
+    // Example of how to peek results before storing back
+    //float f = float(matR[1]);
+    //if (gl_LocalInvocationIndex == 0) debugPrintfEXT("matR[0]=%f\n", f);
+
+}
+)";
+
+const char* Test03_CONV = R"(
+#version 450 core
+#pragma use_vulkan_memory_model
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_scalar_block_layout : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_EXT_buffer_reference : enable
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int32   : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8    : enable
+#extension GL_QCOM_cooperative_matrix_conversion : enable
+
+// These specialized constants are set inside the host
+layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0
+layout(constant_id = 1) const uint lsy = 2;  // local_size_y set inside the host and map to constant_id = 1
+layout(constant_id = 2) const uint lsz = 2;  // local_size_z set inside the host and map to constant_id = 2
+layout(constant_id = 3)  const uint TOTAL_M = 1;
+layout(constant_id = 4)  const uint TOTAL_N = 1;
+layout(constant_id = 5)  const uint TOTAL_K = 1;
+layout(constant_id = 6)  const uint TILE_M = 1;
+layout(constant_id = 7)  const uint TILE_N = 1;
+layout(constant_id = 8)  const uint TILE_K = 1;
+layout(constant_id = 9)  const uint INPUT_W = 1;
+layout(constant_id = 10)  const uint INPUT_H = 1;
+layout(constant_id = 11)  const uint FILTER_W = 1;
+layout(constant_id = 12)  const uint FILTER_H = 1;
+layout(constant_id = 13) const uint DILATION = 1;
+layout(constant_id = 14) const uint STRIDE  = 1;
+layout(constant_id = 15) const uint strideAinElements = 1;
+layout(constant_id = 16) const uint strideBinElements = 1;
+layout(constant_id = 17) const uint strideCinElements = 1;
+layout(constant_id = 18) const uint strideRinElements = 1;
+
+// #defines set on compiler GLSL to SPIR-V command line:
+// A_TYPE = e.g. float or float16_t
+// R_TYPE = e.g. float or float16_t
+
+layout(set=0, binding=0) readonly buffer InputA     { A_TYPE   x[]; } inputA;
+layout(set=0, binding=0) readonly buffer InputAuint { uint32_t x[]; } inputAuint;
+layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB;
+layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC;
+layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO;
+
+// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; 
+
+// Very simple shader, similar to our OpenCL CONV https://github.qualcomm.com/grtrt/TCU/blob/main/Kernels/Conv_MxM_Short.cl
+// 
+void main()
+{
+    const uint32_t block_id_m = gl_GlobalInvocationID.y;
+    const uint32_t block_id_n = gl_GlobalInvocationID.z;
+    if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return;
+
+    const uint32_t row = block_id_m * TILE_M;
+    const uint32_t col = block_id_n * TILE_N;
+    
+    uint32_t gidx_m = gl_GlobalInvocationID.x + TILE_M * gl_GlobalInvocationID.y; // fibers along M
+    uint32_t out_col_id = gidx_m % INPUT_W;
+    uint32_t out_row_id = gidx_m / INPUT_W;
+
+    uint32_t filter_offset_h = (FILTER_H % 2 == 0)? 0 : FILTER_H/2;
+    uint32_t filter_offset_w = (FILTER_W % 2 == 0)? 0 : FILTER_W/2;
+
+    // Initialize result matR to zero, not using matC in this shader
+    coopmat<R_TYPE, gl_ScopeSubgroup, TILE_M, TILE_N, gl_MatrixUseAccumulator> matR;
+    matR = coopmat<R_TYPE, gl_ScopeSubgroup, TILE_M, TILE_N, gl_MatrixUseAccumulator>(0.0);
+    
+    for (uint32_t step = 0; step < TOTAL_K; step += TILE_K)
+    {
+        uint32_t subMatrixBStartInElements = col * FILTER_H * FILTER_W * strideBinElements + step; // B is Kfirst
+        for (uint32_t filter_row = 0; filter_row < FILTER_H; filter_row++)
+        {
+            for (uint32_t filter_col = 0; filter_col < FILTER_W; filter_col++)
+            {
+                coopmat<A_TYPE, gl_ScopeSubgroup, TILE_K, TILE_N, gl_MatrixUseB> matB;
+                coopmat<A_TYPE, gl_ScopeSubgroup, TILE_M, TILE_K, gl_MatrixUseA> matA;
+
+                // load B matrix input data using coop_mat extension
+                coopMatLoad(matB, inputB.x, subMatrixBStartInElements, FILTER_H * FILTER_W * strideBinElements, int(true));
+
+                // load A matrix input data as vectors using regular vector load
+                uint32_t input_row_id = STRIDE * out_row_id + DILATION * (filter_row - filter_offset_h);
+                uint32_t input_col_id = STRIDE * out_col_id + DILATION * (filter_col - filter_offset_w);
+
+                // load A vector data from memory
+                uint32_t vecA[TILE_K/NUM_PACK];
+                for (int i=0; i<TILE_K/NUM_PACK; i++)
+                    vecA[i] = inputAuint.x[(input_row_id * INPUT_W + input_col_id) * strideAinElements/NUM_PACK + step/NUM_PACK + i];
+
+                // zero fill A vector data for out of boundary cases
+                if ((input_row_id < 0) || (input_row_id >= INPUT_H) || (input_col_id < 0) || (input_col_id >= INPUT_W))
+                  for (int i=0; i<TILE_K/NUM_PACK; i++) vecA[i] = uint32_t(0);
+
+                // convert A vector to A matrix
+                vectorToCoopmatQCOM(vecA, matA);
+
+                matR = coopMatMulAdd(matA, matB, matR);
+
+                subMatrixBStartInElements += strideBinElements;
+            }
+        }
+    }
+
+    // Store results
+    uint32_t subMatrixRStartInElements = row * strideRinElements + col;
+    coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(false));
+}
+)";
+
+const char* Test02_MxM_VecToMat = R"(
+#version 450 core
+#pragma use_vulkan_memory_model
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_scalar_block_layout : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_EXT_buffer_reference : enable
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int32   : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8    : enable
+#extension GL_QCOM_cooperative_matrix_conversion : enable
+
+// These specialized constants are set inside the host
+layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0
+layout(constant_id = 1) const uint lsy = 2;  // local_size_y set inside the host and map to constant_id = 1
+layout(constant_id = 2) const uint lsz = 2;  // local_size_z set inside the host and map to constant_id = 2
+layout(constant_id = 3) const uint TOTAL_M = 1;
+layout(constant_id = 4) const uint TOTAL_N = 1;
+layout(constant_id = 5) const uint TOTAL_K = 1;
+layout(constant_id = 6) const uint TILE_M = 1;
+layout(constant_id = 7) const uint TILE_N = 1;
+layout(constant_id = 8) const uint TILE_K = 1;
+layout(constant_id = 9) const bool layoutA_Mfirst = false;
+layout(constant_id = 10) const bool layoutB_Kfirst = false;
+layout(constant_id = 11) const bool layoutC_Mfirst = false;
+layout(constant_id = 12) const bool layoutR_Mfirst = false;
+layout(constant_id = 13) const uint strideAinElements = 1;
+layout(constant_id = 14) const uint strideBinElements = 1;
+layout(constant_id = 15) const uint strideCinElements = 1;
+layout(constant_id = 16) const uint strideRinElements = 1;
+
+// #defines set on compiler GLSL to SPIR-V command line:
+// A_TYPE = e.g. float or float16_t
+// R_TYPE = e.g. float or float16_t
+
+layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA;
+layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB;
+layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC;
+layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO;
+
+//layout(set=0, binding=0, std430) uniform Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params;
+
+// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; 
+
+// Very simple shader, similar to our OpenCL MxM https://github.qualcomm.com/grtrt/TCU/blob/main/Kernels/MxM_Baseline_Sections.cl
+// 
+void main()
+{
+    // Example of how to use printf, for details https://confluence.qualcomm.com/confluence/display/GCEA/Use+printf%28...%29+inside+a+Vulkan+shader+using+GLSL
+    if(gl_LocalInvocationIndex == 0)
+       debugPrintfEXT("\nRunning SPIR-V shader (QCOM version) gl_WorkGroupSize(%d, %d, %d)\n", gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
+
+    const uint32_t block_id_m = gl_GlobalInvocationID.y;
+    const uint32_t block_id_n = gl_GlobalInvocationID.z;
+    if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return;
+
+    const uint32_t row = block_id_m * TILE_M;
+    const uint32_t col = block_id_n * TILE_N;
+    
+    // Initialize result matR to zero, not using matC in this shader
+    coopmat<R_TYPE, gl_ScopeSubgroup, 64, 64, gl_MatrixUseAccumulator> matR;
+    matR = coopmat<R_TYPE, gl_ScopeSubgroup, 64, 64, gl_MatrixUseAccumulator>(0.0);
+    
+    for (uint32_t step = 0; step < TOTAL_K; step += 8)
+    {
+        // On each iteration, load a row of cooperative matrices from matrix A,
+        // load a column of cooperative matrices from matrix B, and multiply all
+        // pairs of those matrices.
+        uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step;
+        uint32_t subMatrixBStartInElements = layoutB_Kfirst ? col * strideBinElements + step : col + step * strideBinElements;
+
+        coopmat<A_TYPE, gl_ScopeSubgroup, TILE_M, TILE_K, gl_MatrixUseA> matA;
+#define NEW
+#ifdef NEW
+        uint32_t uvecA[8];
+        for (int i=0; i<8; i++)
+          uvecA[i] = floatBitsToInt(inputA.x[subMatrixAStartInElements + gl_GlobalInvocationID.x * strideAinElements + i]);
+        matA = constructCoopMatA64QCOM(uvecA, gl_Float32QCOM);
+#else
+        coopMatLoad(matA, inputA.x, subMatrixAStartInElements, strideAinElements, int(layoutA_Mfirst));
+#endif
+
+        coopmat<A_TYPE, gl_ScopeSubgroup, TILE_K, TILE_N, gl_MatrixUseB> matB;
+        coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst));
+
+        matR = coopMatMulAdd(matA, matB, matR);
+    }
+
+    // Store results
+    uint32_t subMatrixRStartInElements = layoutR_Mfirst ? col * strideRinElements + row : row * strideRinElements + col;
+
+    coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst));
+}
+)";
\ No newline at end of file
diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp
new file mode 100644
index 0000000..00f138f
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp
@@ -0,0 +1,1803 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// Cooperative matrix test harness for the cooperative_matrix sample
+///
+
+#include "cooperative_matrix_tester.hpp"
+#include "cooperative_matrix_shaders.hpp"
+#include "vulkan/extensionHelpers.hpp"
+#include "vulkan/extensionLib.hpp"
+#include <../external/glslang/glslang/Include/glslang_c_interface.h>
+#include <../external/glslang/glslang/Public/resource_limits_c.h>
+
+#pragma push_macro("BOOL")
+#define BOOL HALF_BOOL
+#include "half/half.h"
+#pragma pop_macro("BOOL")
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+// FALSE must be 0; it was previously defined as 1, which made it identical to TRUE.
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#include "imgui.h"
+
+#include <random>
+#include <iostream>
+#include <filesystem>
+#include <sstream>
+
+#define CHECK_VK(cmd) /* Evaluates cmd once and classifies the resulting VkResult. */ \
+    { /* NOTE(review): bare block, not do { } while (0) - avoid using before an else */ \
+        VkResult local_result = cmd;                                                        \
+        if(local_result == VK_SUCCESS){} /* success: nothing to report */                   \
+        else if (local_result == VK_NOT_READY || local_result == VK_TIMEOUT ||                           \
+                 local_result == VK_EVENT_SET || local_result == VK_EVENT_RESET ||               \
+                 local_result == VK_INCOMPLETE) /* these status codes only log a warning */ \
+        {                                                                             \
+            LOGW("CHECK_VK: Warning - %s returned %d", #cmd, static_cast<int>(local_result)); \
+        }                                                                             \
+        else /* any other result: log an error, then assert */                        \
+        {                                                                             \
+            LOGE("CHECK_VK: Error - %s returned %d", #cmd, static_cast<int>(local_result));  \
+            assert(false);                                                              \
+        }                                                                             \
+    }
+
+
+#define CHECK_BOOL(expr) /* Evaluates expr once and logs an error when it is false. */ \
+    {                                                                                 \
+        bool local_result = (expr);                                                         \
+        if (!local_result) /* unlike CHECK_VK, this only logs; it does not assert */         \
+        {                                                                             \
+            LOGE("CHECK_BOOL: Error - %s evaluated to false", #expr);                \
+        }                                                                             \
+    }
+
+namespace
+{
+    enum gpu_vendors // GPU vendor identifiers; NOTE(review): presumably compared against the device vendorID - confirm
+    {
+        VK_VENDOR_ID_UNKNOWN = 0, // placeholder for an unrecognised vendor
+        VK_VENDOR_ID_NVIDIA = 0x10de,
+        VK_VENDOR_ID_QUALCOMM = 0x5143,
+        VK_VENDOR_ID_AMD = 0x1002,
+        VK_VENDOR_ID_INTEL = 0x8086,
+        VK_VENDOR_ID_APPLE = 0x106b
+    };
+
+    enum gpu_tiers // Known GPU models; NOTE(review): values presumably match the device deviceID - confirm
+    {
+        TIER_UNKNOWN = 0, // placeholder for an unrecognised GPU
+        QCOM_TIER_PAKALA = 0x44050000,
+        QCOM_TIER_KAANAPALI = 0x44050A30,
+        QCOM_TIER_GLYMUR = 0x44070040,
+        QCOM_TIER_GLYMUR_TEST = 0x36334630,
+        QCOM_TIER_HAWI = 0x44051430,
+        NVIDIA_TIER_RTX2070 = 0x1F14
+    };
+
+    // Returns a printable name for a component type, looked up in a fixed table;
+    // types that are not in the table yield "UNKNOWN TYPE".
+    const char* GetMatrixTypeName(VkComponentTypeKHR component_type)
+    {
+        static const struct { VkComponentTypeKHR type; const char* name; } kTypeNames[] =
+        {
+            { VK_COMPONENT_TYPE_FLOAT16_KHR, "FLOAT16" }, { VK_COMPONENT_TYPE_FLOAT32_KHR, "FLOAT32" },
+            { VK_COMPONENT_TYPE_FLOAT64_KHR, "FLOAT64" }, { VK_COMPONENT_TYPE_SINT8_KHR, "SINT8" },
+            { VK_COMPONENT_TYPE_SINT16_KHR, "SINT16" }, { VK_COMPONENT_TYPE_SINT32_KHR, "SINT32" },
+            { VK_COMPONENT_TYPE_SINT64_KHR, "SINT64" }, { VK_COMPONENT_TYPE_UINT8_KHR, "UINT8" },
+            { VK_COMPONENT_TYPE_UINT16_KHR, "UINT16" }, { VK_COMPONENT_TYPE_UINT32_KHR, "UINT32" },
+            { VK_COMPONENT_TYPE_UINT64_KHR, "UINT64" }, { VK_COMPONENT_TYPE_BFLOAT16_KHR, "BFLOAT16" },
+            { VK_COMPONENT_TYPE_SINT8_PACKED_NV, "SINT8_PACKED" }, { VK_COMPONENT_TYPE_UINT8_PACKED_NV, "UINT8_PACKED" },
+            { VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT, "FLOAT8_E4M3" }, { VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT, "FLOAT8_E5M2" },
+        };
+
+        for (const auto& entry : kTypeNames)
+        {
+            if (entry.type == component_type)
+                return entry.name;
+        }
+        return "UNKNOWN TYPE";
+    }
+
+    // Returns the short name (FP*/INT*) of a float or signed-integer component type, else "UNKNOWN".
+    const char* GetMatrixComponentTypeName(VkComponentTypeKHR type)
+    {
+        static const struct { VkComponentTypeKHR type; const char* name; } kShortNames[] =
+        {
+            { VK_COMPONENT_TYPE_FLOAT64_KHR, "FP64" }, { VK_COMPONENT_TYPE_FLOAT32_KHR, "FP32" },
+            { VK_COMPONENT_TYPE_FLOAT16_KHR, "FP16" }, { VK_COMPONENT_TYPE_SINT8_KHR, "INT8" },
+            { VK_COMPONENT_TYPE_SINT16_KHR, "INT16" }, { VK_COMPONENT_TYPE_SINT32_KHR, "INT32" },
+            { VK_COMPONENT_TYPE_SINT64_KHR, "INT64" },
+        };
+        for (const auto& entry : kShortNames)
+            if (entry.type == type) return entry.name;
+        return "UNKNOWN";
+    }
+
+    // Copies the first entry whose A/B/C/Result types and M/N/K sizes match into
+    // cooperativeMatrixProps and returns true; returns false (output untouched) otherwise.
+    bool FindMatrixProperty(
+        std::span<VkCooperativeMatrixPropertiesKHR> cooperativeMatrixProperties,
+        VkCooperativeMatrixPropertiesKHR &cooperativeMatrixProps,
+        uint32_t MSize, uint32_t NSize, uint32_t KSize,
+        VkComponentTypeKHR AType, VkComponentTypeKHR BType,
+        VkComponentTypeKHR CType, VkComponentTypeKHR RType)
+    {
+        // A requested size of 0 accepts any size.
+        const auto sizeAccepted = [](uint32_t requested, uint32_t actual)
+        {
+            return requested == 0 || requested == actual;
+        };
+        for (const VkCooperativeMatrixPropertiesKHR& candidate : cooperativeMatrixProperties)
+        {
+            const bool typesMatch = candidate.ResultType == RType &&
+                                    candidate.CType == CType &&
+                                    candidate.BType == BType &&
+                                    candidate.AType == AType;
+            if (typesMatch &&
+                sizeAccepted(MSize, candidate.MSize) &&
+                sizeAccepted(NSize, candidate.NSize) &&
+                sizeAccepted(KSize, candidate.KSize))
+            {
+                cooperativeMatrixProps = candidate;
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    static const char* ShaderPaths[] // despite the name, holds the embedded GLSL source strings, not file paths
+    { // NOTE(review): presumably indexed by TestType - confirm the order against that enum
+        Test01_MxM_Basic,
+        Test02_MxM_VecToMat,
+        Test03_CONV,
+    };
+
+    // Parameters of a single cooperative matrix test: which test to run, the component
+    // types involved, the problem and tile sizes, and the layout/stride of each matrix.
+    struct TestCase
+    {
+        // Test selector and component types.
+        // NOTE(review): inputType presumably applies to A and B, outputType to C and R - confirm.
+        TestType testType;
+        VkComponentTypeKHR inputType, outputType;
+
+        // Size of the full R = A x B + C matrix multiply.
+        uint32_t TOTAL_M, TOTAL_N, TOTAL_K;
+
+        // Size of each cooperative matrix multiply:
+        //   R[TILE_M, TILE_N] = A[TILE_M, TILE_K] x B[TILE_K, TILE_N] + C[TILE_M, TILE_N]
+        uint32_t TILE_M, TILE_N, TILE_K;
+
+        // Layout flag of each matrix (A, B, C and the result R).
+        // NOTE(review): presumably forwarded to the shader constants of the same names - confirm.
+        bool layoutA_Mfirst, layoutB_Kfirst;
+        bool layoutC_Mfirst, layoutR_Mfirst;
+
+        // Stride of each matrix (A, B, C and the result R), counted in elements.
+        // NOTE(review): presumably forwarded to the shader constants of the same names - confirm.
+        uint32_t strideAinElements, strideBinElements;
+        uint32_t strideCinElements, strideRinElements;
+    };
+
+    struct sComponentTypeInfo // name and bit width of one component type
+    {
+        const char* typeName; // lower-case type name, e.g. "float16"
+        uint32_t bits;        // width of one element in bits
+    };
+    // Indexed by VkComponentTypeKHR value; only the values 0..10 listed below are covered.
+    struct sComponentTypeInfo ComponentTypeInfo[] =
+    {                       // From vulkan_core.h
+        { "float16",  16 }, // VK_COMPONENT_TYPE_FLOAT16_KHR = 0,
+        { "float32",  32 }, // VK_COMPONENT_TYPE_FLOAT32_KHR = 1,
+        { "float64",  64 }, // VK_COMPONENT_TYPE_FLOAT64_KHR = 2,
+        { "int8",     8 },  // VK_COMPONENT_TYPE_SINT8_KHR = 3,
+        { "int16",    16 }, // VK_COMPONENT_TYPE_SINT16_KHR = 4,
+        { "int32",    32 }, // VK_COMPONENT_TYPE_SINT32_KHR = 5,
+        { "int64",    64 }, // VK_COMPONENT_TYPE_SINT64_KHR = 6,
+        { "uint8",    8 },  // VK_COMPONENT_TYPE_UINT8_KHR = 7,
+        { "uint16",   16 }, // VK_COMPONENT_TYPE_UINT16_KHR = 8,
+        { "uint32",   32 }, // VK_COMPONENT_TYPE_UINT32_KHR = 9,
+        { "uint64",   64 }, // VK_COMPONENT_TYPE_UINT64_KHR = 10,
+    };
+
+    const char* scopeString[] = { // NOTE(review): presumably indexed by VkScopeKHR value - confirm
+        "invalid",     // index 0
+        "device",      // index 1
+        "workgroup",   // index 2
+        "subgroup",    // index 3
+        "invalid",     // index 4
+        "queuefamily", // index 5
+    };
+
+    // Describes one matrix operand: shape, element type, sizes, the Vulkan buffers/memory
+    // backing it, and typed accessors over the storage that ptr points to.
+    struct MatrixDesc
+    {
+        struct { uint32_t rows, cols; } dims;
+        VkComponentTypeKHR dataType;
+        size_t elementSize;
+        VkDeviceSize bufferSize;
+        uint32_t totalElements;
+
+        // Create a host- and device-local buffer for each input and output.
+        // Descriptors point at the device buffers.
+        VkBuffer hostBuffer;
+        VkDeviceMemory hostMemory;
+        VkBuffer deviceBuffer;
+        VkDeviceMemory deviceMemory;
+        void* ptr; // element storage used by the accessors below
+
+        // True when dataType is one of the FLOAT16/FLOAT32/FLOAT64 component types.
+        bool isFloatType() const
+        {
+            return dataType == VK_COMPONENT_TYPE_FLOAT16_KHR ||
+                   dataType == VK_COMPONENT_TYPE_FLOAT32_KHR ||
+                   dataType == VK_COMPONENT_TYPE_FLOAT64_KHR;
+        }
+
+        // Stores value at element i: verbatim for FLOAT32, otherwise (any other dataType)
+        // converted to IEEE 754 binary16 with the mantissa truncated. Signed zeros, underflow
+        // (to subnormal/zero), overflow (to infinity) and NaN are encoded correctly.
+        void setDataFloat(uint32_t i, float value)
+        {
+            if (dataType == VK_COMPONENT_TYPE_FLOAT32_KHR) { ((float*)ptr)[i] = value; return; }
+            uint32_t bits;
+            std::memcpy(&bits, &value, sizeof(bits)); // bit copy without aliasing violations
+            const uint32_t sign     = (bits >> 16) & 0x8000u;
+            const uint32_t expField = (bits >> 23) & 0xFFu;
+            const uint32_t mantissa = bits & 0x7FFFFFu;
+            const int32_t  halfExp  = int32_t(expField) - 127 + 15; // exponent re-biased for binary16
+            uint32_t half;
+            if (expField == 0xFFu)      // infinity or NaN (set a mantissa bit so NaN stays NaN)
+                half = sign | 0x7C00u | (mantissa != 0 ? (0x200u | (mantissa >> 13)) : 0u);
+            else if (halfExp >= 31)     // too large for binary16: infinity
+                half = sign | 0x7C00u;
+            else if (halfExp >= 1)      // normal binary16 value
+                half = sign | (uint32_t(halfExp) << 10) | (mantissa >> 13);
+            else if (halfExp >= -10)    // subnormal binary16: shift in the implicit leading 1
+                half = sign | ((mantissa | 0x800000u) >> (14 - halfExp));
+            else                        // zero, binary32 subnormal or underflow: signed zero
+                half = sign;
+            ((uint16_t*)ptr)[i] = uint16_t(half);
+        }
+
+        // Reads element i as a float: verbatim for FLOAT32, otherwise decoded from IEEE 754
+        // binary16, including signed zeros, subnormals, infinities and NaNs.
+        float getDataFloat(uint32_t i) const
+        {
+            if (dataType == VK_COMPONENT_TYPE_FLOAT32_KHR) return ((float*)ptr)[i];
+            const uint32_t half = ((uint16_t*)ptr)[i];
+            const uint32_t sign = (half & 0x8000u) << 16;
+            uint32_t expField   = (half >> 10) & 0x1Fu;
+            uint32_t mantissa   = half & 0x3FFu;
+            uint32_t bits;
+            if (expField == 0x1Fu)      // infinity or NaN
+                bits = sign | 0x7F800000u | (mantissa << 13);
+            else if (expField != 0)     // normal value: re-bias the exponent for binary32
+                bits = sign | ((expField - 15 + 127) << 23) | (mantissa << 13);
+            else if (mantissa == 0)     // signed zero
+                bits = sign;
+            else                        // subnormal binary16: renormalise into binary32
+            {
+                expField = 127 - 15 + 1;
+                while ((mantissa & 0x400u) == 0) { mantissa <<= 1; --expField; }
+                bits = sign | (expField << 23) | ((mantissa & 0x3FFu) << 13);
+            }
+            float result;
+            std::memcpy(&result, &bits, sizeof(result));
+            return result;
+        }
+
+        // Reads element (m, n) as a float; colMajor selects n * rows + m instead of m * cols + n.
+        float getDataFloat(int m, int n, bool colMajor) const
+        {
+            return getDataFloat(colMajor ? (n * dims.rows + m) : (m * dims.cols + n));
+        }
+
+        // Stores value at element i, narrowed to dataType; only 8- and 32-bit integer types are accepted.
+        void setDataInt(uint32_t i, uint32_t value)
+        {
+            assert(ComponentTypeInfo[dataType].bits == 8 || ComponentTypeInfo[dataType].bits == 32);
+            switch (dataType) {
+            default: assert(0); // fallthrough
+            case VK_COMPONENT_TYPE_UINT8_KHR:    ((uint8_t*)ptr)[i] = (uint8_t)value; break;
+            case VK_COMPONENT_TYPE_UINT32_KHR:   ((uint32_t*)ptr)[i] = (uint32_t)value; break;
+            case VK_COMPONENT_TYPE_SINT8_KHR:    ((int8_t*)ptr)[i] = (int8_t)value; break;
+            case VK_COMPONENT_TYPE_SINT32_KHR:   ((int32_t*)ptr)[i] = (int32_t)value; break;
+            }
+        }
+
+        // Reads element i converted to uint32_t; negative signed elements wrap modulo 2^32.
+        uint32_t getDataInt(uint32_t i) const
+        {
+            assert(ComponentTypeInfo[dataType].bits == 8 || ComponentTypeInfo[dataType].bits == 32);
+            switch (dataType) {
+            default: assert(0); // fallthrough
+            case VK_COMPONENT_TYPE_UINT8_KHR:	return ((uint8_t*)ptr)[i];
+            case VK_COMPONENT_TYPE_UINT32_KHR:	return ((uint32_t*)ptr)[i];
+            case VK_COMPONENT_TYPE_SINT8_KHR:	return ((int8_t*)ptr)[i];
+            case VK_COMPONENT_TYPE_SINT32_KHR:	return ((int32_t*)ptr)[i];
+            }
+        }
+
+        uint32_t getDataInt(int m, int n, bool colMajor) const // (m, n) variant, indexed like getDataFloat
+        {
+            return getDataInt(colMajor ? (n * dims.rows + m) : (m * dims.cols + n));
+        }
+    };
+
+
+    // Fills `matrix`, a buffer of `mrows` rows spaced `stride` elements apart,
+    // with values chosen by `init`.
+    //
+    // - Only the first `mcols` elements of each row are generated; the remaining
+    //   `stride - mcols` elements of every row are written as T(0).
+    // - Each generated value is rounded to `set_num_decimals` decimals (through
+    //   fixed-point text formatting) before being converted to T.
+    // - Results are memoized per T in a function-local static map keyed by
+    //   (mrows, mcols, stride, init, set_num_decimals): a repeated call with the
+    //   same key copies the stored data and does not draw new random numbers.
+    // - The function keeps static state (cache, seed, sequence counter) without
+    //   any synchronization and reseeds std::rand() on every cache miss.
+    //
+    // `matrix` must have room for mrows * stride elements.
+    template<typename T>
+    void InitMatrix(T* matrix, unsigned int mrows, unsigned int mcols, unsigned int stride, FillDataType init, unsigned int set_num_decimals=2)
+    {
+        // Cache key: every argument that influences the generated contents.
+        struct MatrixKey
+        {
+            unsigned int mrows;
+            unsigned int mcols;
+            unsigned int stride;
+            FillDataType init;
+            unsigned int set_num_decimals;
+
+            bool operator==(const MatrixKey& other) const
+            {
+                return mrows == other.mrows &&
+                       mcols == other.mcols &&
+                       stride == other.stride &&
+                       init == other.init &&
+                       set_num_decimals == other.set_num_decimals;
+            }
+        };
+
+        struct MatrixKeyHasher
+        {
+            std::size_t operator()(const MatrixKey& key) const
+            {
+                std::size_t h1 = std::hash<unsigned int>{}(key.mrows);
+                std::size_t h2 = std::hash<unsigned int>{}(key.mcols);
+                std::size_t h3 = std::hash<unsigned int>{}(key.stride);
+                std::size_t h4 = std::hash<int>{}(static_cast<int>(key.init));
+                std::size_t h5 = std::hash<unsigned int>{}(key.set_num_decimals);
+                return h1 ^ (h2 << 1) ^ (h3 << 2) ^ (h4 << 3) ^ (h5 << 4);
+            }
+        };
+
+        static std::unordered_map<MatrixKey, std::vector<T>, MatrixKeyHasher> cache;
+
+        // Widen before multiplying so large matrices cannot wrap 32-bit arithmetic.
+        const std::size_t element_count = static_cast<std::size_t>(mrows) * stride;
+        const std::size_t byte_count    = element_count * sizeof(T);
+
+        const MatrixKey key{ mrows, mcols, stride, init, set_num_decimals };
+
+        const auto cached = cache.find(key);
+        if (cached != cache.end())
+        {
+            if (byte_count > 0)
+            {
+                std::memcpy(matrix, cached->second.data(), byte_count);
+            }
+            return;
+        }
+
+        // Zero-initialized staging buffer: everything that is not explicitly
+        // generated below (row padding up to `stride`) stays T(0). The whole
+        // buffer is copied into `matrix` at the end, so no separate clear of
+        // `matrix` is needed.
+        std::vector<T> temp_matrix(element_count, T(0));
+
+        const float flow       = 0.0f;
+        const float fhigh      = 1.0f;
+        const int   range      = 3; // 3 -> -1, 0 and 1
+        const float const_init = 1.0f;
+        static int  counter    = 0; // shared by every FILL_SEQUENCE_INT call
+        // Kept small on purpose; the original note says Float16 can represent
+        // integers exactly up to 2048, so 2048 would be the upper bound here.
+        int sequence = 3;
+        if (sizeof(T) == 1) sequence = 255; // 256 is too simple of a sequence
+
+        static unsigned seed = 3;
+        std::srand(seed++); // srand seed doesn't work with time(0)
+        std::cout << "Initializing ROWxCOL=" << mrows << "x" << mcols << " matrix (stride=" << stride << ") with init option = " << init << " and using " << set_num_decimals << " number of decimals\n";
+
+        // A row holds at most `stride` elements; generating more columns than
+        // that would write past the end of the staging buffer.
+        unsigned int fill_cols = mcols;
+        if (fill_cols > stride)
+        {
+            LOGE("InitMatrix(...) mcols (%u) exceeds stride (%u); only %u columns are filled", mcols, stride, stride);
+            fill_cols = stride;
+        }
+
+        // One formatter reused for every element: the precision and fixed flags
+        // are sticky, only the text buffer is reset per value.
+        std::ostringstream formatter;
+        formatter << std::setprecision(set_num_decimals) << std::fixed;
+
+        bool reported_invalid_init = false;
+
+        for (unsigned int row = 0; row < mrows; row++) // y
+        {
+            for (unsigned int col = 0; col < fill_cols; col++) // x
+            {
+                float r = 0.0f;
+
+                switch (init)
+                {
+                case FILL_WITH_ZERO:
+                    r = 0;
+                    break;
+                case FILL_WITH_CONSTANTS:
+                    r = const_init; // default const_init=1.0f
+                    break;
+                case FILL_WITH_RANDON_UINT:
+                    r = float(std::rand() % range); // default range=3 -> values 0, 1, 2
+                    break;
+                case FILL_WITH_RANDON_INT:
+                    // default range=3 -> values -1, 0 and 1, which average to 0 and keep
+                    // float16 dot products from going out of range
+                    r = float(std::rand() % range - ((range - 1) / 2));
+                    break;
+                case FILL_SEQUENCE_INT:
+                    r = float(counter++ % sequence);
+                    break;
+                case FILL_WITH_RANDOM_LOW_HIGH_INT:
+                    // NOTE(review): with flow=0 and fhigh=1 this is always 0 - confirm intent
+                    r = T(std::rand() % int(fhigh)) + int(flow);
+                    break;
+                case FILL_WITH_RANDOM_FLOAT:
+                    r = flow + float(rand()) / ((float(RAND_MAX) / (fhigh - flow)));
+                    break;
+                case FILL_WITH_RANDOM_PLUS1_MINUS1_FLOAT:
+                    r = rand() > RAND_MAX/2 ? float(1.0) : float(-1.0);
+                    break;
+                default:
+                    // Report the bad option once instead of once per element, and fall
+                    // back to 0 so no uninitialized value is ever stored.
+                    if (!reported_invalid_init)
+                    {
+                        LOGE("Invalid InitMatrix(...) initialization option '-i:%d'", static_cast<int>(init));
+                        reported_invalid_init = true;
+                    }
+                    r = 0.0f;
+                    break;
+                }
+
+                // Force to fixed number of decimals based on user input
+                formatter.str(std::string());
+                formatter.clear();
+                formatter << r;
+                const float rounded = std::stof(formatter.str());
+
+                // load the matrix
+                temp_matrix[static_cast<std::size_t>(row) * stride + col] = T(rounded);
+            }
+        }
+
+        if (byte_count > 0)
+        {
+            std::memcpy(matrix, temp_matrix.data(), byte_count);
+        }
+        cache.emplace(key, std::move(temp_matrix));
+    }
+
+    // Transposes, in place, the mrows x mcols row-major matrix stored
+    // contiguously at `matrix`. No scratch buffer is used: each element is moved
+    // into position with a one-step std::rotate over a sub-range, so the work
+    // grows much faster than the element count. `info` is only echoed in the
+    // progress message.
+    template<typename T>
+    void TransposeMatrix(T* matrix, const unsigned int& mrows, const unsigned int& mcols, const char *info)
+    {
+        std::cout << "\nTransposing MxM(" << info << ") on CPU, input type '" << typeid(matrix).name() << "', number of rows: '" << mrows << "', number of columns: '" << mcols << "', IT'LL TAKE SOME TIME!!!\n\n";
+
+        const unsigned int total = mcols * mrows;
+
+        for (unsigned int col = 0; col < mcols; ++col)
+        {
+            // How much the rotated span grows from one row to the next.
+            const unsigned int span_growth = mcols - col - 1;
+            unsigned int span = 1;
+
+            for (unsigned int row = 0; row < mrows; ++row)
+            {
+                const unsigned int range_end = total - (row + col * mrows);
+                T* const range_begin = matrix + (range_end - span);
+
+                // Rotate left by one: the first element of the span moves to its end.
+                std::rotate(range_begin, range_begin + 1, matrix + range_end);
+
+                span += span_growth;
+            }
+        }
+
+        std::cout << "\nFinished Transposing MxM on CPU\n";
+    }
+
+    // Writes the transpose of the mrows x mcols row-major matrix `matrix` into
+    // `matrixOut`, which then holds mcols rows of mrows elements each.
+    // `matrixOut` must have room for mrows * mcols elements and must not overlap
+    // `matrix`: elements are copied one by one, so an aliased output would be
+    // read after it has been overwritten.
+    template<typename T>
+    void TransposeMatrix(T* matrix, const unsigned int& mrows, const unsigned int& mcols, T* matrixOut)
+    {
+        std::cout << "\nTransposing MxM on CPU, input type '" << typeid(matrix).name() << "', output type '" << typeid(matrixOut).name() << "', number of rows: '" << mrows << "', number of columns: '" << mcols << "', IT'LL TAKE SOME TIME!!!\n\n";
+
+        // Index with size_t so row * cols cannot wrap 32-bit arithmetic on large matrices.
+        const std::size_t rows = mrows;
+        const std::size_t cols = mcols;
+
+        for (std::size_t col = 0; col < cols; col++)
+        {
+            for (std::size_t row = 0; row < rows; row++)
+            {
+                matrixOut[col * rows + row] = matrix[row * cols + col];
+            }
+        }
+
+        std::cout << "\nFinished Transposing MxM on CPU\n";
+    }
+}
+
+// Binds the runner to `vulkan_instance` (kept by reference) and initializes
+// glslang's process-wide state; the destructor performs the matching finalize.
+CooperativeMatrixRunner::CooperativeMatrixRunner(Vulkan& vulkan_instance)
+    : m_vulkan_instance(vulkan_instance)
+{
+    // glslang reports success with a non-zero value. A constructor cannot return
+    // an error, so at least make the failure visible in the log.
+    if (!glslang_initialize_process())
+    {
+        LOGE("glslang_initialize_process() failed");
+    }
+}
+
+// Finalizes glslang's process-wide state, pairing with the
+// glslang_initialize_process() call made by the constructor.
+CooperativeMatrixRunner::~CooperativeMatrixRunner()
+{
+    glslang_finalize_process();
+}
+
+// Checks that VK_KHR_cooperative_matrix is available, queries and logs the
+// cooperative matrix configurations reported by the physical device (stored in
+// m_hFoundCooperativeMatrices), and registers the test group templates used to
+// build test sessions.
+//
+// Returns false when the device extension is not loaded or its wrapper object
+// cannot be resolved; true otherwise (including when the device reports zero
+// configurations).
+bool CooperativeMatrixRunner::InitializeRunner()
+{
+    if (!m_vulkan_instance.HasLoadedVulkanDeviceExtension(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME))
+    {
+        LOGE("Required Extension not supported %s", VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME);
+        LOGE("Platform does not support Cooperative Matrices. Cannot test.\n");
+
+        return false;
+    }
+
+    auto cooperativeMatrixEXT = m_vulkan_instance.GetExtension<ExtensionLib::Ext_VK_KHR_cooperative_matrix>();
+    if (!cooperativeMatrixEXT)
+    {
+        LOGE("Ext_VK_KHR_cooperative_matrix potentially unresolved!");
+
+        // The property queries below go through this pointer; continuing would
+        // dereference null.
+        return false;
+    }
+
+    // select supported cooperative matrix types/sizes
+    // First call: ask only for the number of available configurations.
+    uint32_t nCoopMatrixPropCount = 0;
+    CHECK_VK(cooperativeMatrixEXT->m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(
+        m_vulkan_instance.m_VulkanGpu,
+        &nCoopMatrixPropCount,
+        NULL
+    ));
+
+    m_hFoundCooperativeMatrices.resize(nCoopMatrixPropCount);
+    for (auto& matrixProp : m_hFoundCooperativeMatrices)
+    {
+        matrixProp.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+        matrixProp.pNext = nullptr;
+    }
+
+    // Second call: fetch the entries. Skipped when there is nothing to fetch,
+    // which also avoids taking the address of an element of an empty vector.
+    if (nCoopMatrixPropCount > 0)
+    {
+        CHECK_VK(cooperativeMatrixEXT->m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(
+                m_vulkan_instance.m_VulkanGpu,
+                &nCoopMatrixPropCount,
+                m_hFoundCooperativeMatrices.data()
+        ));
+
+        // The count is updated to the number of entries actually written; drop
+        // any trailing entries that were not filled in.
+        m_hFoundCooperativeMatrices.resize(nCoopMatrixPropCount);
+    }
+
+    LOGI("Found Cooperative Matrices:\n");
+    for (auto& cm : m_hFoundCooperativeMatrices)
+    {
+        LOGI("\tMxNxK: %ux%ux%u\n", cm.MSize, cm.NSize, cm.KSize);
+        LOGI("\tA: %s | ", GetMatrixTypeName(cm.AType));
+        LOGI("B: %s | ", GetMatrixTypeName(cm.BType));
+        LOGI("C: %s | ", GetMatrixTypeName(cm.CType));
+        LOGI("D: %s\n",  GetMatrixTypeName(cm.ResultType));
+        LOGI("\tSaturating Accumulation: %u | Scope: %u\n\n", cm.saturatingAccumulation, cm.scope);
+    }
+
+    // Setup the test templates: one group per input/output component type pair,
+    // each with the same three size configurations.
+
+    m_test_group_templates.push_back(TestGroupTemplateDescription{
+        VK_COMPONENT_TYPE_FLOAT32_KHR ,
+        VK_COMPONENT_TYPE_FLOAT32_KHR ,
+        {
+            {8, 6, 128, // SizeInBlocks
+             0, 64, 0}, // Size (tile)
+
+            {8, 12, 128,
+             0, 32, 0},
+
+            {8, 24, 128,
+             0, 16, 0}
+        } });
+
+    m_test_group_templates.push_back(TestGroupTemplateDescription{
+        VK_COMPONENT_TYPE_FLOAT16_KHR ,
+        VK_COMPONENT_TYPE_FLOAT16_KHR ,
+        {
+            {8, 6, 128, // SizeInBlocks
+             0, 64, 0}, // Size (tile)
+
+            {8, 12, 128,
+             0, 32, 0},
+
+            {8, 24, 128,
+             0, 16, 0}
+        } });
+
+    m_test_group_templates.push_back(TestGroupTemplateDescription{
+        VK_COMPONENT_TYPE_SINT8_KHR ,
+        VK_COMPONENT_TYPE_SINT32_KHR ,
+        {
+            {8, 6, 128, // SizeInBlocks
+             0, 64, 0}, // Size (tile)
+
+            {8, 12, 128,
+             0, 32, 0},
+
+            {8, 24, 128,
+             0, 16, 0}
+        } });
+
+    return true;
+}
+
+// Advances the current test session by at most one table row per call.
+//
+// While a session is active, the first test entry whose result count differs
+// from its description count has all of its descriptions run through RunTest();
+// a run that yields no result is recorded as a default-constructed TestResult.
+// The call then returns so the UI can show progress. Once no such entry is
+// left, the session is marked as finished. Always returns true.
+bool CooperativeMatrixRunner::TriggerPendingTests()
+{
+    if (m_is_processing_tests)
+    {
+        for (auto& group : m_test_groups)
+        {
+            for (auto& entry : group.test_entries)
+            {
+                // Entries with one result per description are already done.
+                if (entry.test_descriptions.size() == entry.test_results.size())
+                {
+                    continue;
+                }
+
+                for (const auto& description : entry.test_descriptions)
+                {
+                    const auto outcome = RunTest(description);
+                    entry.test_results.push_back(outcome.has_value() ? *outcome : TestResult());
+
+                    ++m_total_processed_tests;
+                }
+
+                // Process a single test entry per frame (so we can display progress on the UI)
+                return true;
+            }
+        }
+
+        // Nothing left to run.
+        m_is_processing_tests = false;
+    }
+
+    return true;
+}
+
+// Draws the runner's ImGui panel:
+//   - "Test Configuration": repeat count, test case, fill data type and the
+//     per-matrix transpose options,
+//   - "Device Configuration": the GPU figures edited through m_gpu_* members,
+//   - the "Run Tests" button, which starts a session via PrepareTestSession(),
+//   - a progress line while a session is running,
+//   - one collapsible result table per test group.
+// All inputs are disabled while a session is running; only the progress line is
+// drawn enabled.
+void CooperativeMatrixRunner::RenderUI()
+{
+    // Maps a 0..1 ratio to a red -> yellow -> green text color.
+    const auto GetPercentageColor = [](float value) -> ImVec4
+    {
+        value = std::clamp(value, 0.0f, 1.0f);
+
+        if (value < 0.5f)
+        {
+            const float t = value / 0.5f;
+            return ImVec4(1.0f, t, 0.0f, 1.0f);
+        }
+
+        const float t = (value - 0.5f) / 0.5f;
+        return ImVec4(1.0f - t, 1.0f, 0.0f, 1.0f);
+    };
+
+    // Captured once: "Run Tests" may flip m_is_processing_tests mid-frame, and
+    // the Begin/EndDisabled calls below must stay balanced with the same value.
+    const bool disable_ui = m_is_processing_tests;
+    ImGui::BeginDisabled(disable_ui);
+    ImGui::BeginGroup();
+
+    if (ImGui::CollapsingHeader("Test Configuration", ImGuiTreeNodeFlags_DefaultOpen))
+    {
+        ImGui::DragInt("Test Repeats", &m_test_repeats, 1.0f, 0, 100);
+
+        // NOTE: Validation (and its transpose option) will be added in a future patch,
+        // so these checkboxes are shown but always disabled.
+        ImGui::BeginDisabled();
+        if (m_validate_matrix_result)
+        {
+            ImGui::BeginDisabled();
+            static bool always_true = true;
+            ImGui::Checkbox("Transpose When Needed", &always_true);
+            ImGui::EndDisabled();
+        }
+        else
+        {
+            ImGui::Checkbox("Transpose When Needed", &m_transpose_when_needed);
+        }
+
+        ImGui::Checkbox("Validate Result", &m_validate_matrix_result);
+        ImGui::EndDisabled();
+
+        // Indexed by the TestType value; must list one name per test type.
+        static const char* test_case_names[] = {
+            "MxM Basic",
+            "MxM Vector To Matrix",
+            "CONV",
+        };
+
+        const int test_type_current_index = static_cast<int>(m_test_type);
+
+        if (ImGui::BeginCombo("Test Case", test_case_names[test_type_current_index]))
+        {
+            for (int i = 0; i < static_cast<int>(TestType::TT_COUNT); ++i)
+            {
+                const bool is_selected = (test_type_current_index == i);
+                if (ImGui::Selectable(test_case_names[i], is_selected))
+                {
+                    m_test_type = static_cast<TestType>(i);
+                }
+
+                if (is_selected)
+                    ImGui::SetItemDefaultFocus();
+            }
+            ImGui::EndCombo();
+        }
+
+        ImGui::Separator();
+
+        // Indexed by the FillDataType value.
+        static const char* fill_type_labels[] = {
+                "Fill with Zero",
+                "Fill with Constants",
+                "Fill with Random UInt",
+                "Fill with Random Int",
+                "Fill Sequence Int",
+                "Fill with Random Low/High Int",
+                "Fill with Random Float",
+                "Fill with Random +/-1 Float"
+        };
+
+        int fill_data_current_index = static_cast<int>(m_fill_data_type);
+
+        if (ImGui::Combo("Fill Data Type", &fill_data_current_index, fill_type_labels, IM_ARRAYSIZE(fill_type_labels)))
+        {
+            m_fill_data_type = static_cast<FillDataType>(fill_data_current_index);
+        }
+
+        ImGui::Separator();
+
+        // Indexed by the MatrixTransposeOption value / by matrix slot.
+        static const char* option_labels[] = { "True", "False", "Variable" };
+        static const char* matrix_labels[] = { "A", "B", "C", "R"};
+
+        for (std::size_t i = 0; i < NUM_MATS; ++i)
+        {
+            int current_index = static_cast<int>(m_matrix_transpose_options[i]);
+
+            char label[32];
+            std::snprintf(label, sizeof(label), "Transpose Matrix %s", matrix_labels[i]);
+
+            if (ImGui::Combo(label, &current_index, option_labels, IM_ARRAYSIZE(option_labels)))
+            {
+                m_matrix_transpose_options[i] = static_cast<MatrixTransposeOption>(current_index);
+            }
+        }
+    }
+
+    if (ImGui::CollapsingHeader("Device Configuration", 0))
+    {
+        ImGui::Text("Default values for Pakala [SM8750][Adreno830] - Change as needed");
+        ImGui::DragInt("GPU Frequency MHz", &m_gpu_freq_MHz, 1.0f, 0, 999999);
+        ImGui::DragInt("GPU Micro SP", &m_gpu_microSP, 1.0f, 0, 999999);
+        ImGui::DragInt("GPU ALU per Micro SP", &m_gpu_ALU_per_microSP, 1.0f, 0, 999999);
+        ImGui::DragInt("GPU OPs per MAD", &m_gpu_ops_per_mad, 1.0f, 0, 999999);
+    }
+
+    ImGui::Separator();
+
+    if (ImGui::Button("Run Tests"))
+    {
+        PrepareTestSession();
+    }
+
+    ImGui::Text("For accurate values, make sure you are using the right device configurations (check 'Device Configuration' tab)");
+
+    if (m_is_processing_tests)
+    {
+        // Temporarily leave the disabled scope so the progress line is drawn
+        // with normal (non-greyed) styling, then re-enter it.
+        ImGui::SameLine();
+        ImGui::EndDisabled();
+        ImGui::Text("Processing Test [%d] of [%d]", m_total_processed_tests, m_total_tests);
+        ImGui::SameLine();
+        // Clamp the divisor to at least 1: a session with no tests would
+        // otherwise produce 0/0 (NaN) as the progress fraction.
+        ImGui::ProgressBar(static_cast<float>(m_total_processed_tests) / static_cast<float>(std::max(1u, m_total_tests)));
+        ImGui::BeginDisabled(disable_ui);
+    }
+
+    const int test_group_count = static_cast<int>(m_test_groups.size());
+    for (int i = 0; i < test_group_count; i++)
+    {
+        const auto& test_group = m_test_groups[i];
+
+        // Quick table exit: hide the group when its last entry already has
+        // results and none of them is valid/supported.
+        if (!test_group.test_entries.empty() && !test_group.test_entries.back().test_results.empty())
+        {
+            bool is_any_result_valid = false;
+            for (const auto& test_result : test_group.test_entries.back().test_results)
+            {
+                is_any_result_valid |= test_result.is_valid;
+            }
+
+            if (!is_any_result_valid)
+            {
+                continue;
+            }
+        }
+
+        std::string collapsing_header_title = std::string("Test #").append(std::to_string(i)) +
+            std::string(" - ") + GetMatrixComponentTypeName(test_group.template_description.input_type) +
+            std::string(" input / ") +
+            GetMatrixComponentTypeName(test_group.template_description.output_type) +
+            std::string(" output");
+
+        const bool show_matrix_d = false;
+
+        // One column per displayed matrix layout plus one per size configuration
+        // of this group's template.
+        const int layout_column_count = static_cast<int>(NUM_MATS) - (show_matrix_d ? 0 : 1);
+        const int size_column_count   = static_cast<int>(test_group.template_description.size_configurations.size());
+
+        // Every group uses the same child-window and table labels; scope their
+        // IDs by group index so several open groups do not share ImGui state.
+        ImGui::PushID(i);
+
+        if (ImGui::CollapsingHeader(collapsing_header_title.c_str()))
+        {
+            // Temporarily widen the scrollbar for the result table; restored below.
+            ImGuiStyle& style                   = ImGui::GetStyle();
+            const float original_scrollbar_size = style.ScrollbarSize;
+            style.ScrollbarSize                 = 40.0f;
+
+            ImGui::BeginChild("##test_results");
+            if (ImGui::BeginTable("TestResultTable", layout_column_count + size_column_count, ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg | ImGuiTableFlags_Resizable))
+            {
+                ImGui::TableSetupColumn("A", ImGuiTableColumnFlags_WidthFixed, 100.0f);
+                ImGui::TableSetupColumn("B", ImGuiTableColumnFlags_WidthFixed, 100.0f);
+                ImGui::TableSetupColumn("C", ImGuiTableColumnFlags_WidthFixed, 100.0f);
+
+                if (show_matrix_d)
+                {
+                    ImGui::TableSetupColumn("D", ImGuiTableColumnFlags_WidthFixed, 100.0f);
+                }
+
+                for (const auto& size_configuration : test_group.template_description.size_configurations)
+                {
+                    ImGui::TableSetupColumn(("NSize=" + std::to_string(size_configuration.NSize)).c_str());
+                    // TODO: Should we print the rest of the size config?
+                }
+
+                ImGui::TableHeadersRow();
+
+                // Each test entry will be a table row
+                for (const auto& test_entry : test_group.test_entries)
+                {
+                    int current_column_index = 0;
+
+                    ImGui::TableNextRow();
+
+                    // Transpose flags
+                    ImGui::TableSetColumnIndex(current_column_index++);
+                    ImGui::Text("%s", test_entry.layoutA_Mfirst ? "M-first" : "K-first");
+
+                    ImGui::TableSetColumnIndex(current_column_index++);
+                    ImGui::Text("%s", test_entry.layoutB_Nfirst ? "N-first" : "K-first");
+
+                    ImGui::TableSetColumnIndex(current_column_index++);
+                    ImGui::Text("%s", test_entry.layoutC_Mfirst ? "M-first" : "N-first");
+
+                    if (show_matrix_d)
+                    {
+                        ImGui::TableSetColumnIndex(current_column_index++);
+                        ImGui::Text("%s", test_entry.layoutR_Mfirst ? "M-first" : "N-first");
+                    }
+
+                    // One cell per available result (one per NSize config once the row has run)
+                    for (const auto& test_result : test_entry.test_results)
+                    {
+                        ImGui::TableSetColumnIndex(current_column_index++);
+
+                        if (test_result.is_valid)
+                        {
+                            ImGui::Text("[Time]: %.2fus", test_result.time_total);
+                            ImGui::Text("[TOPS]: %.2f", test_result.TOPS);
+                            const ImVec4 color = GetPercentageColor(test_result.percentage / 100.0f);
+                            ImGui::PushStyleColor(ImGuiCol_Text, color);
+                            ImGui::Text("[%%]: %.2f", test_result.percentage);
+                            ImGui::PopStyleColor();
+                        }
+                        else
+                        {
+                            ImGui::Text("N/A - Not Supported");
+                        }
+                    }
+                }
+
+                ImGui::EndTable();
+            }
+            ImGui::EndChild();
+
+            style.ScrollbarSize = original_scrollbar_size;
+        }
+
+        ImGui::PopID();
+    }
+
+    ImGui::EndGroup();
+    ImGui::EndDisabled();
+}
+
+// Starts a new test session from the current UI settings.
+//
+// Waits for the device to go idle, discards the previous session, and then
+// builds one TestGroup per registered template. Each group gets one row entry
+// per transpose combination, and each row gets one TestDescription per size
+// configuration of the template. m_total_tests counts every description
+// created; the session is flagged as running at the end.
+void CooperativeMatrixRunner::PrepareTestSession()
+{
+    m_vulkan_instance.WaitUntilIdle();
+
+    m_total_processed_tests = 0;
+    m_total_tests           = 0;
+    m_test_groups.clear();
+
+    // Enumerates every layout assignment allowed by m_matrix_transpose_options:
+    // matrices set to ALWAYS_TRUE / ALWAYS_FALSE keep that value in every
+    // combination, while each VARIABLE matrix takes both values (bit `k` of the
+    // combination number drives the k-th VARIABLE matrix).
+    auto GenerateTransposeCombinations = [&]() -> std::vector<std::vector<bool>>
+    {
+        std::vector<bool>        baseline(NUM_MATS);
+        std::vector<std::size_t> free_slots;
+
+        for (std::size_t slot = 0; slot < NUM_MATS; ++slot)
+        {
+            const auto option = m_matrix_transpose_options[slot];
+
+            if (option == MatrixTransposeOption::ALWAYS_TRUE)
+            {
+                baseline[slot] = true;
+            }
+            else if (option == MatrixTransposeOption::ALWAYS_FALSE)
+            {
+                baseline[slot] = false;
+            }
+            else if (option == MatrixTransposeOption::VARIABLE)
+            {
+                free_slots.push_back(slot);
+            }
+        }
+
+        const std::size_t total = 1ULL << free_slots.size();
+
+        std::vector<std::vector<bool>> result;
+        result.reserve(total);
+
+        for (std::size_t mask = 0; mask < total; ++mask)
+        {
+            // Start from the fixed values, then overwrite the variable slots.
+            std::vector<bool> candidate = baseline;
+
+            for (std::size_t bit = 0; bit < free_slots.size(); ++bit)
+            {
+                candidate[free_slots[bit]] = ((mask >> bit) & 1) != 0;
+            }
+
+            result.push_back(std::move(candidate));
+        }
+
+        return result;
+    };
+
+    const auto transpose_combinations = GenerateTransposeCombinations();
+
+    for (const auto& group_template : m_test_group_templates)
+    {
+        TestGroup group;
+        group.template_description = group_template;
+
+        // A single description is reused as a scratch object: the settings
+        // below are shared by the whole group, the layout and size fields are
+        // overwritten before each copy is stored.
+        TestDescription description;
+
+        description.input_type  = group_template.input_type;
+        description.output_type = group_template.output_type;
+
+        description.test_type      = m_test_type;
+        description.fill_data_type = m_fill_data_type;
+        description.gpu_freq_MHz   = m_gpu_freq_MHz;
+        description.perf_loop      = static_cast<uint32_t>(m_test_repeats);
+
+        description.inputWidth  = 1;
+        description.inputHeight = 1;
+
+        for (const auto& layout_flags : transpose_combinations)
+        {
+            TestGroup::TestRowEntry row_entry;
+
+            description.layoutA_Mfirst = layout_flags[0];
+            description.layoutB_Nfirst = layout_flags[1];
+            description.layoutC_Mfirst = layout_flags[2];
+            description.layoutR_Mfirst = layout_flags[3];
+
+            // The row remembers its layout so the result table can label it.
+            row_entry.layoutA_Mfirst = description.layoutA_Mfirst;
+            row_entry.layoutB_Nfirst = description.layoutB_Nfirst;
+            row_entry.layoutC_Mfirst = description.layoutC_Mfirst;
+            row_entry.layoutR_Mfirst = description.layoutR_Mfirst;
+
+            for (const auto& sizes : group_template.size_configurations)
+            {
+                description.MSize         = sizes.MSize;
+                description.NSize         = sizes.NSize;
+                description.KSize         = sizes.KSize;
+                description.MSizeInBlocks = sizes.MSizeInBlocks;
+                description.NSizeInBlocks = sizes.NSizeInBlocks;
+                description.KSizeInBlocks = sizes.KSizeInBlocks;
+
+                row_entry.test_descriptions.push_back(description);
+                ++m_total_tests;
+            }
+
+            group.test_entries.push_back(row_entry);
+        }
+
+        m_test_groups.push_back(group);
+    }
+
+    m_is_processing_tests = true;
+}
+
+std::optional<CooperativeMatrixRunner::TestResult> CooperativeMatrixRunner::RunTest(const TestDescription& test_description)
+{
+    TestResult test_result = {};
+    test_result.is_valid = true;
+
+    VkResult result;
+
+    uint32_t gpu_freq_MHz = test_description.gpu_freq_MHz;
+
+    int MSize = test_description.MSize;
+    int NSize = test_description.NSize;
+    int KSize = test_description.KSize;
+    int MSizeInBlocks = test_description.MSizeInBlocks;
+    int NSizeInBlocks = test_description.NSizeInBlocks;
+    int KSizeInBlocks = test_description.KSizeInBlocks;
+
+    uint32_t perf_loop = test_description.perf_loop;
+    
+    bool layoutA_Mfirst = test_description.layoutA_Mfirst;
+    bool layoutB_Kfirst = !test_description.layoutB_Nfirst;
+    bool layoutC_Mfirst = test_description.layoutC_Mfirst;
+    bool layoutR_Mfirst = test_description.layoutR_Mfirst;
+
+    int inputWidth  = test_description.inputWidth;
+    int inputHeight = test_description.inputHeight;
+
+    uint32_t tt = static_cast<uint32_t>(test_description.test_type);
+    int init    = test_description.fill_data_type;
+
+    auto command_pool_queue_family_index = m_vulkan_instance.m_VulkanQueues[Vulkan::QueueIndex::eGraphicsQueue].QueueFamilyIndex;
+    auto submission_queue                = m_vulkan_instance.m_VulkanQueues[command_pool_queue_family_index].Queue;
+
+    // Not optimal at all but we are drawing the UI and running the test in the same queue
+    m_vulkan_instance.QueueWaitIdle(Vulkan::QueueIndex::eGraphicsQueue);
+
+    const auto subgroup_size = m_vulkan_instance.GetExtension<ExtensionLib::Vulkan_SubgroupPropertiesHook>()->Properties.subgroupSize;
+    const auto gpuvendor_id = static_cast<gpu_vendors>(m_vulkan_instance.GetGpuProperties().Base.properties.vendorID);
+    const auto gputier_id   = static_cast<gpu_tiers>(m_vulkan_instance.GetGpuProperties().Base.properties.deviceID);
+    
+    const auto device_limits = m_vulkan_instance.GetGpuProperties().Base.properties.limits;
+
+    // Create descriptor set and descriptor set layout for our A,B,C,R matrices (buffers)
+    //
+    VkDescriptorSetLayout descriptorSetLayout;
+    VkDescriptorSet descriptorSet;
+
+    auto create_buffers_desc_set = [](VkDevice device, VkDescriptorSetLayout & descriptorSetLayout, VkDescriptorSet & descriptorSet, const uint32_t num_buffers)
+    {
+        VkResult result;
+
+        // Descriptor set are always bound at command buffer level
+        // There is only 1 descriptor per resource
+
+        // How to allocate descriptor sets:
+        // 
+        // 1. Create a pool of sufficient size (use multiple VkDescriptorPoolSize)
+        //    Use vkCreateDescriptorPool() to actually create the pool on the GPU
+        // 2. Create a VkDescriptorSetLayout for each descriptor set
+        //    Specify the resource bindings within the descriptor set using
+        //    VkDescriptorSetLayoutBinding elements per resource
+        // 3. Allocate a new set from the pool using vkAllocateDescriptorSets
+        //    The reference to the VkDescriptorPool is specified in the associated
+        //    VkDescriptorSetAllocateInfo config struct.
+        //    Bind all relevant VkDescriptorSet handles (from step 3.) for
+        //    draw/compute/ray tracing via vkCmdBindDescriptorSets
+
+        // 1) Create a descriptor pool (1 set)
+        VkDescriptorPoolSize* poolSizes = new VkDescriptorPoolSize[num_buffers];
+        for (uint32_t i = 0; i < num_buffers; i++)
+            poolSizes[i] = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 };
+
+        VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {};
+        descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+        descriptorPoolCreateInfo.pNext = NULL;
+        descriptorPoolCreateInfo.maxSets = 1; // Use only 1 set for all descriptors
+        descriptorPoolCreateInfo.poolSizeCount = num_buffers;
+        descriptorPoolCreateInfo.pPoolSizes = poolSizes;
+
+        VkDescriptorPool descriptorPool;
+        result = vkCreateDescriptorPool(device, &descriptorPoolCreateInfo, NULL, &descriptorPool);
+        CHECK_VK(result);
+
+        // 2) Create a VkDescriptorSetLayout for each descriptor set
+        // This compute shader uses 3 UBO and 1 SBO 
+        VkDescriptorSetLayoutBinding* layoutBindings = new VkDescriptorSetLayoutBinding[num_buffers];
+        for (uint32_t i = 0; i < num_buffers; i++)
+        {
+            layoutBindings[i].binding = i;
+            layoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            layoutBindings[i].descriptorCount = 1;
+            layoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+            layoutBindings[i].pImmutableSamplers = nullptr;
+        }
+
+        //  Next take layout bindings and use them to create a descriptor set layout
+        VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = {};
+        descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+        descriptorSetLayoutCreateInfo.pNext = nullptr;
+        descriptorSetLayoutCreateInfo.flags = 0;
+        descriptorSetLayoutCreateInfo.bindingCount = num_buffers;
+        descriptorSetLayoutCreateInfo.pBindings = layoutBindings;
+
+        result = vkCreateDescriptorSetLayout(device, &descriptorSetLayoutCreateInfo, NULL, &descriptorSetLayout);
+        CHECK_VK(result);
+
+        // 3. Allocate a new set from the pool using vkAllocateDescriptorSets
+        VkDescriptorSetAllocateInfo setAllocateInfo = {};
+        setAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+        setAllocateInfo.pNext = nullptr;
+        setAllocateInfo.descriptorPool = descriptorPool;
+        setAllocateInfo.descriptorSetCount = 1; // Use only 1 set for all descriptors
+        setAllocateInfo.pSetLayouts = &descriptorSetLayout;
+
+        result = vkAllocateDescriptorSets(device, &setAllocateInfo, &descriptorSet);
+        CHECK_VK(result);
+
+        delete[] poolSizes;
+        delete[] layoutBindings;
+    };
+    
+    create_buffers_desc_set(m_vulkan_instance.m_VulkanDevice, descriptorSetLayout, descriptorSet, NUM_MATS);
+
+    // Create command pool
+    //
+    VkCommandPoolCreateInfo commandPoolCreateInfo = { VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, (uint32_t)command_pool_queue_family_index };
+    VkCommandPool commandPool;
+    result = vkCreateCommandPool(m_vulkan_instance.m_VulkanDevice, &commandPoolCreateInfo, NULL, &commandPool);
+    CHECK_VK(result);
+
+    // Create command buffer
+    //
+    // The command buffers, one for initializing buffers, one for compute, one
+    // for reading back the results. This lets us time the compute work more
+    // precisely.
+    VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 3 };
+    VkCommandBuffer commandBuffers[3];
+    result = vkAllocateCommandBuffers(m_vulkan_instance.m_VulkanDevice, &commandBufferAllocateInfo, commandBuffers);
+    CHECK_VK(result);
+   
+    // Create pipeline layout
+    // Use only 1 set for all descriptors
+    VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, NULL, 0, 1, &descriptorSetLayout, 0, nullptr };
+    VkPipelineLayout pipelineLayout;
+    result = vkCreatePipelineLayout(m_vulkan_instance.m_VulkanDevice, &pipelineLayoutCreateInfo, NULL, &pipelineLayout);
+    CHECK_VK(result);
+
+    // Query matrix properties and see if the test is supported for the given GPU
+    bool valid_testtypes = false;
+    VkCooperativeMatrixPropertiesKHR cooperativeMatrixProps = {};
+    if (!FindMatrixProperty(m_hFoundCooperativeMatrices, cooperativeMatrixProps, MSize, NSize, KSize, test_description.input_type, test_description.input_type, test_description.output_type, test_description.output_type))
+    {
+        return std::nullopt;
+    }
+
+    // Set local_size (workgroup size) based on the GPU vendor (nvidia, amd, intel, apple, qualcomm)
+    // Default for an unknown or unrecognized vendor is local_size(64,2,2) for all datatypes
+    uint32_t local_size_x = 0, local_size_y = 0, local_size_z = 0;
+
+    switch (gpuvendor_id)
+    {
+        case VK_VENDOR_ID_NVIDIA:
+            local_size_x = subgroup_size;
+            local_size_y = 1;
+            local_size_z = 1;
+            break;
+        case VK_VENDOR_ID_AMD:
+            local_size_x = subgroup_size;
+            local_size_y = 1;
+            local_size_z = 1;
+            break;
+        case VK_VENDOR_ID_INTEL:
+            local_size_x = subgroup_size;
+            local_size_y = 1;
+            local_size_z = 1;
+            break;
+        case VK_VENDOR_ID_APPLE:
+            local_size_x = subgroup_size;
+            local_size_y = 1;
+            local_size_z = 1;
+            break;
+        case VK_VENDOR_ID_QUALCOMM:
+            local_size_x = subgroup_size;
+            local_size_y = 2;
+            local_size_z = 2;
+            break;
+        default: // unknown, including gpu option not part of the map
+            printf("\nUnknown GPU or GPU no set with -gpu:[nvidia|qualcomm|pakala|kaanapali|glymmur|etc.]");
+            local_size_x = 64;
+            local_size_y = 2;
+            local_size_z = 2;
+            break;
+    }
+
+    RuntimeShader runtime_shader;
+
+    // Set compiler options
+    //
+    std::vector<const char*> compiler_options;
+    int bytesPerInput; // = int8 ? 1 : fp16 ? 2 : 4;
+    int bytesPerOutput;// =            fp16 ? 2 : 4;
+
+    if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT32_KHR)
+    {
+        runtime_shader.AddDefine("A_TYPE", std::string("float"));
+        runtime_shader.AddDefine("R_TYPE", std::string("float"));
+        bytesPerInput = 4;
+        bytesPerOutput = 4;
+    }
+    else
+    if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT16_KHR)
+    {
+        runtime_shader.AddDefine("A_TYPE", std::string("float16_t"));
+        runtime_shader.AddDefine("R_TYPE", std::string("float16_t"));
+        bytesPerInput = 2;
+        bytesPerOutput = 2;
+    }
+    else
+    if (test_description.input_type == VK_COMPONENT_TYPE_UINT8_KHR)
+    {
+        runtime_shader.AddDefine("A_TYPE", std::string("uint8_t"));
+        runtime_shader.AddDefine("R_TYPE", std::string("uint32_t"));
+        bytesPerInput = 1;
+        bytesPerOutput = 4;
+    }
+    else
+    if (test_description.input_type == VK_COMPONENT_TYPE_SINT8_KHR)
+    {
+        runtime_shader.AddDefine("A_TYPE", std::string("int8_t"));
+        runtime_shader.AddDefine("R_TYPE", std::string("int32_t"));
+        bytesPerInput = 1;
+        bytesPerOutput = 4;
+    }
+    else
+    {
+        return std::nullopt;
+    }
+
+    if (!runtime_shader.Build(ShaderPaths[tt], m_vulkan_instance.m_VulkanDevice, "main", glslang_stage_t::GLSLANG_STAGE_COMPUTE))
+    {
+        LOGE("Failed to compile test shader");
+        return std::nullopt;
+    }
+    
+    VkShaderModule shaderModule = runtime_shader.GetShaderModule();
+
+    if (tt == TT_CONV && (inputWidth * inputHeight != MSizeInBlocks * cooperativeMatrixProps.MSize))
+    {
+        LOGE("Convolution ConvInputWidth * ConvInputHeight (%d) must equal MSizeInBlocks * MSize (%d) for current datatype",
+            (inputWidth * inputHeight), (MSizeInBlocks * cooperativeMatrixProps.MSize));
+        return std::nullopt;
+    }
+
+    int filterWidth  = 3;
+    int filterHeight = 3;
+    int dilation = 1;
+    int stride   = 1;
+
+    TestCase testCase = {};
+
+    testCase.testType   = (TestType)tt;
+    testCase.inputType  = cooperativeMatrixProps.AType;
+    testCase.outputType = cooperativeMatrixProps.ResultType;
+
+    // MxNxK is the size of the full matrix multiply
+    testCase.TOTAL_M = cooperativeMatrixProps.MSize * MSizeInBlocks;
+    testCase.TOTAL_N = cooperativeMatrixProps.NSize * NSizeInBlocks;
+    testCase.TOTAL_K = cooperativeMatrixProps.KSize * KSizeInBlocks;
+
+    int mA_paddedM = testCase.TOTAL_M;
+    int mA_paddedK = testCase.TOTAL_K;
+    int mB_paddedN = testCase.TOTAL_N;
+    int mB_paddedK = testCase.TOTAL_K;
+    int mC_paddedM = testCase.TOTAL_M;
+    int mC_paddedN = testCase.TOTAL_N;
+    int mR_paddedM = testCase.TOTAL_M;
+    int mR_paddedN = testCase.TOTAL_N;
+
+    std::cout << "\nPadding image width to fix CCHE bank mapping issue." << std::endl;
+    // 512bits is one line in the CCHE (512bits/8bits = 64bytes)
+    if (layoutA_Mfirst) mA_paddedM += (mA_paddedM % (128 / bytesPerInput))  ? 0 : 64 / bytesPerInput;  else  mA_paddedK += (mA_paddedK % (128 / bytesPerInput)) ? 0 : 64 / bytesPerInput;
+    if (layoutB_Kfirst) mB_paddedK += (mB_paddedK % (128 / bytesPerInput))  ? 0 : 64 / bytesPerInput;  else  mB_paddedN += (mB_paddedN % (128 / bytesPerInput)) ? 0 : 64 / bytesPerInput;
+    if (layoutC_Mfirst) mC_paddedM += (mC_paddedM % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput; else  mC_paddedN += (mC_paddedN % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput;
+    if (layoutR_Mfirst) mR_paddedM += (mR_paddedM % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput; else  mR_paddedN += (mR_paddedN % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput;
+
+    // Each cooperative matrix multiply is R[TILE_M, TILE_N] = A[TILE_M, TILE_K] x B[TILE_K, TILE_N] + C[TILE_M, TILE_N]
+    testCase.TILE_M = cooperativeMatrixProps.MSize;
+    testCase.TILE_N = cooperativeMatrixProps.NSize;
+    testCase.TILE_K = cooperativeMatrixProps.KSize;
+
+    testCase.layoutA_Mfirst = (uint32_t)layoutA_Mfirst;
+    testCase.layoutB_Kfirst = (uint32_t)layoutB_Kfirst;
+    testCase.layoutC_Mfirst = (uint32_t)layoutC_Mfirst;
+    testCase.layoutR_Mfirst = (uint32_t)layoutR_Mfirst;
+
+    testCase.strideAinElements = (layoutA_Mfirst ? mA_paddedM : mA_paddedK);
+    testCase.strideBinElements = (layoutB_Kfirst ? mB_paddedK : mB_paddedN);
+    testCase.strideCinElements = (layoutC_Mfirst ? mC_paddedM : mC_paddedN);
+    testCase.strideRinElements = (layoutR_Mfirst ? mR_paddedM : mR_paddedN);
+
+    // Returns the index of the first memory type that is allowed by memoryTypeBitsRequirement
+    // and has every flag in requiredProperties set, or -1 when no memory type qualifies.
+    auto FindProperties = [](const VkPhysicalDeviceMemoryProperties* pMemoryProperties,
+        uint32_t memoryTypeBitsRequirement, VkMemoryPropertyFlags requiredProperties) -> int32_t
+    {
+        const uint32_t memoryCount = pMemoryProperties->memoryTypeCount;
+        for (uint32_t memoryIndex = 0; memoryIndex < memoryCount; ++memoryIndex) {
+            // Unsigned literal: a signed '1 << 31' would shift into the sign bit.
+            const uint32_t memoryTypeBits = (1u << memoryIndex);
+            const bool isRequiredMemoryType = (memoryTypeBitsRequirement & memoryTypeBits) != 0;
+
+            const VkMemoryPropertyFlags properties = pMemoryProperties->memoryTypes[memoryIndex].propertyFlags;
+            const bool hasRequiredProperties = (properties & requiredProperties) == requiredProperties;
+
+            if (isRequiredMemoryType && hasRequiredProperties)
+                return static_cast<int32_t>(memoryIndex);
+        }
+
+        return -1; // failed to find memory type
+    };
+
+    // Fills m for a rows x cols matrix of component type dt, creates a host buffer and a device buffer of
+    // m.bufferSize bytes, allocates and binds memory for both, and maps the host allocation to m.ptr.
+    auto CreateMatrixDesc = [&](VkDevice device, VkPhysicalDeviceMemoryProperties& memory_properties,
+        MatrixDesc& m, VkComponentTypeKHR dt, int rows, int cols)
+    {
+        VkResult result;
+        m.dims.rows = rows;
+        m.dims.cols = cols;
+        m.dataType = dt;
+        m.elementSize = ComponentTypeInfo[m.dataType].bits / 8; // float->4 bytes, float16->2 bytes, int8->1 byte
+        m.totalElements = m.dims.cols * m.dims.rows;
+        m.bufferSize = m.totalElements * m.elementSize;
+
+        VkBufferCreateInfo bufferCreateInfo = {
+            VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+            NULL,
+            0,
+            m.bufferSize,
+            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT,
+            VK_SHARING_MODE_EXCLUSIVE,
+            0u,
+            NULL,
+        };
+
+        result = vkCreateBuffer(device, &bufferCreateInfo, NULL, &m.hostBuffer);
+        CHECK_VK(result);
+        result = vkCreateBuffer(device, &bufferCreateInfo, NULL, &m.deviceBuffer);
+        CHECK_VK(result);
+
+        // Both buffers come from the same bufferCreateInfo; the host buffer's requirements are used for both allocations.
+        VkMemoryRequirements memReqs;
+        vkGetBufferMemoryRequirements(device, m.hostBuffer, &memReqs);
+
+        // Prefer cached host memory, but fall back to HOST_VISIBLE | HOST_COHERENT when no cached type matches.
+        int32_t hostIndex = FindProperties(&memory_properties, memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
+        if (hostIndex < 0) hostIndex = FindProperties(&memory_properties, memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+        int32_t deviceIndex = FindProperties(&memory_properties, memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+        // -1 cast to uint32_t is not a valid memoryTypeIndex: report a missing memory type through CHECK_VK first.
+        result = (hostIndex < 0 || deviceIndex < 0) ? VK_ERROR_FEATURE_NOT_PRESENT : VK_SUCCESS;
+        CHECK_VK(result);
+
+        VkMemoryAllocateFlagsInfo memAllocateFlagsInfo = {VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, NULL,VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT, 0};
+        VkMemoryAllocateInfo memAllocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &memAllocateFlagsInfo, memReqs.size, (uint32_t)hostIndex};
+        result = vkAllocateMemory(device, &memAllocateInfo, NULL, &m.hostMemory);
+        CHECK_VK(result);
+
+        memAllocateInfo.memoryTypeIndex = (uint32_t)deviceIndex;
+        result = vkAllocateMemory(device, &memAllocateInfo, NULL, &m.deviceMemory);
+        CHECK_VK(result);
+
+        result = vkBindBufferMemory(device, m.hostBuffer, m.hostMemory, 0);
+        CHECK_VK(result);
+        result = vkBindBufferMemory(device, m.deviceBuffer, m.deviceMemory, 0);
+        CHECK_VK(result);
+
+        result = vkMapMemory(device, m.hostMemory, 0, m.bufferSize, 0, &m.ptr);
+        CHECK_VK(result);
+    };
+
+    VkPhysicalDeviceMemoryProperties memory_properties;
+    vkGetPhysicalDeviceMemoryProperties(m_vulkan_instance.m_VulkanGpu, &memory_properties);
+
+    MatrixDesc matrices[NUM_MATS];
+    // Create A, B, C and R; for TT_CONV, B has filterHeight*filterWidth*N rows (the same filter factor used when B is initialized).
+    CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_A], cooperativeMatrixProps.AType, mA_paddedM, mA_paddedK);
+    if (tt == TT_CONV) CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_B], cooperativeMatrixProps.AType, filterHeight*filterWidth*mB_paddedN, mB_paddedK);
+    else               CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_B], cooperativeMatrixProps.AType, mB_paddedK, mB_paddedN);
+    CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_C], cooperativeMatrixProps.CType, mC_paddedM, mC_paddedN);
+    CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_R], cooperativeMatrixProps.ResultType, mR_paddedM, mR_paddedN);
+
+    // Points bindings 0..num_matrices-1 of descriptorSet at the device buffers of the given matrices.
+    // Binding i covers matrices[i].deviceBuffer from offset 0 for matrices[i].bufferSize bytes and is
+    // written as a storage buffer descriptor.
+    auto update_buffer_descriptor_set = [](VkDevice device, MatrixDesc * matrices, uint32_t num_matrices, VkDescriptorSet & descriptorSet)
+    {
+        // std::vector owns the temporary arrays, so nothing has to be freed by hand on any exit path.
+        std::vector<VkDescriptorBufferInfo> bufferDescriptor(num_matrices);
+        std::vector<VkWriteDescriptorSet> writeDescriptorset(num_matrices);
+
+        for (uint32_t i = 0; i < num_matrices; i++)
+        {
+            // Buffer range for binding i: the whole device buffer, starting at offset 0.
+            bufferDescriptor[i].buffer = matrices[i].deviceBuffer;
+            bufferDescriptor[i].offset = 0;
+            bufferDescriptor[i].range = matrices[i].bufferSize;
+
+            // Write for binding i; it references the buffer info filled in just above.
+            writeDescriptorset[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+            writeDescriptorset[i].pNext = nullptr;
+            writeDescriptorset[i].dstSet = descriptorSet;
+            writeDescriptorset[i].dstBinding = i;
+            writeDescriptorset[i].dstArrayElement = 0;
+            writeDescriptorset[i].descriptorCount = 1;
+            writeDescriptorset[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            writeDescriptorset[i].pImageInfo = nullptr;
+            writeDescriptorset[i].pBufferInfo = &bufferDescriptor[i];
+            writeDescriptorset[i].pTexelBufferView = nullptr;
+        }
+
+        // Both vectors are still alive here, so the pBufferInfo pointers stay valid during the call.
+        vkUpdateDescriptorSets(device, num_matrices, writeDescriptorset.data(), 0, NULL);
+    };
+
+    update_buffer_descriptor_set(m_vulkan_instance.m_VulkanDevice, matrices, NUM_MATS, descriptorSet);
+
+    float*    matrixR_CPU_fp32   = new float[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols]();
+    FLOAT16*     matrixR_CPU_fp16   = new FLOAT16[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols]();
+    int32_t*  matrixR_CPU_sint32 = new int32_t[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols]();
+    uint32_t* matrixR_CPU_uint32 = new uint32_t[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols]();
+    std::ostringstream fna, fnb, fnr_cpu, fnr_vk;
+    fna << "matrixA_" << "M" << testCase.TOTAL_M << "xK" << testCase.TOTAL_K << ".txt";
+    fnb << "matrixB_" << "K" << testCase.TOTAL_K << "xN" << testCase.TOTAL_N << ".txt";
+
+    // ToDo: Think in how to use templates!
+    if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_FLOAT32_KHR)) // CONV test case, input/output data Type Float 32?
+    {
+        InitMatrix((float*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((float*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((float*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((float*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+    }
+    else if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_FLOAT16_KHR)) // CONV test case, input/output data Type Float 16?
+    {
+        InitMatrix((FLOAT16*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((FLOAT16*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((FLOAT16*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((FLOAT16*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+    }
+    else if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_SINT8_KHR)) // CONV test case, Input data Type signed int8, output data type signed int 32?
+    {
+        InitMatrix((int8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((int8_t*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((int32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((int32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+    }
+    else if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_UINT8_KHR)) // CONV test case, Input data Type signed int8, output data type signed int 32?
+    {
+        InitMatrix((uint8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((uint8_t*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((uint32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((uint32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+    }
+    else if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT32_KHR) // Input/output data Type Float 32?
+    {
+        InitMatrix((float*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((float*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((float*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);  
+        InitMatrix((float*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+
+        if (m_transpose_when_needed || m_validate_matrix_result)
+        {
+            if (layoutA_Mfirst) // Matrix A M-First?
+                TransposeMatrix((float*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst");
+            if (layoutB_Kfirst) // Matrix B K-First?
+                TransposeMatrix((float*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst");
+            if (layoutC_Mfirst) // Matrix C M-First?
+                TransposeMatrix((float*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst");
+        }
+    }
+    else
+    if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT16_KHR) // Input/output data Type Float 16?
+    {
+
+        InitMatrix((FLOAT16*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((FLOAT16*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((FLOAT16*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((FLOAT16*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+
+        if (m_transpose_when_needed || m_validate_matrix_result)
+        {
+            if (layoutA_Mfirst) // Matrix A M-First?
+                TransposeMatrix((FLOAT16*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst");
+            if (layoutB_Kfirst) // Matrix B K-First?
+                TransposeMatrix((FLOAT16*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst");
+            if (layoutC_Mfirst) // Matrix C M-First?
+                TransposeMatrix((FLOAT16*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst");
+        }
+    }
+    else
+    if (test_description.input_type == VK_COMPONENT_TYPE_SINT8_KHR) // Input data Type signed int8, output data type signed int 32?
+    {
+        InitMatrix((int8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((int8_t*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((int32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((int32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+
+        if (m_transpose_when_needed || m_validate_matrix_result)
+        {
+            if (layoutA_Mfirst) // Matrix A M-First?
+                TransposeMatrix((int8_t*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst");
+            if (layoutB_Kfirst) // Matrix B K-First?
+                TransposeMatrix((int8_t*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst");
+            if (layoutC_Mfirst) // Matrix C M-First?
+                TransposeMatrix((int32_t*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst");
+        }
+    }
+    else
+    if (test_description.input_type == VK_COMPONENT_TYPE_UINT8_KHR) // Data Type input unsigned int 8, data type output unsigned int 32?
+    {
+        InitMatrix((uint8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2);
+        InitMatrix((uint8_t*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2);
+        InitMatrix((uint32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2);
+        InitMatrix((uint32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2);
+
+        if (m_transpose_when_needed || m_validate_matrix_result)
+        {
+            if (layoutA_Mfirst) // Matrix A M-First?
+                TransposeMatrix((uint8_t*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst");
+            if (layoutB_Kfirst) // Matrix B K-First?
+                TransposeMatrix((uint8_t*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst");
+            if (layoutC_Mfirst) // Matrix C M-First?
+                TransposeMatrix((uint32_t*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst");
+        }
+    }
+    else
+    {
+        return std::nullopt;
+    }
+
+    // Specialize the shader with the matrix sizes, strides, and constants.
+    // Also, work-group sizes
+    const uint32_t specDataMxM[] = {   // pass to shader_name.comp
+        local_size_x,               // layout(constant_id = 0) const uint local_size_x;
+        local_size_y,               // layout(constant_id = 1) const uint local_size_y;
+        local_size_z,               // layout(constant_id = 2) const uint local_size_z;
+        testCase.TOTAL_M,           // layout(constant_id = 3) const uint TOTAL_M = 1;
+        testCase.TOTAL_N,           // layout(constant_id = 4) const uint TOTAL_N = 1;
+        testCase.TOTAL_K,           // layout(constant_id = 5) const uint TOTAL_K = 1;
+        testCase.TILE_M,            // layout(constant_id = 6) const uint TILE_M = 1;
+        testCase.TILE_N,            // layout(constant_id = 7) const uint TILE_N = 1;
+        testCase.TILE_K,            // layout(constant_id = 8) const uint TILE_K = 1;
+        testCase.layoutA_Mfirst,    // layout(constant_id = 9) const bool layoutA_Mfirst = false;
+        testCase.layoutB_Kfirst,    // layout(constant_id =10) const bool layoutB_Kfirst = false;
+        testCase.layoutC_Mfirst,    // layout(constant_id =11) const bool layoutC_Mfirst = false;
+        testCase.layoutR_Mfirst,    // layout(constant_id =12) const bool layoutR_Mfirst = false;
+        testCase.strideAinElements, // layout(constant_id =13) const uint strideAinElements = 1;
+        testCase.strideBinElements, // layout(constant_id =14) const uint strideBinElements = 1;
+        testCase.strideCinElements, // layout(constant_id =15) const uint strideCinElements = 1;
+        testCase.strideRinElements  // layout(constant_id =16) const uint strideRinElements = 1;
+    };
+
+    const uint32_t specDataCONV[] = {   // pass to shader_name.comp
+        local_size_x,               // layout(constant_id = 0) const uint local_size_x;
+        local_size_y,               // layout(constant_id = 1) const uint local_size_y;
+        local_size_z,               // layout(constant_id = 2) const uint local_size_z;
+        testCase.TOTAL_M,           // layout(constant_id = 3) const uint TOTAL_M = 1;
+        testCase.TOTAL_N,           // layout(constant_id = 4) const uint TOTAL_N = 1;
+        testCase.TOTAL_K,           // layout(constant_id = 5) const uint TOTAL_K = 1;
+        testCase.TILE_M,            // layout(constant_id = 6) const uint TILE_M = 1;
+        testCase.TILE_N,            // layout(constant_id = 7) const uint TILE_N = 1;
+        testCase.TILE_K,            // layout(constant_id = 8) const uint TILE_K = 1;
+        (uint32_t)inputWidth,       // layout(constant_id = 9) const uint INPUT_W = 1;
+        (uint32_t)inputHeight,      // layout(constant_id =10) const uint INPUT_H = 1;
+        (uint32_t)filterWidth,      // layout(constant_id =11) const uint FILTER_W = 1;
+        (uint32_t)filterHeight,     // layout(constant_id =12) const uint FILTER_H = 1;
+        (uint32_t)dilation,         // layout(constant_id =13) const uint DILATION = 1;
+        (uint32_t)stride,           // layout(constant_id =14) const uint STRIDE  = 1;
+        testCase.strideAinElements, // layout(constant_id =15) const uint strideAinElements = 1;
+        testCase.strideBinElements, // layout(constant_id =16) const uint strideBinElements = 1;
+        testCase.strideCinElements, // layout(constant_id =17) const uint strideCinElements = 1;
+        testCase.strideRinElements  // layout(constant_id =18) const uint strideRinElements = 1;
+    };
+
+    auto fill_specialized_map_entries = [](VkSpecializationMapEntry entries[], uint32_t num_entries, uint32_t sizeof_entry)
+    {   // Entry k maps constant_id k to byte offset k * sizeof_entry with size sizeof_entry (tightly packed data).
+        for (uint32_t id = 0, byte_offset = 0; id < num_entries; ++id, byte_offset += sizeof_entry)
+            entries[id] = VkSpecializationMapEntry{ id, byte_offset, sizeof_entry };
+    };
+
+#define ARRAY_LENGTH(x) (sizeof(x) / sizeof(x[0]))
+
+    VkSpecializationMapEntry entriesMxM[ARRAY_LENGTH(specDataMxM)];
+    fill_specialized_map_entries(entriesMxM, ARRAY_LENGTH(specDataMxM), sizeof(uint32_t)); // {0,  sizeof(uint32_t) * 0, sizeof(uint32_t)},...,//{end,  sizeof(uint32_t) * end, sizeof(uint32_t)}
+
+    VkSpecializationMapEntry entriesCONV[ARRAY_LENGTH(specDataCONV)];
+    fill_specialized_map_entries(entriesCONV, ARRAY_LENGTH(specDataCONV), sizeof(uint32_t)); // {0, sizeof(uint32_t) * 0, sizeof(uint32_t)}, ...,//{end,  sizeof(uint32_t) * end, sizeof(uint32_t)}
+
+    VkSpecializationInfo specInfo = {}; // selects the constant table for this test type; value-initialized so it is never indeterminate
+    switch (tt) {
+    case TT_CONV:
+        specInfo = { ARRAY_LENGTH(specDataCONV), entriesCONV, sizeof(specDataCONV), specDataCONV, };
+        break;
+    case TT_MXM_BASIC: // both MxM variants share the same constant table
+    case TT_MXM_VecToMat:
+        specInfo = { ARRAY_LENGTH(specDataMxM), entriesMxM, sizeof(specDataMxM), specDataMxM, };
+        break;
+    default: // no constant table exists for this test type
+        LOGE("Unknown use case(%d), can't sent specialized constantas to shader!", tt);
+        return std::nullopt; // fail like the other unsupported-configuration paths instead of creating a pipeline without constants
+    }
+
+#undef ARRAY_LENGTH
+
+    // Create pipeline with a desired subgroup size (e.g., AMD supports two subgroup sizes)
+    VkPipelineShaderStageRequiredSubgroupSizeCreateInfo subgroupSizeInfo = {};
+    subgroupSizeInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO;
+    subgroupSizeInfo.requiredSubgroupSize = subgroup_size; // Must be between min and max
+
+    VkPipelineShaderStageCreateInfo shaderCreateInfo   = {VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, &subgroupSizeInfo, 0, VK_SHADER_STAGE_COMPUTE_BIT, shaderModule, "main", &specInfo};
+    VkComputePipelineCreateInfo     pipelineCreateInfo = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, NULL, 0, shaderCreateInfo, pipelineLayout, VK_NULL_HANDLE, 0 };
+
+    // Create the query pool
+    VkQueryPool query_pool_timestamps = VK_NULL_HANDLE;       // A query pool is required to use GPU time stamps
+    std::vector<uint64_t> time_stamps((size_t)perf_loop*2, 0);// We will get timestamps for the beginning and end of each of the compute passes
+                                                              // GPU time stamps will be stored in a vector
+    // VK_QUERY_TYPE_TIMESTAMP: We need to specify the query type for this pool, which in our case is for time stamps
+    // time_stamps: Set the no. of queries in this pool
+    VkQueryPoolCreateInfo query_pool_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, nullptr, 0, VK_QUERY_TYPE_TIMESTAMP, static_cast<uint32_t>(time_stamps.size()), 0 };
+    result = vkCreateQueryPool(m_vulkan_instance.m_VulkanDevice, &query_pool_info, nullptr, &query_pool_timestamps);
+    CHECK_VK(result);
+
+    std::cout << "\nExecuting vkCreateComputePipelines(...) (takes a while!)\n";
+    VkPipeline pipeline;
+    result = vkCreateComputePipelines(m_vulkan_instance.m_VulkanDevice, VK_NULL_HANDLE, 1, &pipelineCreateInfo, NULL, &pipeline);
+    CHECK_VK(result);
+
+    VkCommandBufferBeginInfo commandBufferBeginInfo{};
+    commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+
+    // Download input buffers to device memory.
+    result = vkBeginCommandBuffer(commandBuffers[0], &commandBufferBeginInfo); // Begin command buffer recording
+    CHECK_VK(result);
+
+    for (uint32_t i = 0; i < NUM_MATS; ++i) {
+        MatrixDesc &m = matrices[i];
+        VkBufferCopy copy = { 0, 0, m.bufferSize };
+        vkCmdCopyBuffer(commandBuffers[0], m.hostBuffer, m.deviceBuffer, 1, &copy);
+    }
+
+    result = vkEndCommandBuffer(commandBuffers[0]); // End command buffer recording
+    CHECK_VK(result);
+
+    VkSubmitInfo submitInfo = {VK_STRUCTURE_TYPE_SUBMIT_INFO, NULL, 0, NULL, NULL,1, &commandBuffers[0], 0,  NULL};
+
+    submitInfo.pCommandBuffers = &commandBuffers[0];
+    result = vkQueueSubmit(submission_queue, 1, &submitInfo, VK_NULL_HANDLE);
+    CHECK_VK(result);
+    result = vkQueueWaitIdle(submission_queue);
+    CHECK_VK(result);
+
+    uint32_t groupCountX = 1;
+    uint32_t groupCountY = (testCase.TOTAL_M / testCase.TILE_M + (local_size_y - 1)) / local_size_y;
+    uint32_t groupCountZ = (testCase.TOTAL_N / testCase.TILE_N + (local_size_z - 1)) / local_size_z;
+
+    result = vkBeginCommandBuffer(commandBuffers[1], &commandBufferBeginInfo); // Begin command buffer recording
+    CHECK_VK(result);
+
+    vkCmdBindPipeline(commandBuffers[1], VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+    vkCmdBindDescriptorSets(commandBuffers[1], VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0u, 1, &descriptorSet, 0u, NULL);
+
+	// Reset the timestamp query pool, so we can start fetching new values into it
+    vkCmdResetQueryPool(commandBuffers[1], query_pool_timestamps, 0, static_cast<uint32_t>(time_stamps.size()));
+    
+    perf_loop = time_stamps.size()/2; // Both should have the same value, but just in case...
+
+    for (size_t loop = 0; loop < perf_loop; loop++)
+    {
+        vkCmdPipelineBarrier(commandBuffers[1], VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+        vkCmdWriteTimestamp( commandBuffers[1], VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,   query_pool_timestamps, loop*2  ); // Start timer...
+        vkCmdDispatch(       commandBuffers[1], groupCountX, groupCountY, groupCountZ);                                      // Dispacth work
+        vkCmdWriteTimestamp( commandBuffers[1], VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, query_pool_timestamps,loop*2+1); // Stop timer...
+    }
+
+    vkCmdPipelineBarrier(commandBuffers[1], VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+
+    result = vkEndCommandBuffer(commandBuffers[1]); // End command buffer recording
+    CHECK_VK(result);
+
+    submitInfo.pCommandBuffers = &commandBuffers[1];
+    result = vkQueueSubmit(submission_queue, 1, &submitInfo, VK_NULL_HANDLE); // Here is the actual work!
+    CHECK_VK(result);
+    result = vkQueueWaitIdle(submission_queue);
+    CHECK_VK(result);
+
+    vkGetQueryPoolResults(m_vulkan_instance.m_VulkanDevice, query_pool_timestamps, 0,	time_stamps.size(), time_stamps.size() * sizeof(uint64_t), time_stamps.data(), sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
+    
+    double ms = 0.0, min_ms = DBL_MAX, delta_in_ms = 0.0;
+    for (size_t loop = 0; loop < perf_loop; loop++)
+    {
+        delta_in_ms = double(time_stamps[loop*2+1] - time_stamps[loop*2]) * double(device_limits.timestampPeriod) / 1000000.0;
+        min_ms = (delta_in_ms < min_ms ? delta_in_ms : min_ms);
+        ms += delta_in_ms;
+    }
+
+    if(gpuvendor_id == VK_VENDOR_ID_QUALCOMM )
+    {
+        uint32_t num_uSP;
+        switch (gputier_id)
+        {
+        case QCOM_TIER_GLYMUR:
+        case QCOM_TIER_GLYMUR_TEST:
+            num_uSP = 16;
+            printf("\nQCOM Glymur GPU with num of uSP: %d, ", num_uSP);
+            break;
+        default:
+            num_uSP = 12;
+            printf("\nQCOM GPU with Num of uSP: %d, ", num_uSP);
+        }
+
+        uint64_t total_ops = 0;
+        if (tt == TT_CONV)
+        {
+            total_ops = static_cast<uint64_t>(testCase.TOTAL_M) *
+                static_cast<uint64_t>(testCase.TOTAL_N) *
+                static_cast<uint64_t>(testCase.TOTAL_K) *
+                static_cast<uint64_t>(filterHeight) *
+                static_cast<uint64_t>(filterWidth) * 2;
+        }
+        else
+        {
+            total_ops = static_cast<uint64_t>(testCase.TOTAL_M) *
+                static_cast<uint64_t>(testCase.TOTAL_N) *
+                static_cast<uint64_t>(testCase.TOTAL_K) * 2;
+        }
+
+        uint32_t theoreticalTime_ns = 1000 * ((unsigned long int)testCase.TOTAL_M * testCase.TOTAL_N * testCase.TOTAL_K / 64 / 2 / num_uSP / (4 / bytesPerInput)) / gpu_freq_MHz;
+        if (tt == TT_CONV)
+                 theoreticalTime_ns = 1000 * ((unsigned long int)testCase.TOTAL_M * testCase.TOTAL_N * testCase.TOTAL_K * filterHeight * filterWidth / 64 / 2 / num_uSP / (4 / bytesPerInput)) / gpu_freq_MHz;
+        
+        std::cout << "Maximum theoretical perf on device @" << gpu_freq_MHz << "MHz is " << theoreticalTime_ns / 1000 << "us." << std::endl;
+        ms /= double(perf_loop);
+        double percentOfPeak_avg = 100 * theoreticalTime_ns / ms / 1000 / 1000;
+        double percentOfPeak_min = 100 * theoreticalTime_ns / min_ms / 1000 / 1000;
+        std::cout << "MxM kernel time, average of " << perf_loop << " run(s): " << ms * 1000 << "us (" << percentOfPeak_avg << "% of theoretical peak (assuming " << gpu_freq_MHz << "MHz frequency))\n";
+        std::cout << "MxM kernel time, min of     " << perf_loop << " run(s): " << min_ms * 1000 << "us (" << percentOfPeak_min << "% of theoretical peak (assuming " << gpu_freq_MHz << "MHz frequency))\n";
+
+        test_result.time_total = ms * 1000;
+        test_result.TOPS       = static_cast<double>(total_ops) / (ms / 1000.0) / 1e12;
+        test_result.percentage = percentOfPeak_avg;
+    }
+    else
+    {
+        ms /= double(perf_loop);
+        std::cout << "MxM kernel time, average of " << perf_loop << " run(s): " << ms * 1000 << "us\n";
+        std::cout << "MxM kernel time, min of     " << perf_loop << " run(s): " << min_ms * 1000 << "us\n";
+
+        test_result.time_total = ms * 1000;
+        test_result.TOPS       = 0.0;
+        test_result.percentage = 0.0;
+    }
+
+    // Upload the result from device memory.
+    result = vkBeginCommandBuffer(commandBuffers[2], &commandBufferBeginInfo); // Begin command buffer recording
+    CHECK_VK(result);
+    {
+        MatrixDesc &m = matrices[MAT_R];
+        VkBufferCopy copy = { 0, 0, m.bufferSize };
+        vkCmdCopyBuffer(commandBuffers[2], m.deviceBuffer, m.hostBuffer, 1, &copy);
+    }
+    result = vkEndCommandBuffer(commandBuffers[2]); // End command buffer recording
+    CHECK_VK(result);
+
+    submitInfo.pCommandBuffers = &commandBuffers[2];
+    result = vkQueueSubmit(submission_queue, 1, &submitInfo, VK_NULL_HANDLE);
+    CHECK_VK(result);
+    result = vkQueueWaitIdle(submission_queue);
+    CHECK_VK(result);
+
+    auto destroyMatrixDesc = [](VkDevice device, MatrixDesc & m)
+    {
+        vkDestroyBuffer(device, m.hostBuffer, NULL);
+        vkDestroyBuffer(device, m.deviceBuffer, NULL);
+        vkFreeMemory(device, m.hostMemory, NULL);
+        vkFreeMemory(device, m.deviceMemory, NULL);
+    };
+
+    // Free the memory/buffers/pipeline for this iteration.
+    for (int i = 0; i < NUM_MATS; ++i) 
+    {
+        destroyMatrixDesc(m_vulkan_instance.m_VulkanDevice, matrices[i]);
+    }
+
+    vkDestroyPipeline(m_vulkan_instance.m_VulkanDevice, pipeline, NULL);
+
+    vkDestroyShaderModule(m_vulkan_instance.m_VulkanDevice, shaderModule, NULL);
+
+    return test_result;
+}
diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_tester.hpp b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.hpp
new file mode 100644
index 0000000..867893d
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.hpp
@@ -0,0 +1,152 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// Cooperative matrix test runner: test/result descriptions and the CooperativeMatrixRunner class.
+///
+#pragma once
+
+#include <vulkan/vulkan.hpp>
+#include "runtime_shader.hpp"
+#include <cstdint>
+#include <optional>
+#include <vector>
+#include <string>
+#include <unordered_map>
+#include <glm/glm.hpp>
+
+// Indices of the matrices used by a test; NUM_MATS is the number of entries.
+// MAT_R is the matrix read back from the device after a run (presumably the result
+// of combining A, B and C -- confirm against the shaders).
+enum
+{
+    MAT_A    = 0,
+    MAT_B    = 1,
+    MAT_C    = 2,
+    MAT_R    = 3,
+    NUM_MATS = 4,
+};
+
+// Kind of workload a test executes. TT_COUNT is the number of test types.
+enum TestType
+{
+    TT_MXM_BASIC    = 0,
+    TT_MXM_VecToMat = 1,
+    TT_CONV         = 2,
+    TT_COUNT,
+};
+
+// Per-matrix transpose policy: forced on, forced off, or left variable.
+enum MatrixTransposeOption
+{
+    ALWAYS_TRUE,
+    ALWAYS_FALSE,
+    VARIABLE,
+};
+
+// How the matrices are filled before a test runs.
+enum FillDataType
+{
+    FILL_WITH_ZERO = 0,
+    FILL_WITH_CONSTANTS,
+    FILL_WITH_RANDON_UINT,
+    FILL_WITH_RANDON_INT,
+    FILL_SEQUENCE_INT,
+    FILL_WITH_RANDOM_LOW_HIGH_INT,
+    FILL_WITH_RANDOM_FLOAT,
+    FILL_WITH_RANDOM_PLUS1_MINUS1_FLOAT,
+};
+
+/// Declares the cooperative-matrix test runner: the data describing individual tests
+/// and their results, the grouping of tests, and the runner state.
+///
+/// All plain-data members carry default initializers so that a default-constructed
+/// description/result never exposes indeterminate values.
+class CooperativeMatrixRunner
+{
+    /// Complete parameter set for one test run (the argument of RunTest).
+    struct TestDescription
+    {
+        TestType test_type = TT_MXM_BASIC;
+        FillDataType fill_data_type = FILL_WITH_RANDON_INT;
+
+        // Frequency used for the theoretical-peak estimate, in MHz.
+        uint32_t gpu_freq_MHz = 900;
+
+        // Component types of the input and output matrices.
+        VkComponentTypeKHR input_type{};
+        VkComponentTypeKHR output_type{};
+
+        // Matrix dimensions, both in elements and in blocks.
+        int MSize = 0;
+        int NSize = 0;
+        int KSize = 0;
+        int MSizeInBlocks = 0;
+        int NSizeInBlocks = 0;
+        int KSizeInBlocks = 0;
+        // Number of timed repetitions -- NOTE(review): presumably fed from m_test_repeats; confirm.
+        uint32_t perf_loop = 0;
+
+        // Memory layout selection for each matrix.
+        bool layoutA_Mfirst = false;
+        bool layoutB_Nfirst = false;
+        bool layoutC_Mfirst = false;
+        bool layoutR_Mfirst = false;
+
+        int inputWidth = 1;
+        int inputHeight = 1;
+    };
+
+    /// Outcome of one test run; the numeric fields are only meaningful when is_valid is true.
+    struct TestResult
+    {
+        bool   is_valid   = false;
+        double time_total = 0.0;
+        double TOPS       = 0.0;
+        double percentage = 0.0;
+    };
+
+    /// One M/N/K size combination, expressed both in blocks and in elements.
+    struct SizeConfiguration
+    {
+        int MSizeInBlocks = 0;
+        int NSizeInBlocks = 0;
+        int KSizeInBlocks = 0;
+
+        int MSize = 0;
+        int NSize = 0;
+        int KSize = 0;
+    };
+
+    /// Template from which a test group is generated: one input/output type pair
+    /// and the list of size configurations to test with it.
+    struct TestGroupTemplateDescription
+    {
+        VkComponentTypeKHR input_type{};
+        VkComponentTypeKHR output_type{};
+
+        std::vector<SizeConfiguration> size_configurations;
+    };
+
+    /// A generated group of tests together with the template it came from.
+    struct TestGroup
+    {
+        /// One row of tests sharing the same layout flags, with their results.
+        struct TestRowEntry
+        {
+            std::vector<TestDescription> test_descriptions;
+            std::vector<TestResult>      test_results;
+
+            bool layoutA_Mfirst = false;
+            bool layoutB_Nfirst = false;
+            bool layoutC_Mfirst = false;
+            bool layoutR_Mfirst = false;
+        };
+
+        TestGroupTemplateDescription template_description;
+        std::vector<TestRowEntry>    test_entries; // One per size_in_block_configuration from the template description
+    };
+
+public:
+
+    CooperativeMatrixRunner(Vulkan& vulkan_instance);
+    ~CooperativeMatrixRunner();
+
+    bool InitializeRunner();
+
+    bool TriggerPendingTests();
+    void RenderUI();
+
+private:
+
+    void PrepareTestSession();
+    // Runs a single test; an empty optional signals that no result is available.
+    std::optional<TestResult> RunTest(const TestDescription& test_description);
+
+private:
+
+    Vulkan& m_vulkan_instance;
+
+    std::vector<VkCooperativeMatrixPropertiesKHR> m_hFoundCooperativeMatrices;
+
+    // Session-wide settings (same defaults as TestDescription where they overlap).
+    TestType     m_test_type           = TT_MXM_BASIC;
+    FillDataType m_fill_data_type      = FILL_WITH_RANDON_INT;
+    int32_t      m_gpu_freq_MHz        = 900;
+    int32_t      m_gpu_microSP         = 12;
+    int32_t      m_gpu_ALU_per_microSP = 2;
+    int32_t      m_gpu_ops_per_mad     = 2;
+
+    // Transpose policy per matrix, indexed by MAT_A..MAT_R.
+    MatrixTransposeOption m_matrix_transpose_options[NUM_MATS] = { VARIABLE , VARIABLE , VARIABLE , ALWAYS_FALSE };
+
+    int  m_test_repeats          = 1;
+    bool m_transpose_when_needed = false;
+    bool m_validate_matrix_result = false;
+
+    // Progress bookkeeping for the current test session.
+    bool     m_is_processing_tests   = false;
+    uint32_t m_total_tests           = 0;
+    uint32_t m_total_processed_tests = 0;
+    std::vector<TestGroupTemplateDescription> m_test_group_templates;
+    std::vector<TestGroup> m_test_groups;
+};
\ No newline at end of file
diff --git a/samples/cooperative_matrix/code/main/runtime_shader.cpp b/samples/cooperative_matrix/code/main/runtime_shader.cpp
new file mode 100644
index 0000000..dec1696
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/runtime_shader.cpp
@@ -0,0 +1,207 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// RuntimeShader implementation: runtime GLSL to SPIR-V compilation and Vulkan shader module creation.
+///
+
+#include "runtime_shader.hpp"
+#include "main/applicationEntrypoint.hpp"
+#include "camera/cameraController.hpp"
+#include "camera/cameraControllerTouch.hpp"
+#include "camera/cameraData.hpp"
+#include "camera/cameraGltfLoader.hpp"
+#include "gui/imguiVulkan.hpp"
+#include "material/drawable.hpp"
+#include "material/vulkan/shaderModule.hpp"
+#include "material/shaderManagerT.hpp"
+#include "material/materialManager.hpp"
+#include "material/vulkan/specializationConstantsLayout.hpp"
+#include "mesh/meshHelper.hpp"
+#include "mesh/meshLoader.hpp"
+#include "system/math_common.hpp"
+#include "texture/textureManager.hpp"
+#include "vulkan/extensionHelpers.hpp"
+#include "imgui.h"
+#include <../external/glslang/glslang/Include/glslang_c_interface.h>
+#include <../external/glslang/glslang/Public/resource_limits_c.h>
+
+#include <random>
+#include <iostream>
+#include <filesystem>
+
+/// Compiles `glsl_code` (with the defines registered on this object) to SPIR-V and
+/// creates a Vulkan shader module from it on `device`.
+/// Returns true on success; on any failure an error is logged, false is returned
+/// and IsValid() reports false.
+bool RuntimeShader::Build(const std::string& glsl_code,
+    VkDevice device,
+    const char* entry_point,
+    glslang_stage_t stage)
+{
+    // Stay invalid until every step below has succeeded.
+    m_is_valid = false;
+
+    m_spirv = CompileGLSLToSPIRV(glsl_code, entry_point, stage, m_defines);
+    if (m_spirv.empty())
+    {
+        LOGE("Runtime Shader failed to compile GLSL into SPIRV blob");
+        return false;
+    }
+
+    // codeSize is expressed in bytes, while m_spirv stores 32-bit words.
+    const VkShaderModuleCreateInfo module_info{
+        .sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .codeSize = m_spirv.size() * sizeof(uint32_t),
+        .pCode    = m_spirv.data(),
+    };
+
+    const VkResult create_result = vkCreateShaderModule(device, &module_info, nullptr, &m_shader_module);
+    if (create_result != VK_SUCCESS)
+    {
+        LOGE("Runtime Shader failed to create vk shader module");
+        return false;
+    }
+
+    m_is_valid = true;
+    return m_is_valid;
+}
+
+/// Compiles GLSL source into SPIR-V words through the glslang C interface.
+///
+/// The given defines are emitted as `#define NAME VALUE` lines directly below the
+/// shader's `#version` line (in the order they appear in `defines`) before compiling.
+///
+/// @param glsl_source GLSL source; must contain a `#version` line terminated by '\n'
+/// @param entry_name  Entry point name; not consumed by the glslang path below
+/// @param stage       glslang shader stage to compile for
+/// @param defines     (name, value) pairs to inject as preprocessor definitions
+/// @return the SPIR-V words, or an empty vector when any step fails (the reason is logged)
+std::vector<uint32_t> RuntimeShader::CompileGLSLToSPIRV(
+    const std::string&                                   glsl_source,
+    const char*                                          entry_name,
+    glslang_stage_t                                      stage,
+    std::span<const std::pair<std::string, std::string>> defines)
+{
+    (void)entry_name; // Kept for interface compatibility; unused by this implementation.
+
+    ////////////////////
+    // COMPOSE SHADER //
+    ////////////////////
+
+    // Search for the directive as a substring. (find_first_of would match the first
+    // character out of the set {v,e,r,s,i,o,n}, e.g. inside a leading comment header.)
+    const size_t version_string_index = glsl_source.find("#version");
+    if (version_string_index == std::string::npos)
+    {
+        LOGE("Shader compilation failed -> Could not locate '#version' string on shader code");
+        return {};
+    }
+
+    // The defines have to go on the line right after the #version directive.
+    const size_t version_line_end_index = glsl_source.find('\n', version_string_index);
+    if (version_line_end_index == std::string::npos)
+    {
+        LOGE("Shader compilation failed -> Could not locate the end of the '#version' line on shader code");
+        return {};
+    }
+    const size_t line_under_version_string_index = version_line_end_index + 1;
+
+    // Build all define lines first so the source is spliced only once.
+    std::string define_block;
+    for (const auto& [define_text, value_text] : defines)
+    {
+        define_block += "#define " + define_text + " " + value_text + "\n";
+    }
+
+    std::string composed_shader_code = glsl_source;
+    composed_shader_code.insert(line_under_version_string_index, define_block);
+
+    /////////////////
+    // GLSLANG API //
+    /////////////////
+
+    glslang_input_t input = {
+        .language = GLSLANG_SOURCE_GLSL,
+        .stage = stage,
+        .client = GLSLANG_CLIENT_VULKAN,
+        .client_version = GLSLANG_TARGET_VULKAN_1_3,
+        .target_language = GLSLANG_TARGET_SPV,
+        .target_language_version = GLSLANG_TARGET_SPV_1_6,
+        .code = composed_shader_code.c_str(),
+        .default_version = 100,
+        .default_profile = GLSLANG_NO_PROFILE,
+        .force_default_version_and_profile = false,
+        .forward_compatible = false,
+        .messages = GLSLANG_MSG_DEFAULT_BIT,
+        .resource = glslang_default_resource(),
+    };
+
+    // Create the shader object exactly once; every exit path below deletes it.
+    // NOTE(review): glslang's C interface expects glslang_initialize_process() to have
+    // been called beforehand -- confirm that this happens elsewhere in the application.
+    glslang_shader_t* shader = glslang_shader_create(&input);
+    if (!shader)
+    {
+        LOGE("Failed to create shader\n");
+        return {};
+    }
+
+    if (!glslang_shader_preprocess(shader, &input))
+    {
+        LOGE("Preprocessing failed:\n%s\n", glslang_shader_get_info_log(shader));
+        glslang_shader_delete(shader);
+        return {};
+    }
+
+    if (!glslang_shader_parse(shader, &input))
+    {
+        LOGE("Parsing failed:\n%s\n", glslang_shader_get_info_log(shader));
+        glslang_shader_delete(shader);
+        return {};
+    }
+
+    glslang_program_t* program = glslang_program_create();
+    glslang_program_add_shader(program, shader);
+
+    if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
+    {
+        LOGE("Linking failed:\n%s\n", glslang_program_get_info_log(program));
+        glslang_program_delete(program);
+        glslang_shader_delete(shader);
+        return {};
+    }
+
+    glslang_program_SPIRV_generate(program, stage);
+
+    // Copy the words out before the program (which owns them) is destroyed.
+    const auto* words = glslang_program_SPIRV_get_ptr(program);
+    const auto  size  = glslang_program_SPIRV_get_size(program);
+
+    std::vector<uint32_t> spirv;
+    if (words != nullptr && size > 0)
+    {
+        spirv.assign(words, words + size);
+    }
+
+    glslang_program_delete(program);
+    glslang_shader_delete(shader);
+
+    return spirv;
+}
\ No newline at end of file
diff --git a/samples/cooperative_matrix/code/main/runtime_shader.hpp b/samples/cooperative_matrix/code/main/runtime_shader.hpp
new file mode 100644
index 0000000..4f8ee83
--- /dev/null
+++ b/samples/cooperative_matrix/code/main/runtime_shader.hpp
@@ -0,0 +1,95 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+///
+/// RuntimeShader: compiles GLSL to SPIR-V at runtime and creates the Vulkan shader module.
+///
+#pragma once
+
+#include <vulkan/vulkan.hpp>
+#include <cstdint>
+#include <vector>
+#include <string>
+#include <span>
+#include <unordered_map>
+#include <../external/glslang/glslang/Include/glslang_c_interface.h>
+#include <../external/glslang/glslang/Public/resource_limits_c.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Class name: RuntimeShader
+////////////////////////////////////////////////////////////////////////////////
+class RuntimeShader
+{
+public:
+
+    /*
+    * Adds a preprocessor definition to be used during shader compilation.
+    * Definitions are stored and handed to the compiler on the next Build() call.
+    *
+    * Anything a std::string can be constructed from (std::string, const char*,
+    * string literals, ...) is stored verbatim; every other type is converted
+    * with std::to_string.
+    *
+    * @tparam OverrideType : Type the argument is converted to before being
+    *                        stringified (defaults to the argument's own type)
+    * @param name : Name of the macro
+    * @param arg : Value of the macro
+    */
+    template<typename Arg, typename OverrideType = Arg>
+    inline void AddDefine(const std::string& name, const Arg& arg)
+    {
+        // Checking constructibility (rather than exact type equality) lets string
+        // literals, which deduce as char arrays, take the string path as well.
+        if constexpr (std::is_constructible_v<std::string, const OverrideType&>)
+        {
+            m_defines.emplace_back(name, std::string(arg));
+        }
+        else
+        {
+            m_defines.emplace_back(name, std::to_string(static_cast<const OverrideType&>(arg)));
+        }
+    }
+
+
+    /*
+    * Builds the shader by compiling GLSL to SPIR-V and creating a Vulkan shader module.
+    * @param glsl_code : GLSL source code as a string
+    * @param device : Vulkan logical device used to create the shader module
+    * @param entry_point : Entry point name in the GLSL code (e.g., "main")
+    * @param stage : Shader stage (e.g., GLSLANG_STAGE_COMPUTE)
+    * @return true if compilation and module creation succeeded
+    * @note If compilation fails, m_is_valid will be false
+    */
+    bool Build(
+        const std::string& glsl_code,
+        VkDevice device,
+        const char* entry_point,
+        glslang_stage_t stage);
+
+    /*
+    * Returns the Vulkan shader module.
+    * @return VkShaderModule handle (VK_NULL_HANDLE until a module has been created)
+    * @note This class declares no destructor, so the handle is not destroyed
+    *       automatically by it.
+    */
+    inline VkShaderModule GetShaderModule() const
+    {
+        return m_shader_module;
+    }
+
+    /*
+    * Checks if the shader was successfully built.
+    * @return true if valid
+    */
+    inline bool IsValid() const
+    {
+        return m_is_valid;
+    }
+
+private:
+
+    /*
+    * Compiles GLSL source to SPIR-V words, injecting the given defines.
+    * @return SPIR-V words; Build() treats an empty vector as a failure
+    */
+    std::vector<uint32_t> CompileGLSLToSPIRV(
+        const std::string&                                   glsl_source,
+        const char*                                          entry_name,
+        glslang_stage_t                                      stage,
+        std::span<const std::pair<std::string, std::string>> defines = {});
+
+private:
+    std::vector<std::pair<std::string, std::string>> m_defines;                         // (name, value) pairs from AddDefine
+    std::vector<uint32_t>                            m_spirv;                           // SPIR-V from the last Build()
+    VkShaderModule                                   m_shader_module = VK_NULL_HANDLE;  // Module created by Build()
+    bool                                             m_is_valid      = false;           // True after a successful Build()
+};
\ No newline at end of file
diff --git a/samples/cooperative_matrix/install_apk.bat b/samples/cooperative_matrix/install_apk.bat
new file mode 100644
index 0000000..62ab3c4
--- /dev/null
+++ b/samples/cooperative_matrix/install_apk.bat
@@ -0,0 +1,21 @@
+@echo off
+:: Installs this project's debug APK on the connected Android device through adb.
+:: The APK is looked up under ..\..\build\android\<project>\outputs\apk\debug\.
+cd /D "%~dp0"
+
+:: Get the name of the current folder (assumed to be the project name)
+for %%I in ("%~dp0.") do set "project_name=%%~nxI"
+
+@echo.
+@echo ****************************************
+@echo Installing APK for project: %project_name%
+@echo ****************************************
+
+set "apk_path=..\..\build\android\%project_name%\outputs\apk\debug\%project_name%-debug.apk"
+
+:: Fail with a clear message when the APK has not been built yet.
+if not exist "%apk_path%" (
+    @echo.
+    @echo ERROR: APK not found: %apk_path%
+    goto :finish
+)
+
+call adb install -r -t "%apk_path%"
+
+:: Only report success when adb actually succeeded.
+if errorlevel 1 (
+    @echo.
+    @echo ERROR: adb install failed.
+    goto :finish
+)
+
+@echo.
+@echo ****************************************
+@echo Done!
+@echo ****************************************
+
+:finish
+:: Pause only if run directly
+IF "%~dpnx0"=="%0" PAUSE
\ No newline at end of file
diff --git a/samples/cooperative_matrix/install_config.bat b/samples/cooperative_matrix/install_config.bat
new file mode 100644
index 0000000..c3d20e8
--- /dev/null
+++ b/samples/cooperative_matrix/install_config.bat
@@ -0,0 +1,31 @@
+
+@echo off
+:: Pushes an optional app_config.txt from this folder to the application's files
+:: directory on the connected Android device through adb.
+cd /D "%~dp0"
+
+:: Get the name of the current folder (assumed to be the project name)
+for %%I in ("%~dp0.") do set "project_name=%%~nxI"
+
+:: Check if app_config.txt exists
+if exist "app_config.txt" (
+    @echo.
+    @echo ****************************************
+    @echo Pushing app_config.txt to: /sdcard/Android/data/com.quic.%project_name%/files/
+    @echo ****************************************
+    adb push ./app_config.txt /sdcard/Android/data/com.quic.%project_name%/files/app_config.txt
+
+    rem Only report success when adb actually succeeded.
+    if errorlevel 1 (
+        @echo.
+        @echo ERROR: adb push failed.
+    ) else (
+        @echo.
+        @echo ****************************************
+        @echo Done!
+        @echo ****************************************
+    )
+) else (
+    @echo.
+    @echo ****************************************
+    @echo No app_config.txt was found.
+    @echo It's not necessary for the app, but it can be used to override application settings.
+    @echo If such functionality is desired, please create the file and override the global variables
+    @echo according to how they are defined in the project.
+    @echo ****************************************
+)
+
+:: Pause only if run directly
+IF "%~dpnx0"=="%0" PAUSE
\ No newline at end of file
diff --git a/samples/cooperative_matrix/project/android/AndroidManifest.xml b/samples/cooperative_matrix/project/android/AndroidManifest.xml
new file mode 100644
index 0000000..6d317b7
--- /dev/null
+++ b/samples/cooperative_matrix/project/android/AndroidManifest.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- BEGIN_INCLUDE(manifest) -->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+android:versionCode="1"
+android:versionName="1.0">
+  <!-- This is the platform API where NativeActivity was introduced. -->
+  <uses-sdk android:minSdkVersion="26" />
+  <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
+  <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
+  <uses-permission android:name="android.permission.INTERNET" />
+  <!-- This .apk has no Java code itself, so set hasCode to false. -->
+  <application
+      android:allowBackup="false"
+      android:fullBackupContent="false"
+      android:icon="@mipmap/ic_launcher"
+      android:label="@string/app_name"
+      android:testOnly="false"
+      android:hasCode="false">
+    <!-- Our activity is the built-in NativeActivity framework class.
+             This will take care of integrating with our NDK code. -->
+    <activity android:name="android.app.NativeActivity"
+        android:label="@string/app_name"
+        android:configChanges="orientation|keyboardHidden|screenSize"
+        android:screenOrientation="reverseLandscape"
+        android:theme="@android:style/Theme.NoTitleBar.Fullscreen"
+        android:exported="true">
+
+      <!-- Tell NativeActivity the name of our .so (our code for the entry point for the app) -->
+      <meta-data android:name="android.app.lib_name"
+          android:value="native-lib" />
+
+      <!-- Support updatable graphics driver -->
+      <meta-data
+          android:name="com.android.graphics.developerdriver.enable"
+          android:value="true" />
+
+      <meta-data android:name="com.android.graphics.injectLayers.enable"
+          android:value="true"/>
+
+      <intent-filter>
+        <action android:name="android.intent.action.MAIN" />
+        <category android:name="android.intent.category.LAUNCHER" />
+      </intent-filter>
+    </activity>
+  </application>
+
+</manifest>
+<!-- END_INCLUDE(manifest) -->
diff --git a/samples/cooperative_matrix/project/android/res/mipmap-hdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-hdpi/ic_launcher.png
new file mode 100644
index 0000000..1b58b37
Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-hdpi/ic_launcher.png differ
diff --git a/samples/cooperative_matrix/project/android/res/mipmap-mdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-mdpi/ic_launcher.png
new file mode 100644
index 0000000..11acf77
Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-mdpi/ic_launcher.png differ
diff --git a/samples/cooperative_matrix/project/android/res/mipmap-xhdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-xhdpi/ic_launcher.png
new file mode 100644
index 0000000..b8016f2
Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-xhdpi/ic_launcher.png differ
diff --git a/samples/cooperative_matrix/project/android/res/mipmap-xxhdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-xxhdpi/ic_launcher.png
new file mode 100644
index 0000000..c0b9e85
Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-xxhdpi/ic_launcher.png differ
diff --git a/samples/cooperative_matrix/project/android/res/mipmap-xxxhdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-xxxhdpi/ic_launcher.png
new file mode 100644
index 0000000..7df4d52
Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-xxxhdpi/ic_launcher.png differ
diff --git a/samples/cooperative_matrix/project/android/res/values/strings.xml b/samples/cooperative_matrix/project/android/res/values/strings.xml
new file mode 100644
index 0000000..2c159d9
--- /dev/null
+++ b/samples/cooperative_matrix/project/android/res/values/strings.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <string name="app_name">SGS Cooperative Matrix</string>
+</resources>
diff --git a/samples/cooperative_matrix/project/img/screenshot.png b/samples/cooperative_matrix/project/img/screenshot.png
new file mode 100644
index 0000000..13f6aa8
Binary files /dev/null and b/samples/cooperative_matrix/project/img/screenshot.png differ
diff --git a/samples/cooperative_matrix/shaders/Blit.frag b/samples/cooperative_matrix/shaders/Blit.frag
new file mode 100644
index 0000000..bf6f90e
--- /dev/null
+++ b/samples/cooperative_matrix/shaders/Blit.frag
@@ -0,0 +1,34 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+#version 400
+
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_ARB_shading_language_420pack : enable
+
+// Buffer binding locations
+#define SHADER_OVERLAY_TEXTURE_LOC          0
+
+layout(set = 0, binding = SHADER_OVERLAY_TEXTURE_LOC) uniform sampler2D u_OverlayTex;
+
+// Varying's
+layout (location = 0) in vec2   v_TexCoord;
+layout (location = 1) in vec4   v_VertColor;
+
+// Finally, the output color
+layout (location = 0) out vec4 FragColor;
+
+//-----------------------------------------------------------------------------
+// Fragment entry point: writes the overlay texture sampled at the interpolated UV.
+void main()
+//-----------------------------------------------------------------------------
+{
+    FragColor = texture(u_OverlayTex, v_TexCoord);
+}
+
diff --git a/samples/cooperative_matrix/shaders/Blit.json b/samples/cooperative_matrix/shaders/Blit.json
new file mode 100644
index 0000000..f9f7df6
--- /dev/null
+++ b/samples/cooperative_matrix/shaders/Blit.json
@@ -0,0 +1,58 @@
+{
+	"$schema": "../../../framework/schema/shaderSchema.json",
+	"Passes": [
+		{
+			"Name": "RP_BLIT",
+			"Shaders": {
+				"Vertex": "Media/Shaders/Blit.vert.spv",
+				"Fragment": "Media/Shaders/Blit.frag.spv"
+			},
+			"DescriptorSets": [
+				{
+					"Buffers": [
+						{
+							"Type": "ImageSampler",
+							"Stages": [ "Fragment" ],
+							"Count": 1,
+							"Names": [ "Overlay" ]
+						}
+					]
+				}
+			],
+			"VertexBindings": [ "VB0" ]
+		}
+	],
+	"Vertex": [
+		{
+			"Span": 60,
+			"Name": "VB0",
+			"Elements": [
+				{
+					"Name": "Position",
+					"Offset": 0,
+					"Type": "Vec3"
+				},
+				{
+					"Name": "Normal",
+					"Offset": 12,
+					"Type": "Vec3"
+				},
+				{
+					"Name": "UV",
+					"Offset": 24,
+					"Type": "Vec2"
+				},
+				{
+					"Name": "Color",
+					"Offset": 32,
+					"Type": "Vec4"
+				},
+				{
+					"Name": "Tangent",
+					"Offset": 48,
+					"Type": "Vec3"
+				}
+			]
+		}
+	]
+}
diff --git a/samples/cooperative_matrix/shaders/Blit.vert b/samples/cooperative_matrix/shaders/Blit.vert
new file mode 100644
index 0000000..d11750a
--- /dev/null
+++ b/samples/cooperative_matrix/shaders/Blit.vert
@@ -0,0 +1,38 @@
+//============================================================================================================
+//
+//
+//                  Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+//                              SPDX-License-Identifier: BSD-3-Clause
+//
+//============================================================================================================
+
+#version 400
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_ARB_shading_language_420pack : enable
+
+#define SHADER_ATTRIB_LOC_POSITION          0
+#define SHADER_ATTRIB_LOC_NORMAL            1
+#define SHADER_ATTRIB_LOC_TEXCOORD0         2
+#define SHADER_ATTRIB_LOC_COLOR             3
+#define SHADER_ATTRIB_LOC_TANGENT           4
+
+layout (location = SHADER_ATTRIB_LOC_POSITION ) in vec4 a_Position;   // only .xyz is read by main()
+layout (location = SHADER_ATTRIB_LOC_NORMAL   ) in vec3 a_Normal;     // declared but not read by main()
+layout (location = SHADER_ATTRIB_LOC_TEXCOORD0) in vec2 a_TexCoord;
+layout (location = SHADER_ATTRIB_LOC_COLOR    ) in vec4 a_Color;      // only .xyz is read; alpha is replaced with 1.0
+layout (location = SHADER_ATTRIB_LOC_TANGENT  ) in vec4 a_Tangent;    // declared but not read by main()
+
+// Varyings written by main() for the fragment stage
+layout (location = 0) out vec2    v_TexCoord;
+layout (location = 1) out vec4    v_VertColor;
+
+void main()
+{
+    // Pass the position through with w forced to 1.0. Y is negated because it
+    // is inverted on screen compared to OpenGL.
+    gl_Position = vec4(a_Position.x, -a_Position.y, a_Position.z, 1.0);
+    v_TexCoord = a_TexCoord;
+
+    // Forward the attribute color's RGB and force alpha to 1.0.
+    v_VertColor = vec4(a_Color.rgb, 1.0);
+}
diff --git a/samples/tile_memory/README.md b/samples/tile_memory/README.md
new file mode 100644
index 0000000..243f385
--- /dev/null
+++ b/samples/tile_memory/README.md
@@ -0,0 +1,8 @@
+# Tile Memory Heap Sample
+
+This sample demonstrates a light clustering algorithm using Vulkan, with specific support for the *VK_QCOM_tile_memory_heap* extension.
+This extension allows the application to allocate and manage tile memory, a dedicated memory heap whose contents persist only within a command buffer submission batch.
+
+The sample showcases how tile memory can be used to optimize rendering performance by reducing memory bandwidth and improving cache locality. It implements a forward rendering pipeline with clustered lighting, where lights are grouped based on screen-space tiles. These tiles are processed using tile-local memory allocations, enabling fast access and minimizing global memory usage.
+
+The rendering technique is designed to highlight the benefits of tile memory in scenarios with many dynamic lights, demonstrating how Vulkan applications can leverage Qualcomm™-specific extensions to achieve better performance on Adreno™ GPUs.
\ No newline at end of file
diff --git a/samples/tile_shading/README.md b/samples/tile_shading/README.md
new file mode 100644
index 0000000..c787a00
--- /dev/null
+++ b/samples/tile_shading/README.md
@@ -0,0 +1,11 @@
+# Tile Shading Sample
+
+This sample demonstrates a tile-based shading technique using Vulkan, with support for the *VK_QCOM_tile_memory_heap* extension.
+
+The extension enables the application to allocate and manage tile-local memory, which is scoped to the duration of a command buffer submission and optimized for high-bandwidth, low-latency access within a tile.
+
+The sample implements a forward rendering pipeline where shading computations are performed per tile, rather than per pixel or per fragment. This approach leverages the tiling architecture of Adreno™ GPUs to reduce memory traffic and improve cache efficiency.
+
+By using tile memory, the sample avoids costly round-trips to global memory for intermediate shading data. Instead, lighting calculations and material evaluations are performed directly in tile-local memory, which is faster and more power-efficient.
+
+The technique is particularly well-suited for mobile GPUs, where bandwidth and power are constrained. It demonstrates how Vulkan applications can take advantage of Qualcomm™-specific extensions to optimize rendering workloads and achieve better performance on Snapdragon™ platforms.
\ No newline at end of file
diff --git a/samples/tile_shading/code/main/application.cpp b/samples/tile_shading/code/main/application.cpp
index d02c336..a263276 100644
--- a/samples/tile_shading/code/main/application.cpp
+++ b/samples/tile_shading/code/main/application.cpp
@@ -456,6 +456,7 @@ bool Application::CreateRenderTargets()
             gSurfaceHeight,
             FinalColorType,
             TextureFormat::UNDEFINED,
+            nullptr,
             blitColorTypes,
         },
         "BLIT RT"))
@@ -475,6 +476,7 @@ bool Application::CreateRenderTargets()
                 gRenderHeight,
                 TileShadingSceneColorType,
                 desiredDepthFormat,
+                nullptr,
                 tileShadingSceneColorTypes, 
                 TT_DEPTH_TARGET_LOCAL_READ,
             },