diff --git a/Config.txt b/Config.txt index e450e62..615d689 100644 --- a/Config.txt +++ b/Config.txt @@ -6,6 +6,7 @@ tests\ samples\ vulkan; framework\vulkan vk_extensions + cooperative_matrix graph_pipelines image_processing tile_shading @@ -21,7 +22,7 @@ samples\ framework\ base; framework\external\GameSampleAssets, framework\external\glm, framework\external\tinyobjloader, framework\external\tinygltf, framework\external\json, framework\external\eigen generic; framework\base, framework\external\imgui, framework\external\implot, framework\external\portable-file-dialogs, framework\external\KTX-Software, Tools - vulkan; framework\generic, framework\external\volk, framework\external\SPIRV-Cross, framework\external\glslang, framework\external\slang, framework\external\VulkanMemoryAllocator, framework\external\Vulkan-Headers + vulkan; framework\generic, framework\external\volk, framework\external\SPIRV-Cross, framework\external\glslang, framework\external\half, framework\external\VulkanMemoryAllocator, framework\external\Vulkan-Headers external\ VulkanMemoryAllocator @ https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator/archive/refs/tags/v3.0.1.tar.gz MD5:8571f3def0ff86f228e2864c907ba0b3 tinyobjloader @ https://github.com/tinyobjloader/tinyobjloader/archive/e39c1737bc61c8dce28be7932cfe839d408e7838.zip @@ -35,9 +36,9 @@ framework\ volk @ https://github.com/zeux/volk/archive/1e0ec168f1726e6389b8647435a3018f0cef9428.zip SPIRV-Cross @ https://github.com/KhronosGroup/SPIRV-Cross/archive/7affe74d77f93a622bb5002789d5332d32e512ee.zip glslang @ https://github.com/KhronosGroup/glslang/archive/3a7f78758f8faa9a6e059b09e25fc64ede7fbfb0.zip - slang @ https://github.com/shader-slang/slang/archive/9c2024a7509baae921083d49a56e1321c51f00ec.zip json @ https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz MD5:4b67aba51ddf17c798e80361f527f50e eigen @ https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz MD5:4c527a9171d71a72a9d4186e65bea559 
KTX-Software @ https://github.com/KhronosGroup/KTX-Software/archive/refs/tags/v4.1.0.tar.gz MD5:b35fc412cdb3a00aa92aadcdd1e5f004 PATCH:..\cmake\KTX-Software.diff; Tools D3D12MemoryAllocator @ https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator/archive/7597f717c7b32b74d263009ecc15985b517585c7.zip + half @ https://github.com/ramenhut/half/archive/43473931db0fae8ecef4ff1492ad18061e3600ec.zip diff --git a/framework/CMakeLists.txt b/framework/CMakeLists.txt index 238fe67..1e54461 100644 --- a/framework/CMakeLists.txt +++ b/framework/CMakeLists.txt @@ -519,12 +519,22 @@ if(FRAMEWORK_ENABLE_VULKAN AND FRAMEWORK_framework_vulkan) endif() # Add vulkan framework dependency libraries + set(ENABLE_OPT OFF CACHE BOOL "Disable SPIRV-Tools optimizer" FORCE) + set(ENABLE_C_INTERFACE ON CACHE BOOL "" FORCE) add_subdirectory(external/volk) - #add_subdirectory(external/SPIRV-Cross) - #add_subdirectory(external/glslang) + add_subdirectory(external/SPIRV-Cross) + add_subdirectory(external/glslang) #add_subdirectory(external/slang) add_library(framework_vulkan STATIC ${CPP_VULKAN_SRC} ${EXTERNAL_VULKAN_SRC}) + + # Make sure framework_vulkan builds after glslang libs + add_dependencies(framework_vulkan + glslang + glslang-default-resource-limits + SPIRV + ) + target_include_directories(framework_vulkan PUBLIC code) target_include_directories(framework_vulkan PUBLIC external) target_include_directories(framework_vulkan PUBLIC external/glm) # so code can do #include "glm/mat3x3.hpp" etc @@ -532,7 +542,7 @@ if(FRAMEWORK_ENABLE_VULKAN AND FRAMEWORK_framework_vulkan) target_include_directories(framework_vulkan PUBLIC external/imgui) target_include_directories(framework_vulkan PUBLIC external/implot) - target_link_libraries(framework_vulkan framework) + target_link_libraries(framework_vulkan PUBLIC framework) get_target_property(VulkanHeaders_INCLUDE_DIRS Vulkan::Headers INTERFACE_INCLUDE_DIRECTORIES) target_include_directories(framework_vulkan PUBLIC 
"${VulkanHeaders_INCLUDE_DIRS}") @@ -545,18 +555,20 @@ if(FRAMEWORK_ENABLE_VULKAN AND FRAMEWORK_framework_vulkan) target_include_directories(framework_vulkan PUBLIC external/slang/include) # Link vulkan framework library dependencies - target_link_libraries(framework_vulkan volk) - #target_link_libraries(framework_vulkan spirv-cross-core) - #target_link_libraries(framework_vulkan spirv-cross-cpp) - #target_link_libraries(framework_vulkan spirv-cross-glsl) - #target_link_libraries(framework_vulkan spirv-cross-hlsl) - #target_link_libraries(framework_vulkan spirv-cross-msl) - #target_link_libraries(framework_vulkan spirv-cross-reflect) - #target_link_libraries(framework_vulkan spirv-cross-util) - #target_link_libraries(framework_vulkan SPIRV) - #target_link_libraries(framework_vulkan glslang) - #target_link_libraries(framework_vulkan glslang-default-resource-limits) - #target_link_libraries(framework_vulkan slang) + target_link_libraries( + framework_vulkan + PUBLIC + glslang + glslang-default-resource-limits + spirv-cross-core + spirv-cross-cpp + spirv-cross-glsl + spirv-cross-hlsl + spirv-cross-msl + spirv-cross-reflect + spirv-cross-util + SPIRV + ) target_compile_definitions(framework_vulkan PUBLIC VK_ENABLE_BETA_EXTENSIONS) @@ -660,6 +672,21 @@ if(FRAMEWORK_LIB_OUTPUT) target_link_libraries(framework_dx12_shared PUBLIC framework_dx12) endif() + + # Copy external libraries using a local target's post-build step + add_custom_command(TARGET framework_vulkan POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${FRAMEWORK_LIB_OUTPUT}/${CMAKE_BUILD_TYPE}/$) + add_custom_command(TARGET framework_vulkan POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${FRAMEWORK_LIB_OUTPUT}/${CMAKE_BUILD_TYPE}/$) + add_custom_command(TARGET framework_vulkan POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${FRAMEWORK_LIB_OUTPUT}/${CMAKE_BUILD_TYPE}/$) + endif() diff --git a/framework/code/graphicsApi/renderTarget.hpp 
b/framework/code/graphicsApi/renderTarget.hpp index 5d0e331..5ec33fc 100644 --- a/framework/code/graphicsApi/renderTarget.hpp +++ b/framework/code/graphicsApi/renderTarget.hpp @@ -13,12 +13,15 @@ class GraphicsApiBase; template class RenderTarget; +class RenderTargetBase; + struct RenderTargetInitializeInfo { uint32_t Width = 0; uint32_t Height = 0; std::span LayerFormats = {}; TextureFormat DepthFormat = TextureFormat::UNDEFINED; + RenderTargetBase* InheritedDepthAttachment = nullptr; const std::span TextureTypes = {}; const std::optional DepthTextureType = std::nullopt; std::span Msaa = {}; diff --git a/framework/code/vulkan/extensionLib.cpp b/framework/code/vulkan/extensionLib.cpp index eeb93aa..0c35d66 100644 --- a/framework/code/vulkan/extensionLib.cpp +++ b/framework/code/vulkan/extensionLib.cpp @@ -261,6 +261,20 @@ namespace ExtensionLib } #endif // VK_KHR_fragment_shading_rate +#if VK_KHR_cooperative_matrix + void Ext_VK_KHR_cooperative_matrix::PrintFeatures() const + { + LOGI("VK_KHR_cooperative_matrix (VkPhysicalDeviceCooperativeMatrixFeaturesKHR): "); + LOGI(" cooperativeMatrix: %s", this->AvailableFeatures.cooperativeMatrix ? "True" : "False"); + LOGI(" cooperativeMatrixRobustBufferAccess: %s", this->AvailableFeatures.cooperativeMatrixRobustBufferAccess ? 
"True" : "False"); + } + void Ext_VK_KHR_cooperative_matrix::PrintProperties() const + { + LOGI("VK_KHR_cooperative_matrix (VkPhysicalDeviceCooperativeMatrixPropertiesKHR): "); + LOGI(" cooperativeMatrixSupportedStages: %d", this->Properties.cooperativeMatrixSupportedStages); + } +#endif // VK_KHR_cooperative_matrix + #if VK_KHR_create_renderpass2 void Ext_VK_KHR_create_renderpass2::LookupFunctionPointers( VkInstance vkInstance ) { diff --git a/framework/code/vulkan/extensionLib.hpp b/framework/code/vulkan/extensionLib.hpp index beda17d..37055e6 100644 --- a/framework/code/vulkan/extensionLib.hpp +++ b/framework/code/vulkan/extensionLib.hpp @@ -239,6 +239,7 @@ namespace ExtensionLib #endif // VK_KHR_create_renderpass2 + #if VK_KHR_draw_indirect_count struct Ext_VK_KHR_draw_indirect_count : public VulkanFunctionPointerExtensionHelper @@ -253,6 +254,36 @@ namespace ExtensionLib #endif // VK_KHR_draw_indirect_count +#if VK_KHR_cooperative_matrix + + struct Ext_VK_KHR_cooperative_matrix : public VulkanFeaturesPropertiesAndFunctionPointerExtensionHelper< + VkPhysicalDeviceCooperativeMatrixFeaturesKHR, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR, + VkPhysicalDeviceCooperativeMatrixPropertiesKHR, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR> + { + static constexpr auto Name = VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME; + explicit Ext_VK_KHR_cooperative_matrix(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) : VulkanFeaturesPropertiesAndFunctionPointerExtensionHelper(Name, status) + { + } + + void PopulateRequestedFeatures() override + { + RequestedFeatures.sType = AvailableFeatures.sType; + RequestedFeatures.cooperativeMatrix = AvailableFeatures.cooperativeMatrix; + RequestedFeatures.cooperativeMatrixRobustBufferAccess = AvailableFeatures.cooperativeMatrixRobustBufferAccess; + } + void LookupFunctionPointers(VkInstance vkInstance) override + { + m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = 
(PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vkInstance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); + } + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + void PrintFeatures() const override; + void PrintProperties() const override; + + PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = nullptr; + }; + +#endif // VK_KHR_cooperative_matrix + #if VK_KHR_depth_stencil_resolve struct Ext_VK_KHR_depth_stencil_resolve : public VulkanFunctionPointerExtensionHelper diff --git a/framework/code/vulkan/renderTarget.cpp b/framework/code/vulkan/renderTarget.cpp index 89a7c75..dee2f38 100644 --- a/framework/code/vulkan/renderTarget.cpp +++ b/framework/code/vulkan/renderTarget.cpp @@ -53,6 +53,8 @@ RenderTarget& RenderTarget::operator=( RenderTarget&& sr m_ClearColorValues = std::move( src.m_ClearColorValues ); m_ResolveAttachments = std::move( src.m_ResolveAttachments ); m_DepthAttachment = std::move( src.m_DepthAttachment ); + m_InheritedDepthAttachment = std::move( src.m_InheritedDepthAttachment); + src.m_InheritedDepthAttachment = nullptr; m_FrameBuffer = std::move( src.m_FrameBuffer ); m_FrameBufferDepthOnly = std::move( src.m_FrameBufferDepthOnly ); @@ -107,15 +109,30 @@ bool RenderTarget::Initialize( Vulkan* pVulkan, const RenderTargetInitia m_pLayerFormats.assign( info.LayerFormats.begin(), info.LayerFormats.end() ); - if (!InitializeDepth(depthTextureType)) - return false; + if (info.InheritedDepthAttachment) + { + auto* inheritedDepthAttachment = apiCast(info.InheritedDepthAttachment); + if (!inheritedDepthAttachment->m_DepthAttachment) + { + return false; + } + + m_InheritedDepthAttachment = &inheritedDepthAttachment->m_DepthAttachment; + m_DepthFormat = m_InheritedDepthAttachment->Format; + } + else + { + if (!InitializeDepth(depthTextureType)) + return false; + } + if (!InitializeColor(colorTextureTypes)) return false; if 
(!InitializeResolve( info.ResolveTextureFormats )) return false; - if (renderPass && *renderPass && !CreateFrameBuffer( *renderPass, m_ColorAttachments, &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBuffer )) + if (renderPass && *renderPass && !CreateFrameBuffer( *renderPass, m_ColorAttachments, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBuffer )) return false; - if (renderPassDepthOnly && *renderPassDepthOnly && m_DepthAttachment && !CreateFrameBuffer( *renderPassDepthOnly, {}, &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBufferDepthOnly )) + if (renderPassDepthOnly && *renderPassDepthOnly && (m_InheritedDepthAttachment || m_DepthAttachment) && !CreateFrameBuffer( *renderPassDepthOnly, {}, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, m_ResolveAttachments, nullptr/*pVRSAttachment*/, &m_FrameBufferDepthOnly )) return false; return true; @@ -154,7 +171,7 @@ bool RenderTarget::Initialize(Vulkan* pVulkan, uint32_t uiWidth, uint32_ bool RenderTarget::InitializeFrameBuffer( Vulkan* pVulkan, const RenderPass& renderPass ) //----------------------------------------------------------------------------- { - bool success = CreateFrameBuffer( renderPass, m_ColorAttachments, &m_DepthAttachment, m_ResolveAttachments, nullptr, &m_FrameBuffer ); + bool success = CreateFrameBuffer( renderPass, m_ColorAttachments, m_InheritedDepthAttachment ? 
m_InheritedDepthAttachment : &m_DepthAttachment, m_ResolveAttachments, nullptr, &m_FrameBuffer ); return success; } @@ -162,7 +179,7 @@ bool RenderTarget::InitializeFrameBuffer( Vulkan* pVulkan, const RenderP bool RenderTarget::InitializeFrameBufferDepthOnly( Vulkan* pVulkan, const RenderPass& renderPassDepthOnly ) //----------------------------------------------------------------------------- { - bool success = CreateFrameBuffer( renderPassDepthOnly, {}, &m_DepthAttachment, {}, nullptr, &m_FrameBufferDepthOnly ); + bool success = CreateFrameBuffer( renderPassDepthOnly, {}, m_InheritedDepthAttachment ? m_InheritedDepthAttachment : &m_DepthAttachment, {}, nullptr, &m_FrameBufferDepthOnly ); return success; } @@ -321,6 +338,7 @@ void RenderTarget::Release() m_Msaa.clear(); m_DepthAttachment.Release(m_pVulkan); + m_InheritedDepthAttachment = nullptr; m_DepthFormat = TextureFormat::UNDEFINED; m_FrameBufferDepthOnly = {}; diff --git a/framework/code/vulkan/renderTarget.hpp b/framework/code/vulkan/renderTarget.hpp index 7ea397f..8f51a9f 100644 --- a/framework/code/vulkan/renderTarget.hpp +++ b/framework/code/vulkan/renderTarget.hpp @@ -225,6 +225,7 @@ class RenderTarget final : public RenderTargetBase // The Depth Attachment TextureVulkan m_DepthAttachment; + TextureVulkan* m_InheritedDepthAttachment = nullptr; // Note: Not owning // The Frame Buffer Framebuffer m_FrameBuffer; diff --git a/framework/code/vulkan/vulkan.cpp b/framework/code/vulkan/vulkan.cpp index 260b1ef..3a9e7d3 100644 --- a/framework/code/vulkan/vulkan.cpp +++ b/framework/code/vulkan/vulkan.cpp @@ -787,6 +787,9 @@ bool Vulkan::RegisterKnownExtensions() m_ExtKhrSynchronization2 = m_DeviceExtensions.GetExtension(); m_ExtKhrDrawIndirectCount = m_DeviceExtensions.GetExtension(); m_ExtRenderPass2 = m_DeviceExtensions.GetExtension(); + m_ExtBufferDeviceAddress = m_DeviceExtensions.GetExtension(); + m_Ext8BitStorage = m_DeviceExtensions.GetExtension(); + m_ExtCooperativeMatrix = 
m_DeviceExtensions.GetExtension(); m_ExtFragmentShadingRate = m_DeviceExtensions.GetExtension(); m_ExtMeshShader = m_DeviceExtensions.GetExtension(); m_ExtDynamicRendering = m_DeviceExtensions.GetExtension(); @@ -1234,6 +1237,7 @@ bool Vulkan::GetDataGraphProcessingEngine() // isn't fully supported publicly by the driver #if defined(OS_ANDROID) { +#if 0 auto* Ext_VK_ARM_tensors = static_cast(m_DeviceExtensions.GetExtension(VK_ARM_TENSORS_EXTENSION_NAME)); auto* Ext_VK_ARM_data_graph = static_cast(m_DeviceExtensions.GetExtension(VK_ARM_DATA_GRAPH_EXTENSION_NAME)); auto fpGetDeviceProcAddr = (PFN_vkGetDeviceProcAddr)vkGetInstanceProcAddr(GetVulkanInstance(), "vkGetDeviceProcAddr"); @@ -1244,14 +1248,27 @@ bool Vulkan::GetDataGraphProcessingEngine() { LOGI("Forcing registering and enabling Graph Pipelines extensions for Android"); - Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded; - Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr); - Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanInstance); + try + { + Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded; + Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr); + Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanInstance); + + Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded; + Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr); + Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanInstance); + + LOGI("Forcing registering and enabling Graph Pipelines extensions for Android - Done"); + } + catch (...) 
+ { + Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded; + Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded; - Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded; - Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr); - Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanInstance); + LOGI("Forcing registering and enabling Graph Pipelines extensions for Android - Failed, disabling EXT"); + } } +#endif } #endif diff --git a/framework/code/vulkan/vulkan.hpp b/framework/code/vulkan/vulkan.hpp index b463a87..3d74fa2 100644 --- a/framework/code/vulkan/vulkan.hpp +++ b/framework/code/vulkan/vulkan.hpp @@ -67,6 +67,9 @@ namespace ExtensionLib { struct Ext_VK_EXT_hdr_metadata; struct Ext_VK_KHR_fragment_shading_rate; struct Ext_VK_KHR_create_renderpass2; + struct Ext_VK_KHR_buffer_device_address; + struct Ext_VK_KHR_8bit_storage; + struct Ext_VK_KHR_cooperative_matrix; struct Ext_VK_KHR_synchronization2; struct Ext_VK_QCOM_tile_properties; struct Ext_VK_QCOM_tile_shading; @@ -1077,6 +1080,9 @@ class Vulkan : public ::GraphicsApiBase const ExtensionLib::Ext_VK_EXT_hdr_metadata* m_ExtHdrMetadata = nullptr; const ExtensionLib::Ext_VK_KHR_fragment_shading_rate* m_ExtFragmentShadingRate = nullptr; const ExtensionLib::Ext_VK_KHR_create_renderpass2* m_ExtRenderPass2 = nullptr; + const ExtensionLib::Ext_VK_KHR_buffer_device_address* m_ExtBufferDeviceAddress = nullptr; + const ExtensionLib::Ext_VK_KHR_8bit_storage* m_Ext8BitStorage = nullptr; + const ExtensionLib::Ext_VK_KHR_cooperative_matrix* m_ExtCooperativeMatrix = nullptr; const ExtensionLib::Ext_VK_KHR_synchronization2* m_ExtKhrSynchronization2 = nullptr; const ExtensionLib::Ext_VK_QCOM_tile_properties* m_ExtQcomTileProperties = nullptr; const ExtensionLib::Ext_VK_QCOM_tile_shading* m_ExtQcomTileShading = nullptr; diff --git a/project/android/cmake/FrameworkApplicationHelper.cmake b/project/android/cmake/FrameworkApplicationHelper.cmake index c820f96..3ecb364 
100644 --- a/project/android/cmake/FrameworkApplicationHelper.cmake +++ b/project/android/cmake/FrameworkApplicationHelper.cmake @@ -55,6 +55,20 @@ target_link_libraries( ${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/../../../../.. #endif() target_link_libraries( ${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libframework.a ) target_link_libraries( ${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libframework_base.a ) + +target_link_libraries( + ${TARGET_NAME} + ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libglslang.a +) +target_link_libraries( + ${TARGET_NAME} + ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libglslang-default-resource-limits.a +) +target_link_libraries( + ${TARGET_NAME} + ${CMAKE_CURRENT_BINARY_DIR}/../../../../../android/framework/${CMAKE_BUILD_TYPE}/libSPIRV.a +) + target_include_directories(${TARGET_NAME} PUBLIC ../../framework/code) target_include_directories(${TARGET_NAME} PUBLIC ../../framework/external) target_include_directories(${TARGET_NAME} PUBLIC ../../framework/external/glm) # so code can do #include "glm/mat3x3.hpp" etc diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 0000000..17be11b --- /dev/null +++ b/samples/README.md @@ -0,0 +1,100 @@ +# Samples + +Unless noted, all samples run on Windows and Android. + +## [empty](empty) + +Empty app. Minimal app linked against Framework. + +## [hello-gltf](hello-gltf) + +Scene (gltf) loading app. Implements a working scene with camera movement and minimal lighting. + +## [AODemo](AODemo) + +Vulkan implementation of Neural Network Ambient Occlusion. + +## [FrameworkTest](FrameworkTest) + +Simple test project that initializes the Vulkan Framework and displays a textured sphere. + +## [MLClothApp](MLClothApp) + +Sample project using machine learning to lower cloth simulation cost. 
+ +## [deferredLpac](deferredLpac) + +App that renders a (reasonably) complex scene using forward rendering and compute shaders. + +Where LPAC (Low Priority Asynchronous Compute) is available, the Compute jobs will be done on a low priority queue during shadow pass z-buffer write. + +## [DspOffload](dspOffload) + +App illustrating how the Hexagon DSP can be used to run graphics tasks and write results to GPU-accessible Android Hardware Buffers. + +## [forward](forward) + +App illustrating a reasonably complex forward rendered scene. + +## [hdrSwapchain](hdrSwapchain) + +Demonstrates the use of different swapchain image formats and colorspaces. Has a gui dropdown that allows for switching buffer formats on the fly. + +Also demonstrates Qualcomm Vulkan render-pass transform extension [VK_QCOM_render_pass_transform](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_QCOM_render_pass_transform.html) + +## [rayQueryShadows](rayQueryShadows) + +Uses Vulkan Ray Tracing extension (VK_KHR_ray_tracing) to implement shadows using Ray Queries. + +Currently Windows only. + +## [rotatedCopy](rotatedCopy) + +Uses VK_QCOM_rotated_copy_commands (and VK_KHR_copy_commands2) extension to blit from a (lower resolution) intermediate render target to the device framebuffer rotated to match the device's native orientation (and thus avoiding the Android SurfaceFlinger doing an additional rotation/composition step). + +## [shaderResolve](shaderResolve) + +Uses VK_QCOM_render_pass_shader_resolve extension to implement MSAA and order-independent transparency in a deferred renderer. + +## [shaderResolveTonemap](shaderResolveTonemap) + +Uses VK_QCOM_render_pass_shader_resolve to perform a filmic tonemapping operator (on a simple forward rendered scene) as part of the MSAA resolve. + +Optionally runs the tonemap/resolve as a subpass of the main scene pass. 
Has onscreen UI controls to modify MSAA sample counts and to enable/disable the shader resolve and use of subpasses (for measuring GPU subpass/shader-resolve efficiency). + +## [atmospherics](atmospherics) + +Atmospheric lighting. + +# Configuration + +Each sample can be configured by adding an 'app_config.txt' file in the root of the relevant sample (i.e. samples/forward/app_config.txt). + +On Android the app_config.txt needs to be pushed to the device, into /sdcard/Android/data/ANDROID_APP_ID/files/. Many samples have a batch file to do this (e.g. 07_InstallConfig.bat). + +If this file is missing or empty the sample application should run with 'reasonable' defaults. + +Samples share a set of common settings and can define additional settings specific to the sample's functionality. + +## Common config settings + +gFramesToRender = x + +Render a specific number of frames before exiting the app. x should be an integer. 0 (default) will render 'forever'. + +# File handling + +## Windows + +Executables are compiled to project\windows\solution\samples\APPLICATION\Debug\APPLICATION.exe + +Executables should be run from the samples\APPLICATION folder and data files (textures, models, shaders) are loaded from the Media subfolder. The Visual Studio solution is pre-configured to run the exe from the correct folder. + +## Android + +Apk application bundles are compiled to build\android\APPLICATION\outputs\apk\debug\APPLICATION-debug.apk + +So long as the sample's Media files were prepared (02_PrepareMedia.bat) before building the apk, the apk is stand-alone and contains the application executable and Media files. + +If desired, any files in the Media folder can be 'overridden' by copying the relevant file to /sdcard/Android/data/ANDROID_APP_ID/files/. with the expected folder path. E.g. you can copy a shader file from Media\Shaders\. to /sdcard/Android/data/ANDROID_APP_ID/files/Media/Shaders/. and see your new shader code when the application is re-launched. 
+ diff --git a/samples/cooperative_matrix/CMakeLists.txt b/samples/cooperative_matrix/CMakeLists.txt new file mode 100644 index 0000000..3fa97e6 --- /dev/null +++ b/samples/cooperative_matrix/CMakeLists.txt @@ -0,0 +1,97 @@ + +cmake_minimum_required (VERSION 3.21) + +project (cooperative_matrix C CXX) +set(CMAKE_CXX_STANDARD 20) + +# +# Source files included in this application. +# +set(CPP_SRC + code/main/application.cpp + code/main/application.hpp + code/main/cooperative_matrix_tester.cpp + code/main/cooperative_matrix_tester.hpp + code/main/runtime_shader.cpp + code/main/runtime_shader.hpp +) + +# +# Setup the module path to include the 'project directory' (project/windows or project/android) +# +if(NOT DEFINED PROJECT_ROOT_DIR) + # Windows can use CMAKE_SOURCE_DIR; Android's gradle passes -DPROJECT_ROOT_DIR=${project.rootDir} + set(PROJECT_ROOT_DIR ${CMAKE_SOURCE_DIR}) +endif() +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_ROOT_DIR}/cmake) + +# +# Do all the build steps for a Framework application. +# (Defines ${TARGET_NAME} inside the helper; do NOT modify the helper that links .a files.) +# +include(FrameworkApplicationHelper) + +# ------------------------------------------------------------------------------ +# Half (float16) dependency under ../../framework/external +# Build as a normal CMake target and link it to ${TARGET_NAME}. 
+# ------------------------------------------------------------------------------ + +add_library(half STATIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../framework/external/half/half.cpp +) + +# Expose headers to dependents (your app target will inherit this include path) +target_include_directories(half + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../framework/external +) + +# Nicer grouping in IDEs +source_group(TREE "${CMAKE_CURRENT_SOURCE_DIR}/../../framework/external" PREFIX "external" FILES + ${CMAKE_CURRENT_SOURCE_DIR}/../../framework/external/half/half.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../framework/external/half/half.h +) + +# Link half into the application/library target defined by the helper +target_link_libraries(${TARGET_NAME} half) + +# +# Setup asset source and target folders +# + +# cmake will use our GameSampleAssets (default for no parameter) as root directory for any asset request +# (see FrameworkApplicationHelper.cmake for more info) +inject_root_asset_path() + +# Register local variables for asset request, while also defining them in the C++ code for easy access +# Here we use the default destination paths, all defined at FrameworkApplicationHelper.cmake +register_local_asset_path(SHADER_DESTINATION "${DEFAULT_LOCAL_SHADER_DESTINATION}") +register_local_asset_path(MESH_DESTINATION "${DEFAULT_LOCAL_MESH_DESTINATION}") +register_local_asset_path(TEXTURE_DESTINATION "${DEFAULT_LOCAL_TEXTURE_DESTINATION}") + +# +# Add in the contents of 'shaders' directory +# +include(AddShadersDir) + +# Search and include all project shaders +scan_for_shaders() + +# +# Copy required models to local folders +# +include(ModelPackager) + +# Scene GLTF +add_gltf(scenes/SteamPunkSauna/SteamPunkSauna.gltf) + +# +# Convert and copy textures to local folders +# +include(TexturePackager) + +# Scene Textures +add_textures_from_path(scenes/SteamPunkSauna UASTC) + +# Supporting Textures +add_textures_from_path(textures) diff --git a/samples/cooperative_matrix/README.md 
b/samples/cooperative_matrix/README.md new file mode 100644 index 0000000..9b1226c --- /dev/null +++ b/samples/cooperative_matrix/README.md @@ -0,0 +1,66 @@ +# Hello-GLTF Sample + +![Screenshot](img/screenshot.png) + +## Overview + +The Hello GLTF sample demonstrates the most basic usage of the Framework to produce a native Vulkan application. It is designed to be small and simple, and is meant as a starting point for developers to expand its functionality. + +It is recommended that this sample is used as a starting point for other applications using this Framework. To do so, copy this folder to a desired location and make the respective changes so that the source and include paths point to the Framework `src` and `include` folders. For simplicity, the folder of the new sample can be created alongside the `hello-gltf` sample within the `samples` folder. This way, no additional modifications are required in any configuration file to build it out of the box. + +## Building + +### Dependencies + +The following dependencies must be installed and the appropriate locations should be referenced in the `PATH` environment variable. + +* Android SDK +* Android NDK +* Gradle +* CMake +* Android Studio + +### Pre-Build + +Compile the underlying shaders to .spv by running the batch file below: + +``` +01_CompileShaders.bat +``` + +And convert the needed textures and shaders to the correct format using the batch file below: + +``` +02_PrepareMedia.bat +``` + +Note: The sample assumes there are user-provided asset files at the following path: **'Media\Meshes\Museum.gltf'** and **'Media\Meshes\Museum.bin'**. +Texture dependencies from this asset should be added to **'Media\Textures\'** and are required to have the *.ktx* extension. +There are 3 extra required supporting textures that should also go to the same texture path listed above: **white_d.ktx**, **black_d.ktx** and **normal_default.ktx**. 
+The framework team is working to build a centralized asset repository that should minimize these requirements in the near future. + +### Build + +Once the dependencies are installed and shaders compiled, building this sample .apk/.exe is as simple as running any of the batch files from the framework root directory, according to your target system: + +``` +01_BuildAndroid.bat +02_BuildWindows.bat +``` + +### Deploy (android-only) + +To deploy the media files and the .apk to a connected device, run the batch files below: + +``` +02_CopyMediaToDevice.bat +03_Install_APK.bat +``` + +If desired, you can keep track of any logging by running one of the logcat batch files (which you can find in the current directory). + +## Android Studio + +This sample can also be easily imported into Android Studio and be used within the Android Studio ecosystem, including building, deploying, and native code debugging. + +To do this, open Android Studio and go to `File->New->Import Project...` and select the `project\android` folder as the source for the import. This will load up the gradle configuration and, once finalized, the sample can be used within Android Studio. diff --git a/samples/cooperative_matrix/code/main/application.cpp b/samples/cooperative_matrix/code/main/application.cpp new file mode 100644 index 0000000..b4327fa --- /dev/null +++ b/samples/cooperative_matrix/code/main/application.cpp @@ -0,0 +1,733 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app demonstrating the loading of a .gltf file (hello world) +/// + +#include "application.hpp" +#include "main/applicationEntrypoint.hpp" +#include "camera/cameraController.hpp" +#include "camera/cameraControllerTouch.hpp" +#include "camera/cameraData.hpp" +#include "camera/cameraGltfLoader.hpp" +#include "gui/imguiVulkan.hpp" +#include "material/vulkan/computable.hpp" +#include "material/vulkan/drawable.hpp" +#include "material/drawableLoader.hpp" +#include "material/vulkan/materialManager.hpp" +#include "material/vulkan/shaderModule.hpp" +#include "material/vulkan/shaderManager.hpp" +#include "material/vulkan/specializationConstantsLayout.hpp" +#include "mesh/meshHelper.hpp" +#include "mesh/meshLoader.hpp" +#include "system/math_common.hpp" +#include "texture/vulkan/textureManager.hpp" +#include "vulkan/extensionHelpers.hpp" +#include "vulkan/extensionLib.hpp" +#include "imgui.h" + +#include +#include +#include + +namespace +{ + static constexpr std::array sRenderPassNames = { "RP_HUD", "RP_BLIT" }; + + glm::vec3 gCameraStartPos = glm::vec3(26.48f, 20.0f, -5.21f); + glm::vec3 gCameraStartRot = glm::vec3(0.0f, 110.0f, 0.0f); + + float gFOV = PI_DIV_4; + float gNearPlane = 1.0f; + float gFarPlane = 1800.0f; + float gNormalAmount = 0.3f; + float gNormalMirrorReflectAmount = 0.05f; +} + +/// +/// @brief Implementation of the Application entrypoint (called by the framework) +/// @return Pointer to Application (derived from @FrameworkApplicationBase). +/// Creates the Application class. Ownership is passed to the calling (framework) function. 
+/// +FrameworkApplicationBase* Application_ConstructApplication() +{ + return new Application(); +} + +Application::Application() : ApplicationHelperBase() +{ +} + +Application::~Application() +{ +} + +//----------------------------------------------------------------------------- +void Application::PreInitializeSetVulkanConfiguration(Vulkan::AppConfiguration& config) +//----------------------------------------------------------------------------- +{ + ApplicationHelperBase::PreInitializeSetVulkanConfiguration(config); // base-class configuration first, then this sample's extensions + config.RequiredExtension(); + config.RequiredExtension(); + config.RequiredExtension(); + config.RequiredExtension(); + config.RequiredExtension(); + config.OptionalExtension(); +} + +//----------------------------------------------------------------------------- +bool Application::Initialize(uintptr_t windowHandle, uintptr_t hInstance) +//----------------------------------------------------------------------------- +{ + if (!ApplicationHelperBase::Initialize( windowHandle, hInstance )) + { + return false; + } + + if (GetVulkan()->HasLoadedVulkanDeviceExtension(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) // runner is only created when the extension was loaded; otherwise m_cooperative_matrix_runner stays empty + { + GetVulkan()->WaitUntilIdle(); + m_cooperative_matrix_runner = std::make_unique(*GetVulkan()); + LOGI("Initializing cooperative matrix runner"); + if (!m_cooperative_matrix_runner->InitializeRunner()) + { + return false; + } + LOGI("Cooperative matrix runner initialized!"); + } + + if (!InitializeCamera()) + { + return false; + } + + if (!LoadShaders()) // creates the shader and material managers that LoadMeshObjects() uses + { + return false; + } + + if (!CreateRenderTargets()) + { + return false; + } + + if (!InitAllRenderPasses()) // fills RenderContext, which InitGui() and LoadMeshObjects() read + { + return false; + } + + if (!InitGui(windowHandle)) + { + return false; + } + + if (!LoadMeshObjects()) + { + return false; + } + + if (!InitCommandBuffers()) + { + return false; + } + + if (!InitLocalSemaphores()) + { + return false; + } + + if (!BuildCmdBuffers()) // begins/ends recording of every pass's ObjectsCmdBuffer and adds the blit quad to RP_BLIT + { + return false; + } + + return true; +} + 
+//----------------------------------------------------------------------------- +void Application::Destroy() +//----------------------------------------------------------------------------- +{ + Vulkan* const pVulkan = GetVulkan(); + + // Cmd buffers + for (int whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + for (auto& cmdBuffer : m_RenderPassData[whichPass].PassCmdBuffer) + { + cmdBuffer.Release(); + } + + for (auto& cmdBuffer : m_RenderPassData[whichPass].ObjectsCmdBuffer) + { + cmdBuffer.Release(); + } + + m_RenderPassData[whichPass].RenderTarget.Release(); + } + + // Render passes / Context / Semaphores + for (int whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + vkDestroySemaphore(pVulkan->m_VulkanDevice, m_RenderPassData[whichPass].PassCompleteSemaphore, nullptr); + m_RenderPassData[whichPass].RenderContext.clear(); + } + + // Drawables + m_BlitQuadDrawable.reset(); + + // Internal + m_ShaderManager.reset(); + m_MaterialManager.reset(); + m_CameraController.reset(); + m_AssetManager.reset(); + + ApplicationHelperBase::Destroy(); +} + +//----------------------------------------------------------------------------- +bool Application::InitializeCamera() +//----------------------------------------------------------------------------- +{ + LOGI("******************************"); + LOGI("Initializing Camera..."); + LOGI("******************************"); + + m_Camera.SetPosition(gCameraStartPos, glm::quat(gCameraStartRot * TO_RADIANS)); + m_Camera.SetAspect(float(gRenderWidth) / float(gRenderHeight)); + m_Camera.SetFov(gFOV); + m_Camera.SetClipPlanes(gNearPlane, gFarPlane); + + // Camera Controller // + +#if defined(OS_ANDROID) + typedef CameraControllerTouch tCameraController; +#else + typedef CameraController tCameraController; +#endif + + auto cameraController = std::make_unique(); + if (!cameraController->Initialize(gRenderWidth, gRenderHeight)) + { + return false; + } + + m_CameraController = std::move(cameraController); + + 
return true; +} + +//----------------------------------------------------------------------------- +bool Application::LoadShaders() +//----------------------------------------------------------------------------- +{ + m_ShaderManager = std::make_unique(*GetVulkan()); + m_ShaderManager->RegisterRenderPassNames(sRenderPassNames); + + m_MaterialManager = std::make_unique(*GetVulkan()); + + LOGI("******************************"); + LOGI("Loading Shaders..."); + LOGI("******************************"); + + typedef std::pair tIdAndFilename; + for (const tIdAndFilename& i : + { tIdAndFilename { "Blit", "Blit.json" } + }) + { + if (!m_ShaderManager->AddShader(*m_AssetManager, i.first, i.second, SHADER_DESTINATION_PATH)) + { + LOGE("Error Loading shader %s from %s", i.first.c_str(), i.second.c_str()); + LOGI("Please verify if you have all required assets on the sample media folder"); + LOGI("If you are running on Android, don't forget to run the `02_CopyMediaToDevice.bat` script to copy all media files into the device memory"); + return false; + } + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Application::CreateRenderTargets() +//----------------------------------------------------------------------------- +{ + Vulkan* const pVulkan = GetVulkan(); + + LOGI("**************************"); + LOGI("Creating Render Targets..."); + LOGI("**************************"); + + TextureFormat vkDesiredDepthFormat = pVulkan->GetBestSurfaceDepthFormat(); + TextureFormat desiredDepthFormat = vkDesiredDepthFormat; + + const TextureFormat MainColorType[] = { TextureFormat::R8G8B8A8_SRGB }; + const TextureFormat HudColorType[] = { TextureFormat::R8G8B8A8_SRGB }; + + // Notice no depth on the HUD RT + if (!m_RenderPassData[RP_HUD].RenderTarget.Initialize(pVulkan, gSurfaceWidth, gSurfaceHeight, HudColorType, TextureFormat::UNDEFINED, Msaa::Samples1, "HUD RT")) + { + LOGE("Unable to create hud render target"); + return false; + } + 
+ return true; +} + +//----------------------------------------------------------------------------- +bool Application::InitAllRenderPasses() +//----------------------------------------------------------------------------- +{ + Vulkan* const pVulkan = GetVulkan(); + + // ColorInputUsage | ClearDepthRenderPass | ColorOutputUsage | DepthOutputUsage | ClearColor + m_RenderPassData[RP_HUD].PassSetup = { RenderPassInputUsage::Clear, false, RenderPassOutputUsage::StoreReadOnly, RenderPassOutputUsage::Discard, {}}; + m_RenderPassData[RP_BLIT].PassSetup = { RenderPassInputUsage::DontCare, true, RenderPassOutputUsage::Present, RenderPassOutputUsage::Discard, {}}; + + TextureFormat surfaceFormat = pVulkan->m_SurfaceFormat; + auto swapChainColorFormat = std::span({ &surfaceFormat, 1 }); + auto swapChainDepthFormat = pVulkan->m_SwapchainDepth.format; + + LOGI("******************************"); + LOGI("Initializing Render Passes... "); + LOGI("******************************"); + + for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + bool isSwapChainRenderPass = whichPass == RP_BLIT; + + std::span colorFormats = isSwapChainRenderPass ? swapChainColorFormat : m_RenderPassData[whichPass].RenderTarget.m_pLayerFormats; + TextureFormat depthFormat = isSwapChainRenderPass ? 
swapChainDepthFormat : m_RenderPassData[whichPass].RenderTarget.m_DepthFormat; + + const auto& passSetup = m_RenderPassData[whichPass].PassSetup; + auto& passData = m_RenderPassData[whichPass]; + + RenderPass renderPass; + if (!pVulkan->CreateRenderPass( + { colorFormats }, + depthFormat, + Msaa::Samples1, + passSetup.ColorInputUsage, + passSetup.ColorOutputUsage, + passSetup.ClearDepthRenderPass, + passSetup.DepthOutputUsage, + renderPass)) + { + return false; + } + + Framebuffer framebuffer; + if (!isSwapChainRenderPass) + { + framebuffer.Initialize(*pVulkan, + renderPass, + passData.RenderTarget.m_ColorAttachments, + &passData.RenderTarget.m_DepthAttachment, + sRenderPassNames[whichPass]); + } + + passData.RenderContext.push_back({ std::move(renderPass), {}/*pipeline*/, std::move(framebuffer), sRenderPassNames[whichPass] }); + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Application::InitGui(uintptr_t windowHandle) +//----------------------------------------------------------------------------- +{ + const auto& hudRenderTarget = m_RenderPassData[RP_HUD].RenderTarget; + m_Gui = std::make_unique(*GetVulkan(), m_RenderPassData[RP_HUD].RenderContext[0].GetRenderPass().Copy()); + if (!m_Gui->Initialize(windowHandle, TextureFormat::R8G8B8A8_UNORM, hudRenderTarget.m_Width, hudRenderTarget.m_Height)) + { + return false; + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Application::LoadMeshObjects() +//----------------------------------------------------------------------------- +{ + Vulkan* const pVulkan = GetVulkan(); + + LOGI("***********************"); + LOGI("Initializing Meshes... 
"); + LOGI("***********************"); + + const auto* pBlitQuadShader = m_ShaderManager->GetShader("Blit"); + if (!pBlitQuadShader) + { + return false; + } + + LOGI("*********************"); + LOGI("Creating Quad mesh..."); + LOGI("*********************"); + + Mesh blitQuadMesh; + if (!MeshHelper::CreateMesh( + pVulkan->GetMemoryManager(), + MeshObjectIntermediate::CreateScreenSpaceMesh(), + 0, + pBlitQuadShader->m_shaderDescription->m_vertexFormats, + &blitQuadMesh)) + { + return false; + } + + // Blit Material + auto blitQuadShaderMaterial = m_MaterialManager->CreateMaterial(*pBlitQuadShader, 2, + [this](const std::string& texName) -> const MaterialManager::tPerFrameTexInfo + { + if (texName == "Overlay") + { + return { &m_RenderPassData[RP_HUD].RenderTarget.m_ColorAttachments[0] }; + } + return {}; + }, + [this](const std::string& bufferName) -> PerFrameBuffer + { + return {}; + } + ); + + m_BlitQuadDrawable = std::make_unique(*pVulkan, std::move(blitQuadShaderMaterial)); + if (!m_BlitQuadDrawable->Init(m_RenderPassData[RP_BLIT].RenderContext[0], std::move(blitQuadMesh))) + { + return false; + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Application::InitCommandBuffers() +//----------------------------------------------------------------------------- +{ + LOGI("*******************************"); + LOGI("Initializing Command Buffers..."); + LOGI("*******************************"); + + Vulkan* const pVulkan = GetVulkan(); + + auto GetPassName = [](uint32_t whichPass) + { + if (whichPass >= sRenderPassNames.size()) + { + LOGE("GetPassName() called with unknown pass (%d)!", whichPass); + return "RP_UNKNOWN"; + } + + return sRenderPassNames[whichPass]; + }; + + m_RenderPassData[RP_HUD].PassCmdBuffer.resize(NUM_VULKAN_BUFFERS); + m_RenderPassData[RP_HUD].ObjectsCmdBuffer.resize(NUM_VULKAN_BUFFERS); + m_RenderPassData[RP_BLIT].PassCmdBuffer.resize(pVulkan->m_SwapchainImageCount); + 
m_RenderPassData[RP_BLIT].ObjectsCmdBuffer.resize(pVulkan->m_SwapchainImageCount); + + char szName[256]; + const auto CmdBuffLevel = CommandListBase::Type::Secondary; + for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + for (uint32_t whichBuffer = 0; whichBuffer < m_RenderPassData[whichPass].PassCmdBuffer.size(); whichBuffer++) + { + // The Pass Command Buffer => Primary + sprintf(szName, "Primary (%s; Buffer %d of %d)", GetPassName(whichPass), whichBuffer + 1, NUM_VULKAN_BUFFERS); + if (!m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].Initialize(pVulkan, szName, CommandListBase::Type::Primary)) + { + return false; + } + + // Model => Secondary + sprintf(szName, "Model (%s; Buffer %d of %d)", GetPassName(whichPass), whichBuffer + 1, NUM_VULKAN_BUFFERS); + if (!m_RenderPassData[whichPass].ObjectsCmdBuffer[whichBuffer].Initialize(pVulkan, szName, CmdBuffLevel)) + { + return false; + } + } + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Application::InitLocalSemaphores() +//----------------------------------------------------------------------------- +{ + LOGI("********************************"); + LOGI("Initializing Local Semaphores..."); + LOGI("********************************"); + + const VkSemaphoreCreateInfo SemaphoreInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO }; + + for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + VkResult retVal = vkCreateSemaphore(GetVulkan()->m_VulkanDevice, &SemaphoreInfo, NULL, &m_RenderPassData[whichPass].PassCompleteSemaphore); + if (!CheckVkError("vkCreateSemaphore()", retVal)) + { + return false; + } + } + + return true; +} + +//----------------------------------------------------------------------------- +bool Application::BuildCmdBuffers() +//----------------------------------------------------------------------------- +{ + LOGI("***************************"); + LOGI("Building Command Buffers..."); + 
LOGI("****************************"); + + Vulkan* const pVulkan = GetVulkan(); + + // Begin recording + for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + auto& renderPassData = m_RenderPassData[whichPass]; + bool bisSwapChainRenderPass = whichPass == RP_BLIT; + + for (uint32_t whichBuffer = 0; whichBuffer < renderPassData.ObjectsCmdBuffer.size(); whichBuffer++) + { + auto& cmdBufer = renderPassData.ObjectsCmdBuffer[whichBuffer]; + + uint32_t targetWidth = bisSwapChainRenderPass ? pVulkan->m_SurfaceWidth : renderPassData.RenderTarget.GetWidth(); + uint32_t targetHeight = bisSwapChainRenderPass ? pVulkan->m_SurfaceHeight : renderPassData.RenderTarget.GetHeight(); + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float)targetWidth; + viewport.height = (float)targetHeight; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset.x = 0; + scissor.offset.y = 0; + scissor.extent.width = targetWidth; + scissor.extent.height = targetHeight; + + // Set up some values that change based on render pass + VkRenderPass whichRenderPass = renderPassData.RenderContext[0].GetRenderPass().mRenderPass; + VkFramebuffer whichFramebuffer = bisSwapChainRenderPass ? 
pVulkan->m_SwapchainBuffers[whichBuffer].framebuffer : renderPassData.RenderContext[0].GetFramebuffer()->m_FrameBuffer; + + // Objects (can render into any pass except Blit) + if (!cmdBufer.Begin(whichFramebuffer, whichRenderPass, bisSwapChainRenderPass)) + { + return false; + } + vkCmdSetViewport(cmdBufer.m_VkCommandBuffer, 0, 1, &viewport); + vkCmdSetScissor(cmdBufer.m_VkCommandBuffer, 0, 1, &scissor); + } + } + + // Blit quad drawable + AddDrawableToCmdBuffers(*m_BlitQuadDrawable.get(), m_RenderPassData[RP_BLIT].ObjectsCmdBuffer.data(), 1, static_cast(m_RenderPassData[RP_BLIT].ObjectsCmdBuffer.size())); + + // End recording + for (uint32_t whichPass = 0; whichPass < NUM_RENDER_PASSES; whichPass++) + { + auto& renderPassData = m_RenderPassData[whichPass]; + + for (uint32_t whichBuffer = 0; whichBuffer < renderPassData.ObjectsCmdBuffer.size(); whichBuffer++) + { + auto& cmdBufer = renderPassData.ObjectsCmdBuffer[whichBuffer]; + if (!cmdBufer.End()) + { + return false; + } + } + } + + return true; +} + +//----------------------------------------------------------------------------- +void Application::UpdateGui() +//----------------------------------------------------------------------------- +{ + if (m_Gui) + { + m_Gui->Update(); + ImGuiIO& io = ImGui::GetIO(); + + if (ImGui::Begin("FPS", (bool*)nullptr, ImGuiWindowFlags_NoTitleBar)) + { + ImGui::Text("FPS: %.1f", m_CurrentFPS); + ImGui::Text("Camera [%f, %f, %f]", m_Camera.Position().x, m_Camera.Position().y, m_Camera.Position().z); + + if (m_cooperative_matrix_runner) + { + m_cooperative_matrix_runner->RenderUI(); + } + } + ImGui::End(); + + return; + } +} + +//----------------------------------------------------------------------------- +void Application::Render(float fltDiffTime) +//----------------------------------------------------------------------------- +{ + Vulkan* const pVulkan = GetVulkan(); + + if (m_cooperative_matrix_runner) + { + pVulkan->WaitUntilIdle(); + 
m_cooperative_matrix_runner->TriggerPendingTests(); + } + + // Obtain the next swap chain image for the next frame. + auto currentVulkanBuffer = pVulkan->SetNextBackBuffer(); + uint32_t whichBuffer = currentVulkanBuffer.idx; + + // ******************************** + // Application Draw() - Begin + // ******************************** + + UpdateGui(); + + // Update camera + m_Camera.UpdateController(fltDiffTime * 10.0f, *m_CameraController); + m_Camera.UpdateMatrices(); + + // First time through, wait for the back buffer to be ready + std::span pWaitSemaphores = { ¤tVulkanBuffer.semaphore, 1 }; + + const VkPipelineStageFlags DefaultGfxWaitDstStageMasks[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT }; + + // RP_HUD + VkCommandBuffer guiCommandBuffer = VK_NULL_HANDLE; + if (m_Gui) + { + // Render gui (has its own command buffer, optionally returns vk_null_handle if not rendering anything) + guiCommandBuffer = GetGui()->Render(whichBuffer, m_RenderPassData[RP_HUD].RenderContext[0].GetFramebuffer()->m_FrameBuffer); + if (guiCommandBuffer != VK_NULL_HANDLE) + { + BeginRenderPass(whichBuffer, RP_HUD, currentVulkanBuffer.swapchainPresentIdx); + vkCmdExecuteCommands(m_RenderPassData[RP_HUD].PassCmdBuffer[whichBuffer].m_VkCommandBuffer, 1, &guiCommandBuffer); + EndRenderPass(whichBuffer, RP_HUD); + + // Submit the commands to the queue. + SubmitRenderPass(whichBuffer, RP_HUD, pWaitSemaphores, DefaultGfxWaitDstStageMasks, { &m_RenderPassData[RP_HUD].PassCompleteSemaphore,1 }); + pWaitSemaphores = { &m_RenderPassData[RP_HUD].PassCompleteSemaphore,1 }; + } + } + + // Blit Results to the screen + { + BeginRenderPass(whichBuffer, RP_BLIT, currentVulkanBuffer.swapchainPresentIdx); + AddPassCommandBuffer(whichBuffer, RP_BLIT); + EndRenderPass(whichBuffer, RP_BLIT); + + // Submit the commands to the queue. 
+ SubmitRenderPass(whichBuffer, RP_BLIT, pWaitSemaphores, DefaultGfxWaitDstStageMasks, { &m_RenderPassData[RP_BLIT].PassCompleteSemaphore,1 }, currentVulkanBuffer.fence); + pWaitSemaphores = { &m_RenderPassData[RP_BLIT].PassCompleteSemaphore,1 }; + } + + // Queue is loaded up, tell the driver to start processing + pVulkan->PresentQueue(pWaitSemaphores, currentVulkanBuffer.swapchainPresentIdx); + + // ******************************** + // Application Draw() - End + // ******************************** +} + +//----------------------------------------------------------------------------- +void Application::BeginRenderPass(uint32_t whichBuffer, RENDER_PASS whichPass, uint32_t WhichSwapchainImage) +//----------------------------------------------------------------------------- +{ + Vulkan* const pVulkan = GetVulkan(); + auto& renderPassData = m_RenderPassData[whichPass]; + bool bisSwapChainRenderPass = whichPass == RP_BLIT; + + if (!m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].Reset()) + { + LOGE("Pass (%d) command buffer Reset() failed !", whichPass); + } + + if (!m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].Begin()) + { + LOGE("Pass (%d) command buffer Begin() failed !", whichPass); + } + + VkFramebuffer framebuffer = nullptr; + switch (whichPass) + { + case RP_HUD: + framebuffer = m_RenderPassData[whichPass].RenderContext[0].GetFramebuffer()->m_FrameBuffer; + break; + case RP_BLIT: + framebuffer = pVulkan->m_SwapchainBuffers[WhichSwapchainImage].framebuffer; + break; + default: + framebuffer = nullptr; + break; + } + + assert(framebuffer != nullptr); + + VkRect2D passArea = {}; + passArea.offset.x = 0; + passArea.offset.y = 0; + passArea.extent.width = bisSwapChainRenderPass ? pVulkan->m_SurfaceWidth : renderPassData.RenderTarget.m_Width; + passArea.extent.height = bisSwapChainRenderPass ? 
pVulkan->m_SurfaceHeight : renderPassData.RenderTarget.m_Height; + + TextureFormat swapChainColorFormat = pVulkan->m_SurfaceFormat; + auto swapChainColorFormats = std::span({ &swapChainColorFormat, 1 }); + TextureFormat swapChainDepthFormat = pVulkan->m_SwapchainDepth.format; + std::span colorFormats = bisSwapChainRenderPass ? swapChainColorFormats : m_RenderPassData[whichPass].RenderTarget.m_pLayerFormats; + TextureFormat depthFormat = bisSwapChainRenderPass ? swapChainDepthFormat : m_RenderPassData[whichPass].RenderTarget.m_DepthFormat; + + VkClearColorValue clearColor = { renderPassData.PassSetup.ClearColor[0], renderPassData.PassSetup.ClearColor[1], renderPassData.PassSetup.ClearColor[2], renderPassData.PassSetup.ClearColor[3] }; + + m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].BeginRenderPass( + passArea, + 0.0f, + 1.0f, + { &clearColor , 1 }, + (uint32_t)colorFormats.size(), + depthFormat != TextureFormat::UNDEFINED, + m_RenderPassData[whichPass].RenderContext[0].GetRenderPass().mRenderPass, + bisSwapChainRenderPass, + framebuffer, + VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS); +} + + +//----------------------------------------------------------------------------- +void Application::AddPassCommandBuffer(uint32_t whichBuffer, RENDER_PASS whichPass) +//----------------------------------------------------------------------------- +{ + if (m_RenderPassData[whichPass].ObjectsCmdBuffer[whichBuffer].m_NumDrawCalls) + { + vkCmdExecuteCommands(m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].m_VkCommandBuffer, 1, &m_RenderPassData[whichPass].ObjectsCmdBuffer[whichBuffer].m_VkCommandBuffer); + } +} + +//----------------------------------------------------------------------------- +void Application::EndRenderPass(uint32_t whichBuffer, RENDER_PASS whichPass) +//----------------------------------------------------------------------------- +{ + m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].EndRenderPass(); +} + 
+//----------------------------------------------------------------------------- +void Application::SubmitRenderPass(uint32_t whichBuffer, RENDER_PASS whichPass, const std::span WaitSemaphores, const std::span WaitDstStageMasks, std::span SignalSemaphores, VkFence CompletionFence) +//----------------------------------------------------------------------------- +{ + m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].End(); // finish recording the pass command buffer before it is submitted + m_RenderPassData[whichPass].PassCmdBuffer[whichBuffer].QueueSubmit(WaitSemaphores, WaitDstStageMasks, SignalSemaphores, CompletionFence); // semaphores, stage masks and fence are forwarded unchanged +} diff --git a/samples/cooperative_matrix/code/main/application.hpp b/samples/cooperative_matrix/code/main/application.hpp new file mode 100644 index 0000000..daa2400 --- /dev/null +++ b/samples/cooperative_matrix/code/main/application.hpp @@ -0,0 +1,112 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app that runs cooperative matrix tests (when VK_KHR_cooperative_matrix is loaded) and shows them through an ImGui HUD +/// +#pragma once + +#include "main/applicationHelperBase.hpp" +#include "memory/vulkan/uniform.hpp" +#include "vulkan/commandBuffer.hpp" +#include "cooperative_matrix_tester.hpp" +#include "vulkan/renderPass.hpp" +#include + +enum RENDER_PASS +{ + RP_HUD = 0, + RP_BLIT, + NUM_RENDER_PASSES +}; + +// ********************** +// Render Pass +// ********************** +struct PassSetupInfo +{ + RenderPassInputUsage ColorInputUsage; + bool ClearDepthRenderPass; + RenderPassOutputUsage ColorOutputUsage; + RenderPassOutputUsage DepthOutputUsage; + glm::vec4 ClearColor; +}; + +struct PassData +{ + // Pass internal data + PassSetupInfo PassSetup; + std::vector> RenderContext; // context per framebuffer (some passes might all point to the same framebuffers) + + // Recorded objects that are set to be drawn on this pass + std::vector< CommandListVulkan> ObjectsCmdBuffer; + + // Command buffer used to dispatch the render pass + std::vector< CommandListVulkan> PassCmdBuffer; + + // Semaphore passed as the signal semaphore when this pass is submitted (indicates completion of the underlying render pass) + VkSemaphore PassCompleteSemaphore = VK_NULL_HANDLE; + + // Render target used by the underlying render pass + // note: The blit pass uses the backbuffer directly instead of this RT + RenderTarget RenderTarget; +}; + +// ********************** +// Application +// ********************** +class Application : public ApplicationHelperBase +{ +public: + Application(); + ~Application() override; + + // ApplicationHelperBase + virtual void PreInitializeSetVulkanConfiguration(Vulkan::AppConfiguration& config) override; + virtual bool Initialize(uintptr_t windowHandle, uintptr_t hInstance) override; + virtual void Destroy() override; + virtual void Render(float fltDiffTime) override; + +private: + + // Application - 
Initialization + bool InitializeCamera(); + bool LoadShaders(); + bool CreateRenderTargets(); + bool InitAllRenderPasses(); + bool InitGui(uintptr_t windowHandle); + bool LoadMeshObjects(); + bool InitCommandBuffers(); + bool InitLocalSemaphores(); + bool BuildCmdBuffers(); + +private: + + // Application - Frame + void BeginRenderPass(uint32_t WhichBuffer, RENDER_PASS WhichPass, uint32_t WhichSwapchainImage); + void AddPassCommandBuffer(uint32_t WhichBuffer, RENDER_PASS WhichPass); + void EndRenderPass(uint32_t WhichBuffer, RENDER_PASS WhichPass); + void SubmitRenderPass(uint32_t WhichBuffer, RENDER_PASS WhichPass, const std::span WaitSemaphores, const std::span WaitDstStageMasks, std::span SignalSemaphores, VkFence CompletionFence = (VkFence)nullptr); + void UpdateGui(); + +private: + + // Render passes + std::array< PassData, NUM_RENDER_PASSES> m_RenderPassData; + + // Drawables + std::unique_ptr m_BlitQuadDrawable; + + // Shaders + std::unique_ptr m_ShaderManager; + + // Materials + std::unique_ptr m_MaterialManager; + + std::unique_ptr m_cooperative_matrix_runner; // only created in Initialize() when VK_KHR_cooperative_matrix was loaded; checked for empty before use +}; diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp b/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp new file mode 100644 index 0000000..e99ac0b --- /dev/null +++ b/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp @@ -0,0 +1,347 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app demonstrating the loading of a .gltf file (hello world) +/// +#pragma once + +#include + +const char* Test01_MxM_Basic = R"( +#version 450 core +#pragma use_vulkan_memory_model +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader + +#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable + +// These specialized constants are set inside the host +layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0 +layout(constant_id = 1) const uint lsy = 2; // local_size_y set inside the host and map to constant_id = 1 +layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 +layout(constant_id = 3) const uint TOTAL_M = 1; +layout(constant_id = 4) const uint TOTAL_N = 1; +layout(constant_id = 5) const uint TOTAL_K = 1; +layout(constant_id = 6) const uint TILE_M = 1; +layout(constant_id = 7) const uint TILE_N = 1; +layout(constant_id = 8) const uint TILE_K = 1; +layout(constant_id = 9) const bool layoutA_Mfirst = false; +layout(constant_id = 10) const bool layoutB_Kfirst = false; +layout(constant_id = 11) const bool layoutC_Mfirst = false; +layout(constant_id = 12) const bool layoutR_Mfirst = 
false; +layout(constant_id = 13) const uint strideAinElements = 1; +layout(constant_id = 14) const uint strideBinElements = 1; +layout(constant_id = 15) const uint strideCinElements = 1; +layout(constant_id = 16) const uint strideRinElements = 1; + +// #defines set on compiler GLSL to SPIR-V command line: +// A_TYPE = e.g. float or float16_t +// R_TYPE = e.g. float or float16_t + +layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; +layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; +layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; +layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; + +//layout(set=0, binding=0, std430) uniform Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params; + +// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Very simple shader, similar to our OpenCL MxM https://github.qualcomm.com/grtrt/TCU/blob/main/Kernels/MxM_Baseline_Sections.cl +// +void main() +{ + //int la = (layoutA_Mfirst ? 1 : 0); + //int lb = (layoutB_Kfirst ? 
1 : 0); + // Example of how to use printf, for details https://confluence.qualcomm.com/confluence/display/GCEA/Use+printf%28...%29+inside+a+Vulkan+shader+using+GLSL + //if ( (gl_GlobalInvocationID.x == 0) && (gl_GlobalInvocationID.y == 0) && (gl_GlobalInvocationID.z == 0)) + // debugPrintfEXT("\nMxM_Basic.comp with:\nTOTAL_M(%d), TOTAL_N(%d), TOTAL_K(%d)\nTILE_M(%d), TILE_N(%d), TILE_K(%d)\nlayoutA_Mfirst(%d), layoutB_Kfirst(%d)\nWGSize(%d, %d, %d), numWG(%d, %d, %d)\n", + // TOTAL_M, TOTAL_N, TOTAL_K, TILE_M, TILE_N, TILE_K, la, lb, + // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z, + // gl_NumWorkGroups.x, gl_NumWorkGroups.y, gl_NumWorkGroups.z); + // //debugPrintfEXT("\nRunning GLSL shader MxM_Basic.comp at GlobalInvocationID(0,0,0), WorkGroupSize(%d, %d, %d)\n", gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); + +// if ((gl_GlobalInvocationID.x == 0) && (gl_GlobalInvocationID.y == 0) && (gl_GlobalInvocationID.z == 0)) +// debugPrintfEXT("\nMxM_Basic.comp with gl_SubgroupSize(%d)\n", gl_SubgroupSize); + + const uint32_t block_id_m = gl_GlobalInvocationID.y; + const uint32_t block_id_n = gl_GlobalInvocationID.z; + if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; + + const uint32_t row = block_id_m * TILE_M; + const uint32_t col = block_id_n * TILE_N; + + // Initialize result matR to zero, not using matC in this shader + coopmat matR; + matR = coopmat(0.0); + + for (uint32_t step = 0; step < TOTAL_K; step += TILE_K) + { + // On each iteration, load a row of cooperative matrices from matrix A, + // load a column of cooperative matrices from matrix B, and multiply all + // pairs of those matrices. + uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step; + uint32_t subMatrixBStartInElements = layoutB_Kfirst ? 
col * strideBinElements + step : col + step * strideBinElements; + + //debugPrintfEXT("\nstep(%d), subMatrixAStartInElements(%d), strideAinElements(%d), subMatrixBStartInElements(%d), strideBinElements(%d)\nat GlobalID(%d, %d, %d), LocalID(%d, %d, %d), WGID(%d, %d, %d)\n", + // step, subMatrixAStartInElements, strideAinElements, subMatrixBStartInElements, strideBinElements, + // gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z, + // gl_LocalInvocationID.x, gl_LocalInvocationID.y, gl_LocalInvocationID.z, + // gl_WorkGroupID.x, gl_WorkGroupID.y, gl_WorkGroupID.z); + + + coopmat matA; + coopMatLoad(matA, inputA.x, subMatrixAStartInElements, strideAinElements, int(layoutA_Mfirst)); + + coopmat matB; + coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst)); + + //for (int i = (gl_LocalInvocationID.x > 63 ? 20 : 0); i < 100; i++) // diable unroll, test gpu_freq, should around 1% + matR = coopMatMulAdd(matA, matB, matR); + } + + // Store results + uint32_t subMatrixRStartInElements = layoutR_Mfirst ? 
col * strideRinElements + row : row * strideRinElements + col; + + coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst)); + + // Example of how to peek results before storing back + //float f = float(matR[1]); + //if (gl_LocalInvocationIndex == 0) debugPrintfEXT("matR[0]=%f\n", f); + +} +)"; + +const char* Test03_CONV = R"( +#version 450 core +#pragma use_vulkan_memory_model +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader + +#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable +#extension GL_QCOM_cooperative_matrix_conversion : enable + +// These specialized constants are set inside the host +layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0 +layout(constant_id = 1) const uint lsy = 2; // local_size_y set inside the host and map to constant_id = 1 +layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 +layout(constant_id = 3) const uint TOTAL_M = 1; +layout(constant_id = 4) const uint TOTAL_N = 1; +layout(constant_id = 5) const uint TOTAL_K = 1; +layout(constant_id = 6) const uint TILE_M = 1; +layout(constant_id = 7) const uint TILE_N = 1; +layout(constant_id = 8) const uint TILE_K = 1; +layout(constant_id = 9) const uint INPUT_W = 1; +layout(constant_id = 10) const uint INPUT_H = 1; 
+layout(constant_id = 11) const uint FILTER_W = 1; +layout(constant_id = 12) const uint FILTER_H = 1; +layout(constant_id = 13) const uint DILATION = 1; +layout(constant_id = 14) const uint STRIDE = 1; +layout(constant_id = 15) const uint strideAinElements = 1; +layout(constant_id = 16) const uint strideBinElements = 1; +layout(constant_id = 17) const uint strideCinElements = 1; +layout(constant_id = 18) const uint strideRinElements = 1; + +// #defines set on compiler GLSL to SPIR-V command line: +// A_TYPE = e.g. float or float16_t +// R_TYPE = e.g. float or float16_t + +layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; +layout(set=0, binding=0) readonly buffer InputAuint { uint32_t x[]; } inputAuint; +layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; +layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; +layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; + +// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Very simple shader, similar to our OpenCL CONV https://github.qualcomm.com/grtrt/TCU/blob/main/Kernels/Conv_MxM_Short.cl +// +void main() +{ + const uint32_t block_id_m = gl_GlobalInvocationID.y; + const uint32_t block_id_n = gl_GlobalInvocationID.z; + if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; + + const uint32_t row = block_id_m * TILE_M; + const uint32_t col = block_id_n * TILE_N; + + uint32_t gidx_m = gl_GlobalInvocationID.x + TILE_M * gl_GlobalInvocationID.y; // fibers along M + uint32_t out_col_id = gidx_m % INPUT_W; + uint32_t out_row_id = gidx_m / INPUT_W; + + uint32_t filter_offset_h = (FILTER_H % 2 == 0)? 0 : FILTER_H/2; + uint32_t filter_offset_w = (FILTER_W % 2 == 0)? 
0 : FILTER_W/2; + + // Initialize result matR to zero, not using matC in this shader + coopmat matR; + matR = coopmat(0.0); + + for (uint32_t step = 0; step < TOTAL_K; step += TILE_K) + { + uint32_t subMatrixBStartInElements = col * FILTER_H * FILTER_W * strideBinElements + step; // B is Kfirst + for (uint32_t filter_row = 0; filter_row < FILTER_H; filter_row++) + { + for (uint32_t filter_col = 0; filter_col < FILTER_W; filter_col++) + { + coopmat matB; + coopmat matA; + + // load B matrix input data using coop_mat extension + coopMatLoad(matB, inputB.x, subMatrixBStartInElements, FILTER_H * FILTER_W * strideBinElements, int(true)); + + // load A matrix input data as vectors using regular vector load + uint32_t input_row_id = STRIDE * out_row_id + DILATION * (filter_row - filter_offset_h); + uint32_t input_col_id = STRIDE * out_col_id + DILATION * (filter_col - filter_offset_w); + + // load A vector data from memory + uint32_t vecA[TILE_K/NUM_PACK]; + for (int i=0; i= INPUT_H) || (input_col_id < 0) || (input_col_id >= INPUT_W)) + for (int i=0; i= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; + + const uint32_t row = block_id_m * TILE_M; + const uint32_t col = block_id_n * TILE_N; + + // Initialize result matR to zero, not using matC in this shader + coopmat matR; + matR = coopmat(0.0); + + for (uint32_t step = 0; step < TOTAL_K; step += 8) + { + // On each iteration, load a row of cooperative matrices from matrix A, + // load a column of cooperative matrices from matrix B, and multiply all + // pairs of those matrices. + uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step; + uint32_t subMatrixBStartInElements = layoutB_Kfirst ? 
col * strideBinElements + step : col + step * strideBinElements; + + coopmat matA; +#define NEW +#ifdef NEW + uint32_t uvecA[8]; + for (int i=0; i<8; i++) + uvecA[i] = floatBitsToInt(inputA.x[subMatrixAStartInElements + gl_GlobalInvocationID.x * strideAinElements + i]); + matA = constructCoopMatA64QCOM(uvecA, gl_Float32QCOM); +#else + coopMatLoad(matA, inputA.x, subMatrixAStartInElements, strideAinElements, int(layoutA_Mfirst)); +#endif + + coopmat matB; + coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst)); + + matR = coopMatMulAdd(matA, matB, matR); + } + + // Store results + uint32_t subMatrixRStartInElements = layoutR_Mfirst ? col * strideRinElements + row : row * strideRinElements + col; + + coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst)); +} +)"; \ No newline at end of file diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp new file mode 100644 index 0000000..00f138f --- /dev/null +++ b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp @@ -0,0 +1,1803 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app demonstrating the loading of a .gltf file (hello world) +/// + +#include "cooperative_matrix_tester.hpp" +#include "cooperative_matrix_shaders.hpp" +#include "vulkan/extensionHelpers.hpp" +#include "vulkan/extensionLib.hpp" +#include <../external/glslang/glslang/Include/glslang_c_interface.h> +#include <../external/glslang/glslang/Public/resource_limits_c.h> + +#pragma push_macro("BOOL") +#define BOOL HALF_BOOL +#include "half/half.h" +#pragma pop_macro("BOOL") + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 1 +#endif + +#include "imgui.h" + +#include +#include +#include +#include + +#define CHECK_VK(cmd) \ + { \ + VkResult local_result = cmd; \ + if(local_result == VK_SUCCESS){} \ + else if (local_result == VK_NOT_READY || local_result == VK_TIMEOUT || \ + local_result == VK_EVENT_SET || local_result == VK_EVENT_RESET || \ + local_result == VK_INCOMPLETE) \ + { \ + LOGW("CHECK_VK: Warning - %s returned %d", #cmd, static_cast(local_result)); \ + } \ + else \ + { \ + LOGE("CHECK_VK: Error - %s returned %d", #cmd, static_cast(local_result)); \ + assert(false); \ + } \ + } + + +#define CHECK_BOOL(expr) \ + { \ + bool local_result = (expr); \ + if (!local_result) \ + { \ + LOGE("CHECK_BOOL: Error - %s evaluated to false", #expr); \ + } \ + } + +namespace +{ + enum gpu_vendors + { + VK_VENDOR_ID_UNKNOWN = 0, + VK_VENDOR_ID_NVIDIA = 0x10de, + VK_VENDOR_ID_QUALCOMM = 0x5143, + VK_VENDOR_ID_AMD = 0x1002, + VK_VENDOR_ID_INTEL = 0x8086, + VK_VENDOR_ID_APPLE = 0x106b + }; + + enum gpu_tiers + { + TIER_UNKNOWN = 0, + QCOM_TIER_PAKALA = 0x44050000, + QCOM_TIER_KAANAPALI = 0x44050A30, + QCOM_TIER_GLYMUR = 0x44070040, + QCOM_TIER_GLYMUR_TEST = 0x36334630, + QCOM_TIER_HAWI = 0x44051430, + NVIDIA_TIER_RTX2070 = 0x1F14 + }; + + const char* GetMatrixTypeName(VkComponentTypeKHR 
component_type) + { + switch (component_type) + { + case VK_COMPONENT_TYPE_FLOAT16_KHR: return "FLOAT16"; + case VK_COMPONENT_TYPE_FLOAT32_KHR: return "FLOAT32"; + case VK_COMPONENT_TYPE_FLOAT64_KHR: return "FLOAT64"; + case VK_COMPONENT_TYPE_SINT8_KHR: return "SINT8"; + case VK_COMPONENT_TYPE_SINT16_KHR: return "SINT16"; + case VK_COMPONENT_TYPE_SINT32_KHR: return "SINT32"; + case VK_COMPONENT_TYPE_SINT64_KHR: return "SINT64"; + case VK_COMPONENT_TYPE_UINT8_KHR: return "UINT8"; + case VK_COMPONENT_TYPE_UINT16_KHR: return "UINT16"; + case VK_COMPONENT_TYPE_UINT32_KHR: return "UINT32"; + case VK_COMPONENT_TYPE_UINT64_KHR: return "UINT64"; + case VK_COMPONENT_TYPE_BFLOAT16_KHR: return "BFLOAT16"; + case VK_COMPONENT_TYPE_SINT8_PACKED_NV: return "SINT8_PACKED"; + case VK_COMPONENT_TYPE_UINT8_PACKED_NV: return "UINT8_PACKED"; + case VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT: return "FLOAT8_E4M3"; + case VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT: return "FLOAT8_E5M2"; + default: return "UNKNOWN TYPE"; + } + } + + const char* GetMatrixComponentTypeName(VkComponentTypeKHR type) + { + switch (type) + { + case VK_COMPONENT_TYPE_FLOAT64_KHR: return "FP64"; + case VK_COMPONENT_TYPE_FLOAT32_KHR: return "FP32"; + case VK_COMPONENT_TYPE_FLOAT16_KHR: return "FP16"; + case VK_COMPONENT_TYPE_SINT8_KHR: return "INT8"; + case VK_COMPONENT_TYPE_SINT16_KHR: return "INT16"; + case VK_COMPONENT_TYPE_SINT32_KHR: return "INT32"; + case VK_COMPONENT_TYPE_SINT64_KHR: return "INT64"; + default: return "UNKNOWN"; + } + } + + bool FindMatrixProperty( + std::span cooperativeMatrixProperties, + VkCooperativeMatrixPropertiesKHR &cooperativeMatrixProps, + uint32_t MSize, + uint32_t NSize, + uint32_t KSize, + VkComponentTypeKHR AType, + VkComponentTypeKHR BType, + VkComponentTypeKHR CType, + VkComponentTypeKHR RType) + { + bool valid_testtypes = false; + + int32_t matrixprop; + for(matrixprop = 0; matrixprop < cooperativeMatrixProperties.size() && !valid_testtypes; ++matrixprop) + { + if 
((cooperativeMatrixProperties[matrixprop].ResultType == RType) && + (cooperativeMatrixProperties[matrixprop].CType == CType) && + (cooperativeMatrixProperties[matrixprop].BType == BType) && + (cooperativeMatrixProperties[matrixprop].AType == AType) && + (MSize != 0 ? cooperativeMatrixProperties[matrixprop].MSize == MSize : true) && + (NSize != 0 ? cooperativeMatrixProperties[matrixprop].NSize == NSize : true) && + (KSize != 0 ? cooperativeMatrixProperties[matrixprop].KSize == KSize : true) ) + { + valid_testtypes = true; + cooperativeMatrixProps = cooperativeMatrixProperties[matrixprop]; + } + } + + return valid_testtypes; + } + + + static const char* ShaderPaths[] + { + Test01_MxM_Basic, + Test02_MxM_VecToMat, + Test03_CONV, + }; + + struct TestCase + { + TestType testType; + VkComponentTypeKHR inputType; + VkComponentTypeKHR outputType; + + // TOTAL_M, TOTAL_N, TOTAL_K is the size of the full R=AxB+C matrix multiply + uint32_t TOTAL_M; + uint32_t TOTAL_N; + uint32_t TOTAL_K; + + // Each cooperative matrix multiply is R[TILE_M, TILE_N] = A[TILE_M, TILE_K] x B[TILE_K, TILE_N] + C[TILE_M, TILE_N] + uint32_t TILE_M; + uint32_t TILE_N; + uint32_t TILE_K; + + bool layoutA_Mfirst; + bool layoutB_Kfirst; + bool layoutC_Mfirst; + bool layoutR_Mfirst; + + uint32_t strideAinElements; + uint32_t strideBinElements; + uint32_t strideCinElements; + uint32_t strideRinElements; + }; + + struct sComponentTypeInfo + { + const char* typeName; + uint32_t bits; + }; + + struct sComponentTypeInfo ComponentTypeInfo[] = + { // From vulkan_core.h + { "float16", 16 }, // VK_COMPONENT_TYPE_FLOAT16_KHR = 0, + { "float32", 32 }, // VK_COMPONENT_TYPE_FLOAT32_KHR = 1, + { "float64", 64 }, // VK_COMPONENT_TYPE_FLOAT64_KHR = 2, + { "int8", 8 }, // VK_COMPONENT_TYPE_SINT8_KHR = 3, + { "int16", 16 }, // VK_COMPONENT_TYPE_SINT16_KHR = 4, + { "int32", 32 }, // VK_COMPONENT_TYPE_SINT32_KHR = 5, + { "int64", 64 }, // VK_COMPONENT_TYPE_SINT64_KHR = 6, + { "uint8", 8 }, // VK_COMPONENT_TYPE_UINT8_KHR = 
7, + { "uint16", 16 }, // VK_COMPONENT_TYPE_UINT16_KHR = 8, + { "uint32", 32 }, // VK_COMPONENT_TYPE_UINT32_KHR = 9, + { "uint64", 64 }, // VK_COMPONENT_TYPE_UINT64_KHR = 10, + }; + + const char* scopeString[] = { + "invalid", + "device", + "workgroup", + "subgroup", + "invalid", + "queuefamily", + }; + + struct MatrixDesc + { + struct + { + uint32_t rows, cols; + } dims; + VkComponentTypeKHR dataType; + size_t elementSize; + VkDeviceSize bufferSize; + uint32_t totalElements; + + // Create a host- and device-local buffer for each input and output. + // Descriptors point at the device buffers. + VkBuffer hostBuffer; + VkDeviceMemory hostMemory; + VkBuffer deviceBuffer; + VkDeviceMemory deviceMemory; + void* ptr; + + bool isFloatType() const + { + switch (dataType) + { + default: + return false; + case VK_COMPONENT_TYPE_FLOAT16_KHR: + case VK_COMPONENT_TYPE_FLOAT32_KHR: + case VK_COMPONENT_TYPE_FLOAT64_KHR: + return true; + } + } + + void setDataFloat(uint32_t i, float value) + { + if (dataType == VK_COMPONENT_TYPE_FLOAT32_KHR) + { + ((float*)ptr)[i] = value; + } + else + { + uint32_t asInt = *(uint32_t*)&value; + int sign = (asInt & 0x80000000) >> 31; + int exp = ((asInt & 0x7f800000) >> 23) - 127; + int mantissa = (asInt & 0x7FFFFF); + + sign = sign << 15; + exp = (exp + 15) << 10; + mantissa = mantissa >> (23 - 10); + + if (asInt != 0) { + asInt = sign | exp | mantissa; + } + + ((uint16_t*)ptr)[i] = asInt; + } + } + + float getDataFloat(uint32_t i) const + { + if (dataType == VK_COMPONENT_TYPE_FLOAT32_KHR) + { + return ((float*)ptr)[i]; + } + else + { + uint32_t asInt = ((uint16_t*)ptr)[i]; + int sign = (asInt & 0x8000) >> 15; + int exp = ((asInt & 0x7c00) >> 10) - 15; + int mantissa = (asInt & 0x3FF); + + sign = sign << 31; + exp = (exp + 127) << 23; + mantissa = mantissa << (23 - 10); + + if (asInt != 0) { + asInt = sign | exp | mantissa; + } + + return *(float*)&asInt; + } + } + + float getDataFloat(int m, int n, bool colMajor) const + { + return 
getDataFloat(colMajor ? (n * dims.rows + m) : (m * dims.cols + n)); + } + + void setDataInt(uint32_t i, uint32_t value) + { + assert(ComponentTypeInfo[dataType].bits == 8 || ComponentTypeInfo[dataType].bits == 32); + switch (dataType) { + default: assert(0); // fallthrough + case VK_COMPONENT_TYPE_UINT8_KHR: ((uint8_t*)ptr)[i] = (uint8_t)value; break; + case VK_COMPONENT_TYPE_UINT32_KHR: ((uint32_t*)ptr)[i] = (uint32_t)value; break; + case VK_COMPONENT_TYPE_SINT8_KHR: ((int8_t*)ptr)[i] = (int8_t)value; break; + case VK_COMPONENT_TYPE_SINT32_KHR: ((int32_t*)ptr)[i] = (int32_t)value; break; + } + } + + uint32_t getDataInt(uint32_t i) const + { + assert(ComponentTypeInfo[dataType].bits == 8 || ComponentTypeInfo[dataType].bits == 32); + switch (dataType) { + default: assert(0); // fallthrough + case VK_COMPONENT_TYPE_UINT8_KHR: return ((uint8_t*)ptr)[i]; + case VK_COMPONENT_TYPE_UINT32_KHR: return ((uint32_t*)ptr)[i]; + case VK_COMPONENT_TYPE_SINT8_KHR: return ((int8_t*)ptr)[i]; + case VK_COMPONENT_TYPE_SINT32_KHR: return ((int32_t*)ptr)[i]; + } + } + + uint32_t getDataInt(int m, int n, bool colMajor) const + { + return getDataInt(colMajor ? 
(n * dims.rows + m) : (m * dims.cols + n)); + } + }; + + + template + void InitMatrix(T* matrix, unsigned int mrows, unsigned int mcols, unsigned int stride, FillDataType init, unsigned int set_num_decimals=2)//, int sequence, float const_init) + { + struct MatrixKey + { + unsigned int mrows; + unsigned int mcols; + unsigned int stride; + FillDataType init; + unsigned int set_num_decimals; + + bool operator==(const MatrixKey& other) const + { + return mrows == other.mrows && + mcols == other.mcols && + stride == other.stride && + init == other.init && + set_num_decimals == other.set_num_decimals; + } + }; + + struct MatrixKeyHasher + { + std::size_t operator()(const MatrixKey& key) const + { + std::size_t h1 = std::hash{}(key.mrows); + std::size_t h2 = std::hash{}(key.mcols); + std::size_t h3 = std::hash{}(key.stride); + std::size_t h4 = std::hash{}(key.init); + std::size_t h5 = std::hash{}(key.set_num_decimals); + return h1 ^ (h2 << 1) ^ (h3 << 2) ^ (h4 << 3) ^ (h5 << 4); + } + }; + + static std::unordered_map, MatrixKeyHasher> cache; + + MatrixKey key{ mrows, mcols, stride, init, set_num_decimals }; + + auto it = cache.find(key); + if (it != cache.end()) + { + std::memcpy(matrix, it->second.data(), mrows * stride * sizeof(T)); + return; + } + + std::vector temp_matrix(mrows * stride, T(0)); + + // Initialized so the invalid-option path below never reads an indeterminate value + float r = 0.0f, rr = 0.0f; + + float flow = 0.0f; + float fhigh = 1.0f; + int range = 3; // 3 -> -1, 0 and 1 + static int counter = 0; + float const_init = 1.0f; + //int sequence = 2048;// Float16: Integers between 0 and 2048 can be exactly represented (and also between -2048 and 0) + int sequence = 3;// 2048;// Float16: Integers between 0 and 2048 can be exactly represented (and also between -2048 and 0) + if (sizeof(T) == 1) sequence = 255; // 256 is too simple of a sequence + + static unsigned seed = 3; + std::srand(seed++); // srand seed doesn't work with time(0) + std::cout << "Initializing ROWxCOL=" << mrows << "x" << mcols << " matrix (stride=" << stride << ") with init option = " << init << " and using " << set_num_decimals << " number of decimals\n"; + + // Set the buffer to '0' in case mcols < stride, init only mrows*mcols elements, + // memset takes a size in bytes, so the element count must be scaled by sizeof(T) + memset((void*)matrix, 0, size_t(mrows) * stride * sizeof(T)); + + // unsigned int counter=1; // for debugging purpose + for (unsigned int row = 0; row < mrows; row++) // y + { + for (unsigned int col = 0; col < mcols; col++) // x + { + switch (init) + { + case FILL_WITH_ZERO: + r = 0; + break; + case FILL_WITH_CONSTANTS: + r = const_init; // default const_init=1.0f + break; + case FILL_WITH_RANDON_UINT: + r = float(std::rand() % range); // default range=3 -> init_matrix will be 0, 1, 2 + break; + case FILL_WITH_RANDON_INT: + r = float(std::rand() % range - ((range - 1) / 2)); // default range=3 -> init_matrix will be - 1, 0 and 1 -> guarantee average 0 for dot products preventing float16 going out of range + break; + case FILL_SEQUENCE_INT: + r = float(counter++ % sequence);// + const_init; + break; + case FILL_WITH_RANDOM_LOW_HIGH_INT: + r = T(std::rand() % int(fhigh)) + int(flow); + break; + case FILL_WITH_RANDOM_FLOAT: + r = flow + float(rand()) / ((float(RAND_MAX) / (fhigh - flow))); + break; + case FILL_WITH_RANDOM_PLUS1_MINUS1_FLOAT: + //r = float(rand()); + r = rand() > RAND_MAX/2 ? float(1.0) : float(-1.0); + break; + default: + LOGE("Invalid InitMatrix(...) 
initialization option '-i:%d'", init); + } + // Force to fixed number of decimals based on user input + std::ostringstream o; + o << std::setprecision(set_num_decimals) << std::fixed << r; + rr = std::stof(o.str()); + // load the matrix + temp_matrix[row * stride + col] = T(rr); + } + } + + std::memcpy(matrix, temp_matrix.data(), mrows* stride * sizeof(T)); + cache[key] = std::move(temp_matrix); + } + + template + void TransposeMatrix(T* matrix, const unsigned int& mrows, const unsigned int& mcols, const char *info) + { + std::cout << "\nTransposing MxM(" << info << ") on CPU, input type '" << typeid(matrix).name() << "', number of rows: '" << mrows << "', number of columns: '" << mcols << "', IT'LL TAKE SOME TIME!!!\n\n"; + + unsigned int count = mcols * mrows; + + for (unsigned int col = 0; col < mcols; ++col) + { + unsigned int count_adjustment = mcols - col - 1; + + for (unsigned int row = 0, step = 1; row < mrows; ++row, step += count_adjustment) + { + unsigned int last = count - (row + col * mrows); + unsigned int first = last - step; + + std::rotate(matrix + first, matrix + first + 1, matrix + last); + } + } + + //std::swap(mrows, mcols); + std::cout << "\nFinished Transposing MxM on CPU\n"; + } + + template + void TransposeMatrix(T* matrix, const unsigned int& mrows, const unsigned int& mcols, T* matrixOut) + { + std::cout << "\nTransposing MxM on CPU, input type '" << typeid(matrix).name() << "', output type '" << typeid(matrixOut).name() << "', number of rows: '" << mrows << "', number of columns: '" << mcols << "', IT'LL TAKE SOME TIME!!!\n\n"; + + unsigned int count = mcols * mrows; + + for (unsigned int col = 0; col < mcols; col++) + for (unsigned int row = 0; row < mrows; row++) + matrixOut[col*mrows + row] = matrix[row*mcols + col]; + + std::cout << "\nFinished Transposing MxM on CPU\n"; + } +} + +CooperativeMatrixRunner::CooperativeMatrixRunner(Vulkan& vulkan_instance) + : m_vulkan_instance(vulkan_instance) +{ + glslang_initialize_process(); +} + 
+CooperativeMatrixRunner::~CooperativeMatrixRunner() +{ + glslang_finalize_process(); +} + +bool CooperativeMatrixRunner::InitializeRunner() +{ + if (!m_vulkan_instance.HasLoadedVulkanDeviceExtension(VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME)) + { + LOGE("Required Extension not supported %s", VK_KHR_COOPERATIVE_MATRIX_EXTENSION_NAME); + LOGE("Platform does not support Cooperative Matrices. Cannot test.\n"); + + return false; + } + + auto cooperativeMatrixEXT = m_vulkan_instance.GetExtension(); + if (!cooperativeMatrixEXT) + { + LOGE("Ext_VK_KHR_cooperative_matrix potentially unresolved!"); + } + + // select supported cooperative matrix types/sizes + uint32_t nCoopMatrixPropCount = 0; + CHECK_VK(cooperativeMatrixEXT->m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + m_vulkan_instance.m_VulkanGpu, + &nCoopMatrixPropCount, + NULL + )); + + m_hFoundCooperativeMatrices.resize(nCoopMatrixPropCount); + for (auto& matrixProp : m_hFoundCooperativeMatrices) + { + matrixProp.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + matrixProp.pNext = nullptr; + } + + CHECK_VK(cooperativeMatrixEXT->m_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + m_vulkan_instance.m_VulkanGpu, + &nCoopMatrixPropCount, + &m_hFoundCooperativeMatrices[0] + )); + + LOGI("Found Cooperative Matrices:\n"); + for (auto& cm : m_hFoundCooperativeMatrices) + { + LOGI("\tMxNxK: %ux%ux%u\n", cm.MSize, cm.NSize, cm.KSize); + LOGI("\tA: %s | ", GetMatrixTypeName(cm.AType)); + LOGI("B: %s | ", GetMatrixTypeName(cm.BType)); + LOGI("C: %s | ", GetMatrixTypeName(cm.CType)); + LOGI("D: %s\n", GetMatrixTypeName(cm.ResultType)); + LOGI("\tSaturating Accumulation: %u | Scope: %u\n\n", cm.saturatingAccumulation, cm.scope); + } + + // Setup the test templates + + m_test_group_templates.push_back(TestGroupTemplateDescription{ + VK_COMPONENT_TYPE_FLOAT32_KHR , + VK_COMPONENT_TYPE_FLOAT32_KHR , + { + {8, 6, 128, // SizeInBlocks + 0, 64, 0}, // Size (tile) + + {8, 12, 128, + 0, 32, 0}, + + {8, 24, 128, + 0, 
16, 0} + } }); + + m_test_group_templates.push_back(TestGroupTemplateDescription{ + VK_COMPONENT_TYPE_FLOAT16_KHR , + VK_COMPONENT_TYPE_FLOAT16_KHR , + { + {8, 6, 128, // SizeInBlocks + 0, 64, 0}, // Size (tile) + + {8, 12, 128, + 0, 32, 0}, + + {8, 24, 128, + 0, 16, 0} + } }); + + m_test_group_templates.push_back(TestGroupTemplateDescription{ + VK_COMPONENT_TYPE_SINT8_KHR , + VK_COMPONENT_TYPE_SINT32_KHR , + { + {8, 6, 128, // SizeInBlocks + 0, 64, 0}, // Size (tile) + + {8, 12, 128, + 0, 32, 0}, + + {8, 24, 128, + 0, 16, 0} + } }); + + return true; +} + +bool CooperativeMatrixRunner::TriggerPendingTests() +{ + if (!m_is_processing_tests) + { + return true; + } + + for (auto& test_group : m_test_groups) + { + for (auto& test_entry : test_group.test_entries) + { + if (test_entry.test_descriptions.size() != test_entry.test_results.size()) + { + for (const auto& test_description : test_entry.test_descriptions) + { + const auto test_result = RunTest(test_description); + if (test_result) + { + test_entry.test_results.push_back(test_result.value()); + } + else + { + test_entry.test_results.push_back(TestResult()); + } + + m_total_processed_tests++; + } + + // Process a single test entry per frame (so we can display progress on the UI) + return true; + } + } + } + + m_is_processing_tests = false; + + return true; +} + +void CooperativeMatrixRunner::RenderUI() +{ + const bool disable_ui = m_is_processing_tests; + ImGui::BeginDisabled(disable_ui); + ImGui::BeginGroup(); + + if (ImGui::CollapsingHeader("Test Configuration", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::DragInt("Test Repeats", &m_test_repeats, 1.0f, 0, 100); + + // NOTE: Validation (and its transpose option) will be added in a future path + ImGui::BeginDisabled(); + if (m_validate_matrix_result) + { + ImGui::BeginDisabled(); + static bool always_true = true; + ImGui::Checkbox("Transpose When Needed", &always_true); + ImGui::EndDisabled(); + } + else + { + ImGui::Checkbox("Transpose When Needed", 
&m_transpose_when_needed); + } + + ImGui::Checkbox("Validate Result", &m_validate_matrix_result); + ImGui::EndDisabled(); + + static const char* test_case_names[] = { + "MxM Basic", + "MxM Vector To Matrix", + "CONV", + }; + + int test_type_current_index = static_cast(m_test_type); + bool changed = false; + + if (ImGui::BeginCombo("Test Case", test_case_names[test_type_current_index])) + { + for (int i = 0; i < static_cast(TestType::TT_COUNT); ++i) + { + const bool is_selected = (test_type_current_index == i); + if (ImGui::Selectable(test_case_names[i], is_selected)) + { + m_test_type = static_cast(i); + changed = true; + } + + if (is_selected) + ImGui::SetItemDefaultFocus(); + } + ImGui::EndCombo(); + } + + ImGui::Separator(); + + static const char* fill_type_labels[] = { + "Fill with Zero", + "Fill with Constants", + "Fill with Random UInt", + "Fill with Random Int", + "Fill Sequence Int", + "Fill with Random Low/High Int", + "Fill with Random Float", + "Fill with Random +/-1 Float" + }; + + int fill_data_current_index = static_cast(m_fill_data_type); + + if (ImGui::Combo("Fill Data Type", &fill_data_current_index, fill_type_labels, IM_ARRAYSIZE(fill_type_labels))) + { + m_fill_data_type = static_cast(fill_data_current_index); + } + + ImGui::Separator(); + + static const char* option_labels[] = { "True", "False", "Variable" }; + static const char* matrix_labels[] = { "A", "B", "C", "R"}; + + for (std::size_t i = 0; i < NUM_MATS; ++i) + { + int current_index = static_cast(m_matrix_transpose_options[i]); + + char label[32]; + std::snprintf(label, sizeof(label), "Transpose Matrix %s", matrix_labels[i]); + + // The combo writes the selected entry back through the pointer to current_index + if (ImGui::Combo(label, &current_index, option_labels, IM_ARRAYSIZE(option_labels))) + { + m_matrix_transpose_options[i] = static_cast(current_index); + } + } + } + + if (ImGui::CollapsingHeader("Device Configuration", 0)) + { + ImGui::Text("Default values for Pakala [SM8750][Adreno830] - Change as needed"); + ImGui::DragInt("GPU Frequency MHz", &m_gpu_freq_MHz, 1.0f, 0, 999999); + ImGui::DragInt("GPU Micro SP", &m_gpu_microSP, 1.0f, 0, 999999); + ImGui::DragInt("GPU ALU per Micro SP", &m_gpu_ALU_per_microSP, 1.0f, 0, 999999); + ImGui::DragInt("GPU OPs per MAD", &m_gpu_ops_per_mad, 1.0f, 0, 999999); + } + + ImGui::Separator(); + + if (ImGui::Button("Run Tests")) + { + PrepareTestSession(); + } + + ImGui::Text("For accurate values, make sure you are using the right device configurations (check 'Device Configuration' tab)"); + + if (m_is_processing_tests) + { + ImGui::SameLine(); + ImGui::EndDisabled(); + ImGui::Text("Processing Test [%d] of [%d]", m_total_processed_tests, m_total_tests); + ImGui::SameLine(); + // Clamp the denominator to at least 1 so a session with no tests cannot divide by zero + ImGui::ProgressBar(static_cast(m_total_processed_tests) / static_cast(std::max(1u, m_total_tests))); + ImGui::BeginDisabled(disable_ui); + } + + if (!m_test_groups.empty()) + { + for (int i=0; i< m_test_groups.size(); i++) + { + const auto& test_group = m_test_groups[i]; + + // Quick table exit if none of its entries are valid/supported + if (!test_group.test_entries.empty() && !test_group.test_entries.back().test_results.empty()) + { + bool is_any_result_valid = false; + for (const auto& test_result : test_group.test_entries.back().test_results) + { + is_any_result_valid |= test_result.is_valid; + } + + if (!is_any_result_valid) + { + continue; + } + } + + std::string collapsing_header_title = std::string("Test #").append(std::to_string(i)) + + std::string(" - ") + GetMatrixComponentTypeName(test_group.template_description.input_type) + + std::string(" input / ") + + GetMatrixComponentTypeName(test_group.template_description.output_type) + + std::string(" output"); + + const bool show_matrix_d = false; + + if (ImGui::CollapsingHeader(collapsing_header_title.c_str())) + { + ImGuiStyle& style = ImGui::GetStyle(); + const float original_scrollbar_size = style.ScrollbarSize; + style.ScrollbarSize = 40.0f; + + ImGui::BeginChild("##test_results"); + if (ImGui::BeginTable("TestResultTable", (NUM_MATS - (show_matrix_d ? 
0 : 1)) + 3, ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg | ImGuiTableFlags_Resizable)) + { + ImGui::TableSetupColumn("A", ImGuiTableColumnFlags_WidthFixed, 100.0f); + ImGui::TableSetupColumn("B", ImGuiTableColumnFlags_WidthFixed, 100.0f); + ImGui::TableSetupColumn("C", ImGuiTableColumnFlags_WidthFixed, 100.0f); + + if (show_matrix_d) + { + ImGui::TableSetupColumn("D", ImGuiTableColumnFlags_WidthFixed, 100.0f); + } + + for (const auto& size_configuration : test_group.template_description.size_configurations) + { + ImGui::TableSetupColumn(("NSize=" + std::to_string(size_configuration.NSize)).c_str()); + // TODO: Should we print the rest of the size config? + } + + ImGui::TableHeadersRow(); + + // Each test entry will be a table row + for (int test_entry_index = 0; test_entry_index < test_group.test_entries.size(); test_entry_index++) + { + const auto& test_entry = test_group.test_entries[test_entry_index]; + int current_column_index = 0; + + ImGui::TableNextRow(); + + // Transpose flags + ImGui::TableSetColumnIndex(current_column_index++); + ImGui::Text("%s", test_entry.layoutA_Mfirst ? "M-first" : "K-first"); + + ImGui::TableSetColumnIndex(current_column_index++); + ImGui::Text("%s", test_entry.layoutB_Nfirst ? "N-first" : "K-first"); + + ImGui::TableSetColumnIndex(current_column_index++); + ImGui::Text("%s", test_entry.layoutC_Mfirst ? "M-first" : "N-first"); + + if (show_matrix_d) + { + ImGui::TableSetColumnIndex(current_column_index++); + ImGui::Text("%s", test_entry.layoutR_Mfirst ? 
"M-first" : "N-first"); + } + + // For each of the NSize configs + for (int test_result_index = 0; test_result_index < test_entry.test_results.size(); test_result_index++) + { + ImGui::TableSetColumnIndex(current_column_index++); + + const auto& test_description = test_entry.test_descriptions[test_result_index]; + const auto& test_result = test_entry.test_results[test_result_index]; + + if (test_result.is_valid) + { + auto GetPercentageColor = [](float value) -> ImVec4 + { + value = std::clamp(value, 0.0f, 1.0f); + + if (value < 0.5f) + { + float t = value / 0.5f; + return ImVec4(1.0f, t, 0.0f, 1.0f); + } + else + { + float t = (value - 0.5f) / 0.5f; + return ImVec4(1.0f - t, 1.0f, 0.0f, 1.0f); + } + }; + + ImGui::Text("[Time]: %.2fus", test_result.time_total); + ImGui::Text("[TOPS]: %.2f", test_result.TOPS); + ImVec4 color = GetPercentageColor(test_result.percentage / 100.0f); + ImGui::PushStyleColor(ImGuiCol_Text, color); + ImGui::Text("[%%]: %.2f", test_result.percentage); + ImGui::PopStyleColor(); + } + else + { + ImGui::Text("N/A - Not Supported"); + } + } + } + + ImGui::EndTable(); + } + ImGui::EndChild(); + + style.ScrollbarSize = original_scrollbar_size; + } + } + } + + ImGui::EndGroup(); + ImGui::EndDisabled(); +} + +void CooperativeMatrixRunner::PrepareTestSession() +{ + m_vulkan_instance.WaitUntilIdle(); + + m_test_groups.clear(); + m_total_tests = 0; + m_total_processed_tests = 0; + + auto GenerateTransposeCombinations = [&]() -> std::vector> + { + std::vector> combinations; + + std::vector variable_indices; + std::vector fixed_values(NUM_MATS); + + for (std::size_t i = 0; i < NUM_MATS; ++i) + { + switch (m_matrix_transpose_options[i]) + { + case MatrixTransposeOption::ALWAYS_TRUE: + fixed_values[i] = true; + break; + case MatrixTransposeOption::ALWAYS_FALSE: + fixed_values[i] = false; + break; + case MatrixTransposeOption::VARIABLE: + variable_indices.push_back(i); + break; + } + } + + std::size_t num_combinations = 1ULL << variable_indices.size(); + 
combinations.reserve(num_combinations); + + for (std::size_t combo = 0; combo < num_combinations; ++combo) + { + std::vector current(NUM_MATS); + + for (std::size_t i = 0; i < NUM_MATS; ++i) + { + current[i] = fixed_values[i]; + } + + for (std::size_t bit = 0; bit < variable_indices.size(); ++bit) + { + std::size_t index = variable_indices[bit]; + current[index] = (combo >> bit) & 1; + } + + combinations.push_back(std::move(current)); + } + + return combinations; + }; + + const auto transpose_combinations = GenerateTransposeCombinations(); + + for (const auto& test_template_description : m_test_group_templates) + { + TestGroup new_test_group; + new_test_group.template_description = test_template_description; + + TestDescription new_test_description; + + new_test_description.fill_data_type = m_fill_data_type; + new_test_description.gpu_freq_MHz = m_gpu_freq_MHz; + new_test_description.test_type = m_test_type; + + new_test_description.inputWidth = 1; + new_test_description.inputHeight = 1; + + new_test_description.input_type = test_template_description.input_type; + new_test_description.output_type = test_template_description.output_type; + + new_test_description.perf_loop = static_cast(m_test_repeats); + + for (auto& transposeCombination : transpose_combinations) + { + TestGroup::TestRowEntry test_entry; + + new_test_description.layoutA_Mfirst = transposeCombination[0]; + new_test_description.layoutB_Nfirst = transposeCombination[1]; + new_test_description.layoutC_Mfirst = transposeCombination[2]; + new_test_description.layoutR_Mfirst = transposeCombination[3]; + + test_entry.layoutA_Mfirst = new_test_description.layoutA_Mfirst; + test_entry.layoutB_Nfirst = new_test_description.layoutB_Nfirst; + test_entry.layoutC_Mfirst = new_test_description.layoutC_Mfirst; + test_entry.layoutR_Mfirst = new_test_description.layoutR_Mfirst; + + for (auto& size_configuration : test_template_description.size_configurations) + { + new_test_description.MSizeInBlocks = 
size_configuration.MSizeInBlocks; + new_test_description.NSizeInBlocks = size_configuration.NSizeInBlocks; + new_test_description.KSizeInBlocks = size_configuration.KSizeInBlocks; + new_test_description.MSize = size_configuration.MSize; + new_test_description.NSize = size_configuration.NSize; + new_test_description.KSize = size_configuration.KSize; + + test_entry.test_descriptions.push_back(new_test_description); + m_total_tests++; + } + + new_test_group.test_entries.push_back(test_entry); + } + + m_test_groups.push_back(new_test_group); + } + + m_is_processing_tests = true; +} + +std::optional CooperativeMatrixRunner::RunTest(const TestDescription& test_description) +{ + TestResult test_result = {}; + test_result.is_valid = true; + + VkResult result; + + uint32_t gpu_freq_MHz = test_description.gpu_freq_MHz; + + int MSize = test_description.MSize; + int NSize = test_description.NSize; + int KSize = test_description.KSize; + int MSizeInBlocks = test_description.MSizeInBlocks; + int NSizeInBlocks = test_description.NSizeInBlocks; + int KSizeInBlocks = test_description.KSizeInBlocks; + + uint32_t perf_loop = test_description.perf_loop; + + bool layoutA_Mfirst = test_description.layoutA_Mfirst; + bool layoutB_Kfirst = !test_description.layoutB_Nfirst; + bool layoutC_Mfirst = test_description.layoutC_Mfirst; + bool layoutR_Mfirst = test_description.layoutR_Mfirst; + + int inputWidth = test_description.inputWidth; + int inputHeight = test_description.inputHeight; + + uint32_t tt = static_cast(test_description.test_type); + int init = test_description.fill_data_type; + + auto command_pool_queue_family_index = m_vulkan_instance.m_VulkanQueues[Vulkan::QueueIndex::eGraphicsQueue].QueueFamilyIndex; + auto submission_queue = m_vulkan_instance.m_VulkanQueues[command_pool_queue_family_index].Queue; + + // Not optimal at all but we are drawing the UI and running the test in the same queue + m_vulkan_instance.QueueWaitIdle(Vulkan::QueueIndex::eGraphicsQueue); + + const auto 
subgroup_size = m_vulkan_instance.GetExtension()->Properties.subgroupSize; + const auto gpuvendor_id = static_cast(m_vulkan_instance.GetGpuProperties().Base.properties.vendorID); + const auto gputier_id = static_cast(m_vulkan_instance.GetGpuProperties().Base.properties.deviceID); + + const auto device_limits = m_vulkan_instance.GetGpuProperties().Base.properties.limits; + + // Create descriptor set and descriptor set layout for our A,B,C,R matrices (buffers) + // + VkDescriptorSetLayout descriptorSetLayout; + VkDescriptorSet descriptorSet; + + auto create_buffers_desc_set = [](VkDevice device, VkDescriptorSetLayout & descriptorSetLayout, VkDescriptorSet & descriptorSet, const uint32_t num_buffers) + { + VkResult result; + + // Descriptor set are always bound at command buffer level + // There is only 1 descriptor per resource + + // How to allocate descriptor sets: + // + // 1. Create a pool of sufficient size (use multiple VkDescriptorPoolSize) + // Use vkCreateDescriptorPool() to actually create the pool on the GPU + // 2. Create a VkDescriptorSetLayout for each descriptor set + // Specify the resource bindings within the descriptor set using + // VkDescriptorSetLayoutBinding elements per resource + // 3. Allocate a new set from the pool using vkAllocateDescriptorSets + // The reference to the VkDescriptorPool is specified in the associated + // VkDescriptorSetAllocateInfo config struct. + // Bind all relevant VkDescriptorSet handles (from step 3.) 
for + // draw/compute/ray tracing via vkCmdBindDescriptorSets + + // 1) Create a descriptor pool (1 set) + VkDescriptorPoolSize* poolSizes = new VkDescriptorPoolSize[num_buffers]; + for (uint32_t i = 0; i < num_buffers; i++) + poolSizes[i] = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 }; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {}; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = NULL; + descriptorPoolCreateInfo.maxSets = 1; // Use only 1 set for all descriptors + descriptorPoolCreateInfo.poolSizeCount = num_buffers; + descriptorPoolCreateInfo.pPoolSizes = poolSizes; + + VkDescriptorPool descriptorPool; + result = vkCreateDescriptorPool(device, &descriptorPoolCreateInfo, NULL, &descriptorPool); + CHECK_VK(result); + + // 2) Create a VkDescriptorSetLayout for each descriptor set + // This compute shader uses 3 UBO and 1 SBO + VkDescriptorSetLayoutBinding* layoutBindings = new VkDescriptorSetLayoutBinding[num_buffers]; + for (uint32_t i = 0; i < num_buffers; i++) + { + layoutBindings[i].binding = i; + layoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + layoutBindings[i].descriptorCount = 1; + layoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layoutBindings[i].pImmutableSamplers = nullptr; + } + + // Next take layout bindings and use them to create a descriptor set layout + VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = {}; + descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + descriptorSetLayoutCreateInfo.pNext = nullptr; + descriptorSetLayoutCreateInfo.flags = 0; + descriptorSetLayoutCreateInfo.bindingCount = num_buffers; + descriptorSetLayoutCreateInfo.pBindings = layoutBindings; + + result = vkCreateDescriptorSetLayout(device, &descriptorSetLayoutCreateInfo, NULL, &descriptorSetLayout); + CHECK_VK(result); + + // 3. 
Allocate a new set from the pool using vkAllocateDescriptorSets + VkDescriptorSetAllocateInfo setAllocateInfo = {}; + setAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + setAllocateInfo.pNext = nullptr; + setAllocateInfo.descriptorPool = descriptorPool; + setAllocateInfo.descriptorSetCount = 1; // Use only 1 set for all descriptors + setAllocateInfo.pSetLayouts = &descriptorSetLayout; + + result = vkAllocateDescriptorSets(device, &setAllocateInfo, &descriptorSet); + CHECK_VK(result); + + delete[] poolSizes; + delete[] layoutBindings; + }; + + create_buffers_desc_set(m_vulkan_instance.m_VulkanDevice, descriptorSetLayout, descriptorSet, NUM_MATS); + + // Create command pool + // + VkCommandPoolCreateInfo commandPoolCreateInfo = { VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, (uint32_t)command_pool_queue_family_index }; + VkCommandPool commandPool; + result = vkCreateCommandPool(m_vulkan_instance.m_VulkanDevice, &commandPoolCreateInfo, NULL, &commandPool); + CHECK_VK(result); + + // Create command buffer + // + // The command buffers, one for initializing buffers, one for compute, one + // for reading back the results. This lets us time the compute work more + // precisely. 
+ VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 3 }; + VkCommandBuffer commandBuffers[3]; + result = vkAllocateCommandBuffers(m_vulkan_instance.m_VulkanDevice, &commandBufferAllocateInfo, commandBuffers); + CHECK_VK(result); + + // Creat Pipeline layout + // Use only 1 set for all descriptors + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, NULL, 0, 1, &descriptorSetLayout, 0, nullptr }; + VkPipelineLayout pipelineLayout; + result = vkCreatePipelineLayout(m_vulkan_instance.m_VulkanDevice, &pipelineLayoutCreateInfo, NULL, &pipelineLayout); + CHECK_VK(result); + + // Query matrix properties and see if the test is supported for the given GPU + bool valid_testtypes = false; + VkCooperativeMatrixPropertiesKHR cooperativeMatrixProps = {}; + if (!FindMatrixProperty(m_hFoundCooperativeMatrices, cooperativeMatrixProps, MSize, NSize, KSize, test_description.input_type, test_description.input_type, test_description.output_type, test_description.output_type)) + { + return std::nullopt; + } + + // Set local_size (workgroup size) based on GPU/Tier (nvidia, glymur, pakala, etc.), and datatype (fp32, fp16, etc) + // Default for 'unknown' or gpu/tier not recohgnized is local_size(64,2,2) for all datatyes + uint32_t local_size_x = 0, local_size_y = 0, local_size_z = 0; + + switch (gpuvendor_id) + { + case VK_VENDOR_ID_NVIDIA: + local_size_x = subgroup_size; + local_size_y = 1; + local_size_z = 1; + break; + case VK_VENDOR_ID_AMD: + local_size_x = subgroup_size; + local_size_y = 1; + local_size_z = 1; + break; + case VK_VENDOR_ID_INTEL: + local_size_x = subgroup_size; + local_size_y = 1; + local_size_z = 1; + break; + case VK_VENDOR_ID_APPLE: + local_size_x = subgroup_size; + local_size_y = 1; + local_size_z = 1; + break; + case VK_VENDOR_ID_QUALCOMM: + local_size_x = subgroup_size; + local_size_y = 2; + 
local_size_z = 2; + break; + default: // unknown, including gpu option not part of the map + printf("\nUnknown GPU or GPU no set with -gpu:[nvidia|qualcomm|pakala|kaanapali|glymmur|etc.]"); + local_size_x = 64; + local_size_y = 2; + local_size_z = 2; + break; + } + + RuntimeShader runtime_shader; + + // Set compiler options + // + std::vector compiler_options; + int bytesPerInput; // = int8 ? 1 : fp16 ? 2 : 4; + int bytesPerOutput;// = fp16 ? 2 : 4; + + if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT32_KHR) + { + runtime_shader.AddDefine("A_TYPE", std::string("float")); + runtime_shader.AddDefine("R_TYPE", std::string("float")); + bytesPerInput = 4; + bytesPerOutput = 4; + } + else + if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT16_KHR) + { + runtime_shader.AddDefine("A_TYPE", std::string("float16_t")); + runtime_shader.AddDefine("R_TYPE", std::string("float16_t")); + bytesPerInput = 2; + bytesPerOutput = 2; + } + else + if (test_description.input_type == VK_COMPONENT_TYPE_UINT8_KHR) + { + runtime_shader.AddDefine("A_TYPE", std::string("uint8_t")); + runtime_shader.AddDefine("R_TYPE", std::string("uint32_t")); + bytesPerInput = 1; + bytesPerOutput = 4; + } + else + if (test_description.input_type == VK_COMPONENT_TYPE_SINT8_KHR) + { + runtime_shader.AddDefine("A_TYPE", std::string("int8_t")); + runtime_shader.AddDefine("R_TYPE", std::string("int32_t")); + bytesPerInput = 1; + bytesPerOutput = 4; + } + else + { + return std::nullopt; + } + + if (!runtime_shader.Build(ShaderPaths[tt], m_vulkan_instance.m_VulkanDevice, "main", glslang_stage_t::GLSLANG_STAGE_COMPUTE)) + { + LOGE("Failed to compile test shader"); + return std::nullopt; + } + + VkShaderModule shaderModule = runtime_shader.GetShaderModule(); + + if (tt == TT_CONV && (inputWidth * inputHeight != MSizeInBlocks * cooperativeMatrixProps.MSize)) + { + LOGE("Convolution ConvInputWidth * ConvInputHeight (%d) must equal MSizeInBlocks * MSize (%d) for current datatype", + (inputWidth * 
inputHeight), (MSizeInBlocks * cooperativeMatrixProps.MSize)); + return std::nullopt; + } + + int filterWidth = 3; + int filterHeight = 3; + int dilation = 1; + int stride = 1; + + TestCase testCase = {}; + + testCase.testType = (TestType)tt; + testCase.inputType = cooperativeMatrixProps.AType; + testCase.outputType = cooperativeMatrixProps.ResultType; + + // MxNxK is the size of the full matrix multiply + testCase.TOTAL_M = cooperativeMatrixProps.MSize * MSizeInBlocks; + testCase.TOTAL_N = cooperativeMatrixProps.NSize * NSizeInBlocks; + testCase.TOTAL_K = cooperativeMatrixProps.KSize * KSizeInBlocks; + + int mA_paddedM = testCase.TOTAL_M; + int mA_paddedK = testCase.TOTAL_K; + int mB_paddedN = testCase.TOTAL_N; + int mB_paddedK = testCase.TOTAL_K; + int mC_paddedM = testCase.TOTAL_M; + int mC_paddedN = testCase.TOTAL_N; + int mR_paddedM = testCase.TOTAL_M; + int mR_paddedN = testCase.TOTAL_N; + + std::cout << "\nPadding image width to fix CCHE bank mapping issue." << std::endl; + // 512bits is one line in the CCHE (512bits/8bits = 64bytes) + if (layoutA_Mfirst) mA_paddedM += (mA_paddedM % (128 / bytesPerInput)) ? 0 : 64 / bytesPerInput; else mA_paddedK += (mA_paddedK % (128 / bytesPerInput)) ? 0 : 64 / bytesPerInput; + if (layoutB_Kfirst) mB_paddedK += (mB_paddedK % (128 / bytesPerInput)) ? 0 : 64 / bytesPerInput; else mB_paddedN += (mB_paddedN % (128 / bytesPerInput)) ? 0 : 64 / bytesPerInput; + if (layoutC_Mfirst) mC_paddedM += (mC_paddedM % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput; else mC_paddedN += (mC_paddedN % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput; + if (layoutR_Mfirst) mR_paddedM += (mR_paddedM % (128 / bytesPerOutput)) ? 0 : 64 / bytesPerOutput; else mR_paddedN += (mR_paddedN % (128 / bytesPerOutput)) ? 
0 : 64 / bytesPerOutput; + + // Each cooperative matrix multiply is R[TILE_M, TILE_N] = A[TILE_M, TILE_K] x B[TILE_K, TILE_N] + C[TILE_M, TILE_N] + testCase.TILE_M = cooperativeMatrixProps.MSize; + testCase.TILE_N = cooperativeMatrixProps.NSize; + testCase.TILE_K = cooperativeMatrixProps.KSize; + + testCase.layoutA_Mfirst = (uint32_t)layoutA_Mfirst; + testCase.layoutB_Kfirst = (uint32_t)layoutB_Kfirst; + testCase.layoutC_Mfirst = (uint32_t)layoutC_Mfirst; + testCase.layoutR_Mfirst = (uint32_t)layoutR_Mfirst; + + testCase.strideAinElements = (layoutA_Mfirst ? mA_paddedM : mA_paddedK); + testCase.strideBinElements = (layoutB_Kfirst ? mB_paddedK : mB_paddedN); + testCase.strideCinElements = (layoutC_Mfirst ? mC_paddedM : mC_paddedN); + testCase.strideRinElements = (layoutR_Mfirst ? mR_paddedM : mR_paddedN); + + auto FindProperties = [](const VkPhysicalDeviceMemoryProperties* pMemoryProperties, + uint32_t memoryTypeBitsRequirement, VkMemoryPropertyFlags requiredProperties) -> int32_t + { + const uint32_t memoryCount = pMemoryProperties->memoryTypeCount; + for (uint32_t memoryIndex = 0; memoryIndex < memoryCount; ++memoryIndex) { + const uint32_t memoryTypeBits = (1 << memoryIndex); + const bool isRequiredMemoryType = memoryTypeBitsRequirement & memoryTypeBits; + + const VkMemoryPropertyFlags properties = + pMemoryProperties->memoryTypes[memoryIndex].propertyFlags; + const bool hasRequiredProperties = + (properties & requiredProperties) == requiredProperties; + + if (isRequiredMemoryType && hasRequiredProperties) + return static_cast(memoryIndex); + } + + // failed to find memory type + return -1; + }; + + auto CreateMatrixDesc = [&]( + VkDevice device, + VkPhysicalDeviceMemoryProperties& memory_properties, + MatrixDesc& m, + VkComponentTypeKHR dt, + int rows, + int cols) + { + VkResult result; + + m.dims.rows = rows; + m.dims.cols = cols; + m.dataType = dt; + m.elementSize = ComponentTypeInfo[m.dataType].bits / 8; // float->4-buyes, float16->2 bytes, int8->1 byte + 
m.totalElements = m.dims.cols * m.dims.rows; + m.bufferSize = m.totalElements * m.elementSize; + + VkBufferCreateInfo bufferCreateInfo = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + NULL, + 0, + m.bufferSize, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT, + VK_SHARING_MODE_EXCLUSIVE, + 0u, + NULL, + }; + + result = vkCreateBuffer(device, &bufferCreateInfo, NULL, &m.hostBuffer); + CHECK_VK(result); + result = vkCreateBuffer(device, &bufferCreateInfo, NULL, &m.deviceBuffer); + CHECK_VK(result); + + VkMemoryRequirements memReqs; + vkGetBufferMemoryRequirements(device, m.hostBuffer, &memReqs); + + int32_t hostIndex = FindProperties(&memory_properties, memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT); + int32_t deviceIndex = FindProperties(&memory_properties, memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + + VkMemoryAllocateFlagsInfo memAllocateFlagsInfo = {VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, NULL,VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT, 0}; + VkMemoryAllocateInfo memAllocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &memAllocateFlagsInfo, memReqs.size, (uint32_t)hostIndex}; + + result = vkAllocateMemory(device, &memAllocateInfo, NULL, &m.hostMemory); + CHECK_VK(result); + + memAllocateInfo.memoryTypeIndex = deviceIndex; + result = vkAllocateMemory(device, &memAllocateInfo, NULL, &m.deviceMemory); + CHECK_VK(result); + + result = vkBindBufferMemory(device, m.hostBuffer, m.hostMemory, 0); + CHECK_VK(result); + + result = vkBindBufferMemory(device, m.deviceBuffer, m.deviceMemory, 0); + CHECK_VK(result); + + result = vkMapMemory(device, m.hostMemory, 0, m.bufferSize, 0, &m.ptr); + CHECK_VK(result); + }; + + VkPhysicalDeviceMemoryProperties memory_properties; + 
vkGetPhysicalDeviceMemoryProperties(m_vulkan_instance.m_VulkanGpu, &memory_properties); + + MatrixDesc matrices[NUM_MATS]; + + CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_A], cooperativeMatrixProps.AType, mA_paddedM, mA_paddedK); + if (tt == TT_CONV) CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_B], cooperativeMatrixProps.AType, filterWidth*filterWidth*mB_paddedN, mB_paddedK); + else CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_B], cooperativeMatrixProps.AType, mB_paddedK, mB_paddedN); + CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_C], cooperativeMatrixProps.CType, mC_paddedM, mC_paddedN); + CreateMatrixDesc(m_vulkan_instance.m_VulkanDevice, memory_properties, matrices[MAT_R], cooperativeMatrixProps.ResultType, mR_paddedM, mR_paddedN); + + auto update_buffer_descriptor_set = [](VkDevice device, MatrixDesc * matrices, uint32_t num_matrices, VkDescriptorSet & descriptorSet) + { + VkDescriptorBufferInfo* bufferDescriptor = new VkDescriptorBufferInfo[num_matrices]; + + for (uint32_t i = 0; i < num_matrices; i++) + { + bufferDescriptor[i].buffer = matrices[i].deviceBuffer; + bufferDescriptor[i].offset = 0; + bufferDescriptor[i].range = matrices[i].bufferSize; + } + + VkWriteDescriptorSet* writeDescriptorset = new VkWriteDescriptorSet[num_matrices]; + + for (uint32_t i = 0; i < num_matrices; i++) + { + writeDescriptorset[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorset[i].pNext = nullptr; + writeDescriptorset[i].dstSet = descriptorSet; + writeDescriptorset[i].dstBinding = i; + writeDescriptorset[i].dstArrayElement = 0; + writeDescriptorset[i].descriptorCount = 1; + writeDescriptorset[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writeDescriptorset[i].pImageInfo = nullptr; + writeDescriptorset[i].pBufferInfo = &bufferDescriptor[i]; + writeDescriptorset[i].pTexelBufferView = nullptr; + } + 
+ vkUpdateDescriptorSets(device, num_matrices, writeDescriptorset, 0, NULL); + + delete[] bufferDescriptor; + delete[] writeDescriptorset; + }; + + update_buffer_descriptor_set(m_vulkan_instance.m_VulkanDevice, matrices, NUM_MATS, descriptorSet); + + float* matrixR_CPU_fp32 = new float[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols](); + FLOAT16* matrixR_CPU_fp16 = new FLOAT16[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols](); + int32_t* matrixR_CPU_sint32 = new int32_t[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols](); + uint32_t* matrixR_CPU_uint32 = new uint32_t[matrices[MAT_R].dims.rows * matrices[MAT_R].dims.cols](); + std::ostringstream fna, fnb, fnr_cpu, fnr_vk; + fna << "matrixA_" << "M" << testCase.TOTAL_M << "xK" << testCase.TOTAL_K << ".txt"; + fnb << "matrixB_" << "K" << testCase.TOTAL_K << "xN" << testCase.TOTAL_N << ".txt"; + + // ToDo: Think in how to use templates! + if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_FLOAT32_KHR)) // CONV test case, input/output data Type Float 32? + { + InitMatrix((float*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((float*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((float*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((float*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + } + else if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_FLOAT16_KHR)) // CONV test case, input/output data Type Float 16? 
+ { + InitMatrix((FLOAT16*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((FLOAT16*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((FLOAT16*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((FLOAT16*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + } + else if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_SINT8_KHR)) // CONV test case, Input data Type signed int8, output data type signed int 32? + { + InitMatrix((int8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((int8_t*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((int32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((int32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + } + else if ((tt == TT_CONV) && (test_description.input_type == VK_COMPONENT_TYPE_UINT8_KHR)) // CONV test case, Input data Type signed int8, output data type signed int 32? 
+ { + InitMatrix((uint8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((uint8_t*)matrices[MAT_B].ptr, filterHeight*filterWidth*testCase.TOTAL_N, testCase.TOTAL_K, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((uint32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((uint32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + } + else if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT32_KHR) // Input/output data Type Float 32? + { + InitMatrix((float*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((float*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((float*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((float*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + + if (m_transpose_when_needed || m_validate_matrix_result) + { + if (layoutA_Mfirst) // Matrix A M-First? + TransposeMatrix((float*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst"); + if (layoutB_Kfirst) // Matrix B K-First? + TransposeMatrix((float*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst"); + if (layoutC_Mfirst) // Matrix C M-First? + TransposeMatrix((float*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst"); + } + } + else + if (test_description.input_type == VK_COMPONENT_TYPE_FLOAT16_KHR) // Input/output data Type Float 16? 
+ { + + InitMatrix((FLOAT16*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((FLOAT16*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((FLOAT16*)matrices[MAT_C].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((FLOAT16*)matrices[MAT_R].ptr, testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + + if (m_transpose_when_needed || m_validate_matrix_result) + { + if (layoutA_Mfirst) // Matrix A M-First? + TransposeMatrix((FLOAT16*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst"); + if (layoutB_Kfirst) // Matrix B K-First? + TransposeMatrix((FLOAT16*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst"); + if (layoutC_Mfirst) // Matrix C M-First? + TransposeMatrix((FLOAT16*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst"); + } + } + else + if (test_description.input_type == VK_COMPONENT_TYPE_SINT8_KHR) // Input data Type signed int8, output data type signed int 32? + { + InitMatrix((int8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((int8_t*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((int32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((int32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + + if (m_transpose_when_needed || m_validate_matrix_result) + { + if (layoutA_Mfirst) // Matrix A M-First? 
+ TransposeMatrix((int8_t*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst"); + if (layoutB_Kfirst) // Matrix B K-First? + TransposeMatrix((int8_t*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst"); + if (layoutC_Mfirst) // Matrix C M-First? + TransposeMatrix((int32_t*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst"); + } + } + else + if (test_description.input_type == VK_COMPONENT_TYPE_UINT8_KHR) // Data Type input unsigned int 8, data type output unsigned int 32? + { + InitMatrix((uint8_t*)matrices[MAT_A].ptr, testCase.TOTAL_M, testCase.TOTAL_K, matrices[MAT_A].dims.cols, (FillDataType)init, 2); + InitMatrix((uint8_t*)matrices[MAT_B].ptr, testCase.TOTAL_K, testCase.TOTAL_N, matrices[MAT_B].dims.cols, (FillDataType)init, 2); + InitMatrix((uint32_t*)matrices[MAT_C].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_C].dims.cols, FILL_WITH_ZERO, 2); + InitMatrix((uint32_t*)matrices[MAT_R].ptr,testCase.TOTAL_M, testCase.TOTAL_N, matrices[MAT_R].dims.cols, FILL_WITH_ZERO, 2); + + if (m_transpose_when_needed || m_validate_matrix_result) + { + if (layoutA_Mfirst) // Matrix A M-First? + TransposeMatrix((uint8_t*)matrices[MAT_A].ptr, matrices[MAT_A].dims.rows, matrices[MAT_A].dims.cols, "layoutA_Mfirst"); + if (layoutB_Kfirst) // Matrix B K-First? + TransposeMatrix((uint8_t*)matrices[MAT_B].ptr, matrices[MAT_B].dims.rows, matrices[MAT_B].dims.cols, "layoutB_Kfirst"); + if (layoutC_Mfirst) // Matrix C M-First? + TransposeMatrix((uint32_t*)matrices[MAT_C].ptr, matrices[MAT_C].dims.rows, matrices[MAT_C].dims.cols, "layoutC_Mfirst"); + } + } + else + { + return std::nullopt; + } + + // Specialize the shader with the matrix sizes, strides, and constants. 
+ // Also, work-group sizes + const uint32_t specDataMxM[] = { // pass to shader_name.comp + local_size_x, // layout(constant_id = 0) const uint local_size_x; + local_size_y, // layout(constant_id = 1) const uint local_size_y; + local_size_z, // layout(constant_id = 2) const uint local_size_z; + testCase.TOTAL_M, // layout(constant_id = 3) const uint TOTAL_M = 1; + testCase.TOTAL_N, // layout(constant_id = 4) const uint TOTAL_N = 1; + testCase.TOTAL_K, // layout(constant_id = 5) const uint TOTAL_K = 1; + testCase.TILE_M, // layout(constant_id = 6) const uint TILE_M = 1; + testCase.TILE_N, // layout(constant_id = 7) const uint TILE_N = 1; + testCase.TILE_K, // layout(constant_id = 8) const uint TILE_K = 1; + testCase.layoutA_Mfirst, // layout(constant_id = 9) const bool layoutA_Mfirst = false; + testCase.layoutB_Kfirst, // layout(constant_id =10) const bool layoutB_Kfirst = false; + testCase.layoutC_Mfirst, // layout(constant_id =11) const bool layoutC_Mfirst = false; + testCase.layoutR_Mfirst, // layout(constant_id =12) const bool layoutR_Mfirst = false; + testCase.strideAinElements, // layout(constant_id =13) const uint strideAinElements = 1; + testCase.strideBinElements, // layout(constant_id =14) const uint strideBinElements = 1; + testCase.strideCinElements, // layout(constant_id =15) const uint strideCinElements = 1; + testCase.strideRinElements // layout(constant_id =16) const uint strideRinElements = 1; + }; + + const uint32_t specDataCONV[] = { // pass to shader_name.comp + local_size_x, // layout(constant_id = 0) const uint local_size_x; + local_size_y, // layout(constant_id = 1) const uint local_size_y; + local_size_z, // layout(constant_id = 2) const uint local_size_z; + testCase.TOTAL_M, // layout(constant_id = 3) const uint TOTAL_M = 1; + testCase.TOTAL_N, // layout(constant_id = 4) const uint TOTAL_N = 1; + testCase.TOTAL_K, // layout(constant_id = 5) const uint TOTAL_K = 1; + testCase.TILE_M, // layout(constant_id = 6) const uint TILE_M = 1; + 
testCase.TILE_N, // layout(constant_id = 7) const uint TILE_N = 1; + testCase.TILE_K, // layout(constant_id = 8) const uint TILE_K = 1; + (uint32_t)inputWidth, // layout(constant_id = 9) const uint INPUT_W = 1; + (uint32_t)inputHeight, // layout(constant_id =10) const uint INPUT_H = 1; + (uint32_t)filterWidth, // layout(constant_id =11) const uint FILTER_W = 1; + (uint32_t)filterHeight, // layout(constant_id =12) const uint FILTER_H = 1; + (uint32_t)dilation, // layout(constant_id =13) const uint DILATION = 1; + (uint32_t)stride, // layout(constant_id =14) const uint STRIDE = 1; + testCase.strideAinElements, // layout(constant_id =15) const uint strideAinElements = 1; + testCase.strideBinElements, // layout(constant_id =16) const uint strideBinElements = 1; + testCase.strideCinElements, // layout(constant_id =17) const uint strideCinElements = 1; + testCase.strideRinElements // layout(constant_id =18) const uint strideRinElements = 1; + }; + + auto fill_specialized_map_entries = [](VkSpecializationMapEntry entries[], uint32_t num_entries, uint32_t sizeof_entry) + { + for (uint32_t i = 0; i < num_entries; i++) + entries[i] = { i, sizeof_entry * i, sizeof_entry }; + }; + +#define ARRAY_LENGTH(x) (sizeof(x) / sizeof(x[0])) + + VkSpecializationMapEntry entriesMxM[ARRAY_LENGTH(specDataMxM)]; + fill_specialized_map_entries(entriesMxM, ARRAY_LENGTH(specDataMxM), sizeof(uint32_t)); // {0, sizeof(uint32_t) * 0, sizeof(uint32_t)},...,//{end, sizeof(uint32_t) * end, sizeof(uint32_t)} + + VkSpecializationMapEntry entriesCONV[ARRAY_LENGTH(specDataCONV)]; + fill_specialized_map_entries(entriesCONV, ARRAY_LENGTH(specDataCONV), sizeof(uint32_t)); // {0, sizeof(uint32_t) * 0, sizeof(uint32_t)}, ...,//{end, sizeof(uint32_t) * end, sizeof(uint32_t)} + + VkSpecializationInfo specInfo; + switch (tt) + { + case TT_CONV: + specInfo = { ARRAY_LENGTH(specDataCONV), entriesCONV, sizeof(specDataCONV), specDataCONV, }; + break; + case TT_MXM_BASIC: + case TT_MXM_VecToMat: + specInfo = { 
ARRAY_LENGTH(specDataMxM), entriesMxM, sizeof(specDataMxM), specDataMxM, }; + break; + default: + LOGE("Unknown use case(%d), can't sent specialized constantas to shader!", tt); + } + +#undef ARRAY_LENGTH + + // Create pipeline with a desired subgroup size (e.g., AMD supports two subgroup sizes) + VkPipelineShaderStageRequiredSubgroupSizeCreateInfo subgroupSizeInfo = {}; + subgroupSizeInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO; + subgroupSizeInfo.requiredSubgroupSize = subgroup_size; // Must be between min and max + + VkPipelineShaderStageCreateInfo shaderCreateInfo = {VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, &subgroupSizeInfo, 0, VK_SHADER_STAGE_COMPUTE_BIT, shaderModule, "main", &specInfo}; + VkComputePipelineCreateInfo pipelineCreateInfo = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, NULL, 0, shaderCreateInfo, pipelineLayout, VK_NULL_HANDLE, 0 }; + + // Create the query pool + VkQueryPool query_pool_timestamps = VK_NULL_HANDLE; // A query pool is required to use GPU time stamps + std::vector time_stamps((size_t)perf_loop*2, 0);// We will get timestamps for the beginning and end of each of the compute passes + // GPU time stamps will be stored in a vector + // VK_QUERY_TYPE_TIMESTAMP: We need to specify the query type for this pool, which in our case is for time stamps + // time_stamps: Set the no. of queries in this pool + VkQueryPoolCreateInfo query_pool_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, nullptr, 0, VK_QUERY_TYPE_TIMESTAMP, static_cast(time_stamps.size()), 0 }; + result = vkCreateQueryPool(m_vulkan_instance.m_VulkanDevice, &query_pool_info, nullptr, &query_pool_timestamps); + CHECK_VK(result); + + std::cout << "\nExecuting vkCreateComputePipelines(...) 
(takes a while!)\n"; + VkPipeline pipeline; + result = vkCreateComputePipelines(m_vulkan_instance.m_VulkanDevice, VK_NULL_HANDLE, 1, &pipelineCreateInfo, NULL, &pipeline); + CHECK_VK(result); + + VkCommandBufferBeginInfo commandBufferBeginInfo{}; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + + // Download input buffers to device memory. + result = vkBeginCommandBuffer(commandBuffers[0], &commandBufferBeginInfo); // Begin command buffer recording + CHECK_VK(result); + + for (uint32_t i = 0; i < NUM_MATS; ++i) { + MatrixDesc &m = matrices[i]; + VkBufferCopy copy = { 0, 0, m.bufferSize }; + vkCmdCopyBuffer(commandBuffers[0], m.hostBuffer, m.deviceBuffer, 1, ©); + } + + result = vkEndCommandBuffer(commandBuffers[0]); // End command buffer recording + CHECK_VK(result); + + VkSubmitInfo submitInfo = {VK_STRUCTURE_TYPE_SUBMIT_INFO, NULL, 0, NULL, NULL,1, &commandBuffers[0], 0, NULL}; + + submitInfo.pCommandBuffers = &commandBuffers[0]; + result = vkQueueSubmit(submission_queue, 1, &submitInfo, VK_NULL_HANDLE); + CHECK_VK(result); + result = vkQueueWaitIdle(submission_queue); + CHECK_VK(result); + + uint32_t groupCountX = 1; + uint32_t groupCountY = (testCase.TOTAL_M / testCase.TILE_M + (local_size_y - 1)) / local_size_y; + uint32_t groupCountZ = (testCase.TOTAL_N / testCase.TILE_N + (local_size_z - 1)) / local_size_z; + + result = vkBeginCommandBuffer(commandBuffers[1], &commandBufferBeginInfo); // Begin command buffer recording + CHECK_VK(result); + + vkCmdBindPipeline(commandBuffers[1], VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + vkCmdBindDescriptorSets(commandBuffers[1], VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0u, 1, &descriptorSet, 0u, NULL); + + // Reset the timestamp query pool, so we can start fetching new values into it + vkCmdResetQueryPool(commandBuffers[1], query_pool_timestamps, 0, static_cast(time_stamps.size())); + + perf_loop = 
time_stamps.size()/2; // Both should have the same value, but just in case... + + for (size_t loop = 0; loop < perf_loop; loop++) + { + vkCmdPipelineBarrier(commandBuffers[1], VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr); + vkCmdWriteTimestamp( commandBuffers[1], VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, query_pool_timestamps, loop*2 ); // Start timer... + vkCmdDispatch( commandBuffers[1], groupCountX, groupCountY, groupCountZ); // Dispacth work + vkCmdWriteTimestamp( commandBuffers[1], VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, query_pool_timestamps,loop*2+1); // Stop timer... + } + + vkCmdPipelineBarrier(commandBuffers[1], VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr); + + result = vkEndCommandBuffer(commandBuffers[1]); // End command buffer recording + CHECK_VK(result); + + submitInfo.pCommandBuffers = &commandBuffers[1]; + result = vkQueueSubmit(submission_queue, 1, &submitInfo, VK_NULL_HANDLE); // Here is the actual work! + CHECK_VK(result); + result = vkQueueWaitIdle(submission_queue); + CHECK_VK(result); + + vkGetQueryPoolResults(m_vulkan_instance.m_VulkanDevice, query_pool_timestamps, 0, time_stamps.size(), time_stamps.size() * sizeof(uint64_t), time_stamps.data(), sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + + double ms = 0.0, min_ms = DBL_MAX, delta_in_ms = 0.0; + for (size_t loop = 0; loop < perf_loop; loop++) + { + delta_in_ms = double(time_stamps[loop*2+1] - time_stamps[loop*2]) * double(device_limits.timestampPeriod) / 1000000.0; + min_ms = (delta_in_ms < min_ms ? 
delta_in_ms : min_ms); + ms += delta_in_ms; + } + + if(gpuvendor_id == VK_VENDOR_ID_QUALCOMM ) + { + uint32_t num_uSP; + switch (gputier_id) + { + case QCOM_TIER_GLYMUR: + case QCOM_TIER_GLYMUR_TEST: + num_uSP = 16; + printf("\nQCOM Glymur GPU with num of uSP: %d, ", num_uSP); + break; + default: + num_uSP = 12; + printf("\nQCOM GPU with Num of uSP: %d, ", num_uSP); + } + + uint64_t total_ops = 0; + if (tt == TT_CONV) + { + total_ops = static_cast(testCase.TOTAL_M) * + static_cast(testCase.TOTAL_N) * + static_cast(testCase.TOTAL_K) * + static_cast(filterHeight) * + static_cast(filterWidth) * 2; + } + else + { + total_ops = static_cast(testCase.TOTAL_M) * + static_cast(testCase.TOTAL_N) * + static_cast(testCase.TOTAL_K) * 2; + } + + uint32_t theoreticalTime_ns = 1000 * ((unsigned long int)testCase.TOTAL_M * testCase.TOTAL_N * testCase.TOTAL_K / 64 / 2 / num_uSP / (4 / bytesPerInput)) / gpu_freq_MHz; + if (tt == TT_CONV) + theoreticalTime_ns = 1000 * ((unsigned long int)testCase.TOTAL_M * testCase.TOTAL_N * testCase.TOTAL_K * filterHeight * filterWidth / 64 / 2 / num_uSP / (4 / bytesPerInput)) / gpu_freq_MHz; + + std::cout << "Maximum theoretical perf on device @" << gpu_freq_MHz << "MHz is " << theoreticalTime_ns / 1000 << "us." 
<< std::endl; + ms /= double(perf_loop); + double percentOfPeak_avg = 100 * theoreticalTime_ns / ms / 1000 / 1000; + double percentOfPeak_min = 100 * theoreticalTime_ns / min_ms / 1000 / 1000; + std::cout << "MxM kernel time, average of " << perf_loop << " run(s): " << ms * 1000 << "us (" << percentOfPeak_avg << "% of theoretical peak (assuming " << gpu_freq_MHz << "MHz frequency))\n"; + std::cout << "MxM kernel time, min of " << perf_loop << " run(s): " << min_ms * 1000 << "us (" << percentOfPeak_min << "% of theoretical peak (assuming " << gpu_freq_MHz << "MHz frequency))\n"; + + test_result.time_total = ms * 1000; + test_result.TOPS = static_cast(total_ops) / (ms / 1000.0) / 1e12; + test_result.percentage = percentOfPeak_avg; + } + else + { + ms /= double(perf_loop); + std::cout << "MxM kernel time, average of " << perf_loop << " run(s): " << ms * 1000 << "us\n"; + std::cout << "MxM kernel time, min of " << perf_loop << " run(s): " << min_ms * 1000 << "us\n"; + + test_result.time_total = ms * 1000; + test_result.TOPS = 0.0; + test_result.percentage = 0.0; + } + + // Upload the result from device memory. 
+ result = vkBeginCommandBuffer(commandBuffers[2], &commandBufferBeginInfo); // Begin command buffer recording + CHECK_VK(result); + { + MatrixDesc &m = matrices[MAT_R]; + VkBufferCopy copy = { 0, 0, m.bufferSize }; + vkCmdCopyBuffer(commandBuffers[2], m.deviceBuffer, m.hostBuffer, 1, ©); + } + result = vkEndCommandBuffer(commandBuffers[2]); // End command buffer recording + CHECK_VK(result); + + submitInfo.pCommandBuffers = &commandBuffers[2]; + result = vkQueueSubmit(submission_queue, 1, &submitInfo, VK_NULL_HANDLE); + CHECK_VK(result); + result = vkQueueWaitIdle(submission_queue); + CHECK_VK(result); + + auto destroyMatrixDesc = [](VkDevice device, MatrixDesc & m) + { + vkDestroyBuffer(device, m.hostBuffer, NULL); + vkDestroyBuffer(device, m.deviceBuffer, NULL); + vkFreeMemory(device, m.hostMemory, NULL); + vkFreeMemory(device, m.deviceMemory, NULL); + }; + + // Free the memory/buffers/pipeline for this iteration. + for (int i = 0; i < NUM_MATS; ++i) + { + destroyMatrixDesc(m_vulkan_instance.m_VulkanDevice, matrices[i]); + } + + vkDestroyPipeline(m_vulkan_instance.m_VulkanDevice, pipeline, NULL); + + vkDestroyShaderModule(m_vulkan_instance.m_VulkanDevice, shaderModule, NULL); + + return test_result; +} diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_tester.hpp b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.hpp new file mode 100644 index 0000000..867893d --- /dev/null +++ b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.hpp @@ -0,0 +1,152 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app demonstrating the loading of a .gltf file (hello world) +/// +#pragma once + +#include +#include "runtime_shader.hpp" +#include +#include +#include +#include +#include + +enum { MAT_A = 0, MAT_B = 1, MAT_C = 2, MAT_R = 3, NUM_MATS = 4 }; +enum TestType +{ + TT_MXM_BASIC = 0, + TT_MXM_VecToMat = 1, + TT_CONV = 2, + TT_COUNT, +}; + +enum MatrixTransposeOption +{ + ALWAYS_TRUE, + ALWAYS_FALSE, + VARIABLE, +}; + +enum FillDataType { FILL_WITH_ZERO = 0, FILL_WITH_CONSTANTS, FILL_WITH_RANDON_UINT, FILL_WITH_RANDON_INT, FILL_SEQUENCE_INT, FILL_WITH_RANDOM_LOW_HIGH_INT, FILL_WITH_RANDOM_FLOAT, FILL_WITH_RANDOM_PLUS1_MINUS1_FLOAT }; + +class CooperativeMatrixRunner +{ + struct TestDescription + { + TestType test_type = TT_MXM_BASIC; + FillDataType fill_data_type = FILL_WITH_RANDON_INT; + + uint32_t gpu_freq_MHz = 900; + + VkComponentTypeKHR input_type; + VkComponentTypeKHR output_type; + + int MSize; + int NSize; + int KSize; + int MSizeInBlocks; + int NSizeInBlocks; + int KSizeInBlocks; + uint32_t perf_loop; + + bool layoutA_Mfirst = false; + bool layoutB_Nfirst = false; + bool layoutC_Mfirst = false; + bool layoutR_Mfirst = false; + + int inputWidth = 1; + int inputHeight = 1; + }; + + struct TestResult + { + bool is_valid = false; + double time_total; + double TOPS; + double percentage; + }; + + struct SizeConfiguration + { + int MSizeInBlocks; + int NSizeInBlocks; + int KSizeInBlocks; + + int MSize; + int NSize; + int KSize; + }; + + struct TestGroupTemplateDescription + { + VkComponentTypeKHR input_type; + VkComponentTypeKHR output_type; + + std::vector size_configurations; + }; + + struct TestGroup + { + struct TestRowEntry + { + std::vector test_descriptions; + std::vector test_results; + + bool layoutA_Mfirst = false; + bool layoutB_Nfirst = false; + bool layoutC_Mfirst = false; + bool 
layoutR_Mfirst = false; + }; + + TestGroupTemplateDescription template_description; + std::vector test_entries; // One per size_in_block_configuration from the template description + }; + +public: + + CooperativeMatrixRunner(Vulkan& vulkan_instance); + ~CooperativeMatrixRunner(); + + bool InitializeRunner(); + + bool TriggerPendingTests(); + void RenderUI(); + +private: + + void PrepareTestSession(); + std::optional RunTest(const TestDescription& test_description); + +private: + + Vulkan& m_vulkan_instance; + + std::vector m_hFoundCooperativeMatrices; + + TestType m_test_type = TT_MXM_BASIC; + FillDataType m_fill_data_type = FILL_WITH_RANDON_INT; + int32_t m_gpu_freq_MHz = 900; + int32_t m_gpu_microSP = 12; + int32_t m_gpu_ALU_per_microSP = 2; + int32_t m_gpu_ops_per_mad = 2; + + MatrixTransposeOption m_matrix_transpose_options[NUM_MATS] = { VARIABLE , VARIABLE , VARIABLE , ALWAYS_FALSE }; + + int m_test_repeats = 1; + bool m_transpose_when_needed = false; + bool m_validate_matrix_result = false; + + bool m_is_processing_tests = false; + uint32_t m_total_tests = 0; + uint32_t m_total_processed_tests = 0; + std::vector m_test_group_templates; + std::vector m_test_groups; +}; \ No newline at end of file diff --git a/samples/cooperative_matrix/code/main/runtime_shader.cpp b/samples/cooperative_matrix/code/main/runtime_shader.cpp new file mode 100644 index 0000000..dec1696 --- /dev/null +++ b/samples/cooperative_matrix/code/main/runtime_shader.cpp @@ -0,0 +1,207 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app demonstrating the loading of a .gltf file (hello world) +/// + +#include "runtime_shader.hpp" +#include "main/applicationEntrypoint.hpp" +#include "camera/cameraController.hpp" +#include "camera/cameraControllerTouch.hpp" +#include "camera/cameraData.hpp" +#include "camera/cameraGltfLoader.hpp" +#include "gui/imguiVulkan.hpp" +#include "material/drawable.hpp" +#include "material/vulkan/shaderModule.hpp" +#include "material/shaderManagerT.hpp" +#include "material/materialManager.hpp" +#include "material/vulkan/specializationConstantsLayout.hpp" +#include "mesh/meshHelper.hpp" +#include "mesh/meshLoader.hpp" +#include "system/math_common.hpp" +#include "texture/textureManager.hpp" +#include "vulkan/extensionHelpers.hpp" +#include "imgui.h" +#include <../external/glslang/glslang/Include/glslang_c_interface.h> +#include <../external/glslang/glslang/Public/resource_limits_c.h> + +#include +#include +#include + +bool RuntimeShader::Build(const std::string& glsl_code, + VkDevice device, + const char* entry_point, + glslang_stage_t stage) +{ + m_is_valid = false; + + m_spirv = CompileGLSLToSPIRV(glsl_code, entry_point, stage, m_defines); + if (m_spirv.empty()) + { + LOGE("Runtime Shader failed to compile GLSL into SPIRV blob"); + return false; + } + + VkShaderModuleCreateInfo create_info{}; + create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + create_info.codeSize = m_spirv.size() * sizeof(uint32_t); + create_info.pCode = m_spirv.data(); + + if (vkCreateShaderModule(device, &create_info, nullptr, &m_shader_module) != VK_SUCCESS) + { + LOGE("Runtime Shader failed to create vk shader module"); + return false; + } + + m_is_valid = true; + + return true; +} + +std::vector RuntimeShader::CompileGLSLToSPIRV( + const std::string& glsl_source, + const char* entry_name, + glslang_stage_t 
stage, + std::span> defines) +{ + //////////////////// + // COMPOSE SHADER // + //////////////////// + + + size_t version_string_index = glsl_source.find_first_of("version"); + if (version_string_index == std::string::npos) + { + LOGE("Shader compilation failed -> Could not locate 'version' string on shader code"); + return {}; + } + version_string_index += std::string_view("version").length(); + size_t line_under_version_string_index = glsl_source.find_first_of('\n', version_string_index); + if (line_under_version_string_index == std::string::npos) + { + LOGE("Shader compilation failed -> Could not locate 'version' string on shader code"); + return {}; + } + line_under_version_string_index += 1; + + std::string composed_shader_code = glsl_source; + for (auto& [define_text, value_text] : defines) + { + composed_shader_code.insert(line_under_version_string_index, "#define " + define_text + " " + value_text + "\n"); + } + +#if 1 + glslang_input_t input = { + .language = GLSLANG_SOURCE_GLSL, + .stage = stage, + .client = GLSLANG_CLIENT_VULKAN, + .client_version = GLSLANG_TARGET_VULKAN_1_3, + .target_language = GLSLANG_TARGET_SPV, + .target_language_version = GLSLANG_TARGET_SPV_1_6, + .code = composed_shader_code.c_str(), + .default_version = 100, + .default_profile = GLSLANG_NO_PROFILE, + .force_default_version_and_profile = false, + .forward_compatible = false, + .messages = GLSLANG_MSG_DEFAULT_BIT, +// .resource = s_slslang_built_in_resource, + .resource = glslang_default_resource(), +// .resource = nullptr, + }; + + if (!glslang_shader_create(&input)) // initialize internally + { + LOGE("Failed to create shader\n"); + return {}; + } + + glslang_shader_t* shader = glslang_shader_create(&input); + + if (!glslang_shader_preprocess(shader, &input)) + { + LOGE("Preprocessing failed:\n%s\n", glslang_shader_get_info_log(shader)); + glslang_shader_delete(shader); + return {}; + } + + if (!glslang_shader_parse(shader, &input)) + { + LOGE("Parsing failed:\n%s\n", 
glslang_shader_get_info_log(shader)); + glslang_shader_delete(shader); + return {}; + } + + glslang_program_t* program = glslang_program_create(); + glslang_program_add_shader(program, shader); + + if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT)) + { + LOGE("Linking failed:\n%s\n", glslang_program_get_info_log(program)); + glslang_program_delete(program); + glslang_shader_delete(shader); + return {}; + } + + glslang_program_SPIRV_generate(program, stage); + + const auto* words = glslang_program_SPIRV_get_ptr(program); + const auto size = glslang_program_SPIRV_get_size(program); + + std::vector spirv(size); + std::memcpy(spirv.data(), words, size * sizeof(uint32_t)); + + glslang_program_delete(program); + glslang_shader_delete(shader); + + return spirv; +#else + + /////////////// + // SLANG API // + /////////////// + + SlangSession* session = spCreateSession(nullptr); + SlangCompileRequest* request = spCreateCompileRequest(&m_global_session); + + spSetCodeGenTarget(request, SLANG_SPIRV); + spSetTargetProfile(request, 0, spFindProfile(&m_global_session, "vk_1_3")); // Vulkan 1.3 compatibility + + int translationUnitIndex = spAddTranslationUnit(request, SlangSourceLanguage::SLANG_SOURCE_LANGUAGE_GLSL, nullptr); + spAddTranslationUnitSourceString(request, translationUnitIndex, nullptr, composed_shader_code.c_str()); + + spAddEntryPoint( + request, + translationUnitIndex, + entry_name, + stage); + + int compileResult = spCompile(request); + if (SLANG_FAILED(compileResult)) + { + const char* diagnosticOutput = spGetDiagnosticOutput(request); + spDestroyCompileRequest(request); + LOGE("Shader compilation failed -> Compilation failed"); + return {}; + } + + size_t spvSize = 0; + const void* spvData = spGetEntryPointCode(request, 0, &spvSize); + if (!spvData || spvSize == 0) + { + spDestroyCompileRequest(request); + LOGE("Shader compilation failed -> Failed to retrieve entrypoint from compiled code"); + return {}; + } + + 
std::vector spirv(spvSize / 4); + std::memcpy(spirv.data(), spvData, spvSize); + return spirv; +#endif +} \ No newline at end of file diff --git a/samples/cooperative_matrix/code/main/runtime_shader.hpp b/samples/cooperative_matrix/code/main/runtime_shader.hpp new file mode 100644 index 0000000..4f8ee83 --- /dev/null +++ b/samples/cooperative_matrix/code/main/runtime_shader.hpp @@ -0,0 +1,95 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +/// +/// Sample app demonstrating the loading of a .gltf file (hello world) +/// +#pragma once + +#include +#include +#include +#include +#include +#include +#include <../external/glslang/glslang/Include/glslang_c_interface.h> +#include <../external/glslang/glslang/Public/resource_limits_c.h> + +//////////////////////////////////////////////////////////////////////////////// +// Class name: RuntimeShader +//////////////////////////////////////////////////////////////////////////////// +class RuntimeShader +{ +public: + + /* + * Adds a preprocessor definition to be used during shader compilation. + * @param name : Name of the macro + * @param value : Value of the macro + */ + template + inline void AddDefine(const std::string& name, const Arg& arg) + { + if constexpr (std::is_same_v || std::is_same_v) + { + m_defines.emplace_back(name, std::string(arg)); + } + else + { + m_defines.emplace_back(name, std::to_string(static_cast(arg))); + } + } + + + /* + * Builds the shader by compiling GLSL to SPIR-V and creating a Vulkan shader module. 
+ * @param glsl_code : GLSL source code as a string + * @param device : Vulkan logical device used to create the shader module + * @param entry_point : Entry point name in the GLSL code (e.g., "main") + * @param stage : Shader stage (e.g., SLANG_STAGE_VERTEX) + * @return true if compilation and module creation succeeded + * @note If compilation fails, m_is_valid will be false + */ + bool Build( + const std::string& glsl_code, + VkDevice device, + const char* entry_point, + glslang_stage_t stage); + + /* + * Returns the Vulkan shader module. + * @return VkShaderModule handle + */ + inline VkShaderModule GetShaderModule() const + { + return m_shader_module; + } + + /* + * Checks if the shader was successfully built. + * @return true if valid + */ + inline bool IsValid() const + { + return m_is_valid; + } + +private: + + std::vector CompileGLSLToSPIRV( + const std::string& glsl_source, + const char* entry_name, + glslang_stage_t stage, + std::span> defines = {}); + +private: + std::vector> m_defines; + std::vector m_spirv; + VkShaderModule m_shader_module = VK_NULL_HANDLE; + bool m_is_valid = false; +}; \ No newline at end of file diff --git a/samples/cooperative_matrix/install_apk.bat b/samples/cooperative_matrix/install_apk.bat new file mode 100644 index 0000000..62ab3c4 --- /dev/null +++ b/samples/cooperative_matrix/install_apk.bat @@ -0,0 +1,21 @@ +@echo off +cd /D "%~dp0" + +:: Get the name of the current folder (assumed to be the project name) +for %%I in ("%~dp0.") do set "project_name=%%~nxI" + +@echo. +@echo **************************************** +@echo Installing APK for project: %project_name% +@echo **************************************** + +set "apk_path=..\..\build\android\%project_name%\outputs\apk\debug\%project_name%-debug.apk" + +call adb install -r -t "%apk_path%" + +@echo. +@echo **************************************** +@echo Done! 
+@echo **************************************** + +IF "%~dpnx0"=="%0" PAUSE \ No newline at end of file diff --git a/samples/cooperative_matrix/install_config.bat b/samples/cooperative_matrix/install_config.bat new file mode 100644 index 0000000..c3d20e8 --- /dev/null +++ b/samples/cooperative_matrix/install_config.bat @@ -0,0 +1,31 @@ + +@echo off +cd /D "%~dp0" + +:: Get the name of the current folder (assumed to be the project name) +for %%I in ("%~dp0.") do set "project_name=%%~nxI" + +:: Check if app_config.txt exists +if exist "app_config.txt" ( + @echo. + @echo **************************************** + @echo Pushing app_config.txt to: /sdcard/Android/data/com.quic.%project_name%/files/ + @echo **************************************** + adb push ./app_config.txt /sdcard/Android/data/com.quic.%project_name%/files/app_config.txt + + @echo. + @echo **************************************** + @echo Done! + @echo **************************************** +) else ( + @echo. + @echo **************************************** + @echo No app_config.txt was found. + @echo It's not necessary for the app, but it can be used to override application settings. + @echo If such functionality is desired, please create the file and override the global variables + @echo according to how they are defined in the project. 
+ @echo **************************************** +) + +:: Pause only if run directly +IF "%~dpnx0"=="%0" PAUSE \ No newline at end of file diff --git a/samples/cooperative_matrix/project/android/AndroidManifest.xml b/samples/cooperative_matrix/project/android/AndroidManifest.xml new file mode 100644 index 0000000..6d317b7 --- /dev/null +++ b/samples/cooperative_matrix/project/android/AndroidManifest.xml @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/samples/cooperative_matrix/project/android/res/mipmap-hdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-hdpi/ic_launcher.png new file mode 100644 index 0000000..1b58b37 Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-hdpi/ic_launcher.png differ diff --git a/samples/cooperative_matrix/project/android/res/mipmap-mdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-mdpi/ic_launcher.png new file mode 100644 index 0000000..11acf77 Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-mdpi/ic_launcher.png differ diff --git a/samples/cooperative_matrix/project/android/res/mipmap-xhdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-xhdpi/ic_launcher.png new file mode 100644 index 0000000..b8016f2 Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-xhdpi/ic_launcher.png differ diff --git a/samples/cooperative_matrix/project/android/res/mipmap-xxhdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-xxhdpi/ic_launcher.png new file mode 100644 index 0000000..c0b9e85 Binary files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-xxhdpi/ic_launcher.png differ diff --git a/samples/cooperative_matrix/project/android/res/mipmap-xxxhdpi/ic_launcher.png b/samples/cooperative_matrix/project/android/res/mipmap-xxxhdpi/ic_launcher.png new file mode 100644 index 0000000..7df4d52 Binary 
files /dev/null and b/samples/cooperative_matrix/project/android/res/mipmap-xxxhdpi/ic_launcher.png differ diff --git a/samples/cooperative_matrix/project/android/res/values/strings.xml b/samples/cooperative_matrix/project/android/res/values/strings.xml new file mode 100644 index 0000000..2c159d9 --- /dev/null +++ b/samples/cooperative_matrix/project/android/res/values/strings.xml @@ -0,0 +1,4 @@ + + + SGS Cooperative Matrix + diff --git a/samples/cooperative_matrix/project/img/screenshot.png b/samples/cooperative_matrix/project/img/screenshot.png new file mode 100644 index 0000000..13f6aa8 Binary files /dev/null and b/samples/cooperative_matrix/project/img/screenshot.png differ diff --git a/samples/cooperative_matrix/shaders/Blit.frag b/samples/cooperative_matrix/shaders/Blit.frag new file mode 100644 index 0000000..bf6f90e --- /dev/null +++ b/samples/cooperative_matrix/shaders/Blit.frag @@ -0,0 +1,34 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +#version 400 + +#extension GL_ARB_separate_shader_objects : enable +#extension GL_ARB_shading_language_420pack : enable + +// Buffer binding locations +#define SHADER_OVERLAY_TEXTURE_LOC 0 + +layout(set = 0, binding = SHADER_OVERLAY_TEXTURE_LOC) uniform sampler2D u_OverlayTex; + +// Varying's +layout (location = 0) in vec2 v_TexCoord; +layout (location = 1) in vec4 v_VertColor; + +// Finally, the output color +layout (location = 0) out vec4 FragColor; + +//----------------------------------------------------------------------------- +void main() +//----------------------------------------------------------------------------- +{ + vec2 LocalTexCoord = vec2(v_TexCoord.xy); + vec4 OverlayColor = texture( u_OverlayTex, LocalTexCoord.xy ); + FragColor = OverlayColor; +} + diff --git a/samples/cooperative_matrix/shaders/Blit.json b/samples/cooperative_matrix/shaders/Blit.json new file mode 100644 index 0000000..f9f7df6 --- /dev/null +++ b/samples/cooperative_matrix/shaders/Blit.json @@ -0,0 +1,58 @@ +{ + "$schema": "../../../framework/schema/shaderSchema.json", + "Passes": [ + { + "Name": "RP_BLIT", + "Shaders": { + "Vertex": "Media/Shaders/Blit.vert.spv", + "Fragment": "Media/Shaders/Blit.frag.spv" + }, + "DescriptorSets": [ + { + "Buffers": [ + { + "Type": "ImageSampler", + "Stages": [ "Fragment" ], + "Count": 1, + "Names": [ "Overlay" ] + } + ] + } + ], + "VertexBindings": [ "VB0" ] + } + ], + "Vertex": [ + { + "Span": 60, + "Name": "VB0", + "Elements": [ + { + "Name": "Position", + "Offset": 0, + "Type": "Vec3" + }, + { + "Name": "Normal", + "Offset": 12, + "Type": "Vec3" + }, + { + "Name": "UV", + "Offset": 24, + "Type": "Vec2" + }, + { + "Name": "Color", + "Offset": 32, + "Type": "Vec4" + }, + { + "Name": "Tangent", + "Offset": 48, + "Type": "Vec3" + } + ] + } + ] +} diff --git 
a/samples/cooperative_matrix/shaders/Blit.vert b/samples/cooperative_matrix/shaders/Blit.vert new file mode 100644 index 0000000..d11750a --- /dev/null +++ b/samples/cooperative_matrix/shaders/Blit.vert @@ -0,0 +1,38 @@ +//============================================================================================================ +// +// +// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +#version 400 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_ARB_shading_language_420pack : enable + +#define SHADER_ATTRIB_LOC_POSITION 0 +#define SHADER_ATTRIB_LOC_NORMAL 1 +#define SHADER_ATTRIB_LOC_TEXCOORD0 2 +#define SHADER_ATTRIB_LOC_COLOR 3 +#define SHADER_ATTRIB_LOC_TANGENT 4 + +layout (location = SHADER_ATTRIB_LOC_POSITION ) in vec4 a_Position; +layout (location = SHADER_ATTRIB_LOC_NORMAL ) in vec3 a_Normal; +layout (location = SHADER_ATTRIB_LOC_TEXCOORD0) in vec2 a_TexCoord; +layout (location = SHADER_ATTRIB_LOC_COLOR ) in vec4 a_Color; +layout (location = SHADER_ATTRIB_LOC_TANGENT ) in vec4 a_Tangent; + +// Varying's +layout (location = 0) out vec2 v_TexCoord; +layout (location = 1) out vec4 v_VertColor; + +void main() +{ + // Position and text coord are simple (Except Y in inverted on screen compared to OpenGL) + vec4 TempPos = vec4(a_Position.xyz, 1.0); + gl_Position = vec4(TempPos.x, -TempPos.y, TempPos.z, TempPos.w); + v_TexCoord = vec2(a_TexCoord.xy); + + // Color is simple attribute color + v_VertColor.xyzw = vec4(a_Color.xyz, 1.0); +} diff --git a/samples/tile_memory/README.md b/samples/tile_memory/README.md new file mode 100644 index 0000000..243f385 --- /dev/null +++ b/samples/tile_memory/README.md @@ -0,0 +1,8 @@ +# Tile Memory Heap Sample + +This sample demonstrates a light clustering algorithm using Vulkan, with specific support for the 
*VK_QCOM_tile_memory_heap* extension. +This extension allows the application to allocate and manage tile memory, which is used for efficient memory management within a command buffer submission batch. + +The sample showcases how tile memory can be used to optimize rendering performance by reducing memory bandwidth and improving cache locality. It implements a forward rendering pipeline with clustered lighting, where lights are grouped based on screen-space tiles. These tiles are processed using tile-local memory allocations, enabling fast access and minimizing global memory usage. + +The rendering technique is designed to highlight the benefits of tile memory in scenarios with many dynamic lights, demonstrating how Vulkan applications can leverage Qualcomm™-specific extensions to achieve better performance on Adreno™ GPUs. \ No newline at end of file diff --git a/samples/tile_shading/README.md b/samples/tile_shading/README.md new file mode 100644 index 0000000..c787a00 --- /dev/null +++ b/samples/tile_shading/README.md @@ -0,0 +1,11 @@ +# Tile Shading Sample + +This sample demonstrates a tile-based shading technique using Vulkan, with support for the *VK_QCOM_tile_memory_heap* extension. + +The extension enables the application to allocate and manage tile-local memory, which is scoped to the duration of a command buffer submission and optimized for high-bandwidth, low-latency access within a tile. + +The sample implements a forward rendering pipeline where shading computations are performed per tile, rather than per pixel or per fragment. This approach leverages the tiling architecture of Adreno™ GPUs to reduce memory traffic and improve cache efficiency. + +By using tile memory, the sample avoids costly round-trips to global memory for intermediate shading data. Instead, lighting calculations and material evaluations are performed directly in tile-local memory, which is faster and more power-efficient. 
+ +The technique is particularly well-suited for mobile GPUs, where bandwidth and power are constrained. It demonstrates how Vulkan applications can take advantage of Qualcomm™-specific extensions to optimize rendering workloads and achieve better performance on Snapdragon™ platforms. \ No newline at end of file diff --git a/samples/tile_shading/code/main/application.cpp b/samples/tile_shading/code/main/application.cpp index d02c336..a263276 100644 --- a/samples/tile_shading/code/main/application.cpp +++ b/samples/tile_shading/code/main/application.cpp @@ -456,6 +456,7 @@ bool Application::CreateRenderTargets() gSurfaceHeight, FinalColorType, TextureFormat::UNDEFINED, + nullptr, blitColorTypes, }, "BLIT RT")) @@ -475,6 +476,7 @@ bool Application::CreateRenderTargets() gRenderHeight, TileShadingSceneColorType, desiredDepthFormat, + nullptr, tileShadingSceneColorTypes, TT_DEPTH_TARGET_LOCAL_READ, },