diff --git a/Config.txt b/Config.txt index 615d689..cd06a6c 100644 --- a/Config.txt +++ b/Config.txt @@ -30,10 +30,10 @@ framework\ imgui @ https://github.com/ocornut/imgui/archive/4f9ba19e520bea478f5cb654d37ef45e6404bd52.zip MD5:2d0aa43693cdada8abb9d49a44c1337b implot @ https://github.com/epezent/implot/archive/f156599faefe316f7dd20fe6c783bf87c8bb6fd9.zip MD5:b95d158f69b1716da2cd7c17d63bdce4 portable-file-dialogs @ https://github.com/samhocevar/portable-file-dialogs/archive/7f852d88a480020d7f91957cbcefe514fc95000c.zip MD5:ec1fd9e86f260b99a50294b8f53f872f - Vulkan-Headers @ https://github.com/KhronosGroup/Vulkan-Headers/archive/refs/tags/v1.4.328.zip + Vulkan-Headers @ https://github.com/KhronosGroup/Vulkan-Headers/archive/refs/tags/v1.4.338.zip glm @ https://github.com/g-truc/glm/archive/6ad79aae3eb5bf809c30bf1168171e9e55857e45.zip GameSampleAssets @ https://github.com/SnapdragonStudios/game-assets-for-adreno-gpu-code-samples/archive/0ef8e70049ffd9ee4f9138b43815b21f959497d0.zip - volk @ https://github.com/zeux/volk/archive/1e0ec168f1726e6389b8647435a3018f0cef9428.zip + volk @ https://github.com/zeux/volk/archive/d34b5e0d46b28c22d69b97ee7da074b6e68d9e25.zip SPIRV-Cross @ https://github.com/KhronosGroup/SPIRV-Cross/archive/7affe74d77f93a622bb5002789d5332d32e512ee.zip glslang @ https://github.com/KhronosGroup/glslang/archive/3a7f78758f8faa9a6e059b09e25fc64ede7fbfb0.zip json @ https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz MD5:4b67aba51ddf17c798e80361f527f50e diff --git a/framework/code/camera/camera.cpp b/framework/code/camera/camera.cpp index 3e681cf..1281333 100644 --- a/framework/code/camera/camera.cpp +++ b/framework/code/camera/camera.cpp @@ -113,6 +113,19 @@ void Camera::UpdateMatrices() glm::mat4 Camera::GetProjectionWithJitter(const glm::vec3 jitter) const //----------------------------------------------------------------------------- { +#if 0 glm::mat4 jm = glm::translate(jitter); return jm * m_ProjectionMatrixNoJitter; +#else + + // 
Assumes jitter is in NDC (not pixels). + // Remember to flip jitter y, if needed. + glm::mat4 jitteredProj = m_ProjectionMatrixNoJitter; + for (int col = 0; col < 4; ++col) + { + jitteredProj[col][0] += jitter.x * jitteredProj[col][3]; + jitteredProj[col][1] += jitter.y * jitteredProj[col][3]; + } + return jitteredProj; +#endif } diff --git a/framework/code/camera/cameraControllerTouch.cpp b/framework/code/camera/cameraControllerTouch.cpp index 271b49e..4496a43 100644 --- a/framework/code/camera/cameraControllerTouch.cpp +++ b/framework/code/camera/cameraControllerTouch.cpp @@ -50,6 +50,7 @@ void CameraControllerTouch::TouchDownEvent(int iPointerID, float xPos, float yPo m_LookaroundTouchId = iPointerID; m_LastLookaroundTouchPosition = { xPos, yPos }; m_CurrentLookaroundTouchPosition = m_LastLookaroundTouchPosition; + m_LookDeltaPixelsAccum = glm::vec2(0.0f); } else if (xPos < m_ScreenSize.x * 0.5f && m_MovementTouchId == -1) { @@ -65,7 +66,11 @@ void CameraControllerTouch::TouchMoveEvent(int iPointerID, float xPos, float yPo { if (iPointerID == m_LookaroundTouchId) { - m_CurrentLookaroundTouchPosition = { xPos, yPos }; + const glm::vec2 new_pos = { xPos, yPos }; + const glm::vec2 delta = new_pos - m_CurrentLookaroundTouchPosition; + + m_CurrentLookaroundTouchPosition = new_pos; + m_LookDeltaPixelsAccum += delta; } else if (iPointerID == m_MovementTouchId) { @@ -81,6 +86,7 @@ void CameraControllerTouch::TouchUpEvent(int iPointerID, float xPos, float yPos) { m_LookaroundTouchId = -1; m_CurrentLookaroundTouchPosition = { xPos, yPos }; + m_LookDeltaPixelsAccum = glm::vec2(0.0f); } else if (iPointerID == m_MovementTouchId) { @@ -94,24 +100,52 @@ void CameraControllerTouch::TouchUpEvent(int iPointerID, float xPos, float yPos) void CameraControllerTouch::Update(float frameTime, glm::vec3& position, glm::quat& rot, bool& cut) { cut = false; + if (m_LookaroundTouchId != -1) { - auto mouseDiff = m_LastLookaroundTouchPosition - m_CurrentLookaroundTouchPosition; - auto 
angleChange = mouseDiff * frameTime * m_RotateSpeed; + const float tau = glm::max(0.0001f, m_LookSmoothTauSec); + const float alpha = 1.0f - glm::exp(-frameTime / tau); // 0..1 + + const glm::vec2 applyPixels = m_LookDeltaPixelsAccum * alpha; + m_LookDeltaPixelsAccum -= applyPixels; + + const glm::vec2 ndcDelta = + { + applyPixels.x / glm::max(1.0f, m_ScreenSize.x), + applyPixels.y / glm::max(1.0f, m_ScreenSize.y) + }; + + const float yaw = -ndcDelta.x * glm::pi() * m_RotateSpeed; + const float pitch = -ndcDelta.y * glm::pi() * m_RotateSpeed; + + const glm::vec3 viewRight = rot * cVecViewRight; - m_LastLookaroundTouchPosition = m_CurrentLookaroundTouchPosition; - // one (touch) rotation axis is relative to the view direction, other is relative to world - prevents camera from 'twisting' although does introduce gimbal when looking along the UP axis and rotationg left/right. - rot = glm::angleAxis( angleChange.x, m_WorldUp ) * rot * glm::angleAxis( angleChange.y, cVecViewRight ); - rot = glm::normalize( rot ); + rot = glm::angleAxis(yaw, m_WorldUp) * rot; + rot = glm::angleAxis(pitch, viewRight) * rot; + rot = glm::normalize(rot); + } + else + { + m_LookDeltaPixelsAccum = glm::vec2(0.0f); } if (m_MovementTouchId != -1) { - auto mouseDiff = m_LastMovementTouchPosition - m_CurrentMovementTouchPosition; - auto directionChange = mouseDiff * frameTime * m_MoveSpeed * cTouchMoveSpeedMultipler; +#if 1 + const auto mouseDiff = m_LastMovementTouchPosition - m_CurrentMovementTouchPosition; + const auto directionChange = mouseDiff * frameTime * m_MoveSpeed * cTouchMoveSpeedMultipler; + + position -= rot * cVecViewRight * directionChange.x; + position += rot * cVecViewForward * directionChange.y; +#else + const glm::vec2 mouseDiff = m_LastMovementTouchPosition - m_CurrentMovementTouchPosition; + const glm::vec2 directionChange = mouseDiff * frameTime * m_MoveSpeed * cTouchMoveSpeedMultipler; position -= rot * cVecViewRight * directionChange.x; position += rot * cVecViewForward * 
directionChange.y; + + m_LastMovementTouchPosition = m_CurrentMovementTouchPosition; +#endif } } diff --git a/framework/code/camera/cameraControllerTouch.hpp b/framework/code/camera/cameraControllerTouch.hpp index 89a4614..0fe6ac5 100644 --- a/framework/code/camera/cameraControllerTouch.hpp +++ b/framework/code/camera/cameraControllerTouch.hpp @@ -47,4 +47,7 @@ class CameraControllerTouch : public CameraControllerBase int m_MovementTouchId; int m_LookaroundTouchId; + + glm::vec2 m_LookDeltaPixelsAccum = glm::vec2(0.0f); + float m_LookSmoothTauSec = 0.03f; }; diff --git a/framework/code/graphicsApi/renderTarget.hpp b/framework/code/graphicsApi/renderTarget.hpp index 5ec33fc..0060fdb 100644 --- a/framework/code/graphicsApi/renderTarget.hpp +++ b/framework/code/graphicsApi/renderTarget.hpp @@ -22,11 +22,11 @@ struct RenderTargetInitializeInfo std::span LayerFormats = {}; TextureFormat DepthFormat = TextureFormat::UNDEFINED; RenderTargetBase* InheritedDepthAttachment = nullptr; - const std::span TextureTypes = {}; - const std::optional DepthTextureType = std::nullopt; + std::span TextureTypes = {}; + std::optional DepthTextureType = std::nullopt; std::span Msaa = {}; - const std::span ResolveTextureFormats = {}; - const std::span FilterModes = {}; + std::span ResolveTextureFormats = {}; + std::span FilterModes = {}; }; class RenderTargetBase diff --git a/framework/code/texture/texture.hpp b/framework/code/texture/texture.hpp index 482cbcb..2c910c8 100644 --- a/framework/code/texture/texture.hpp +++ b/framework/code/texture/texture.hpp @@ -118,6 +118,7 @@ class ImageView final : public ImageViewBase enum TEXTURE_TYPE { TT_NORMAL = 0, + TT_SAMPLED_TRANSFERDST, TT_RENDER_TARGET, TT_RENDER_TARGET_WITH_STORAGE, TT_RENDER_TARGET_TRANSFERSRC, diff --git a/framework/code/texture/vulkan/texture.cpp b/framework/code/texture/vulkan/texture.cpp index 9f9ae88..4e62865 100644 --- a/framework/code/texture/vulkan/texture.cpp +++ b/framework/code/texture/vulkan/texture.cpp @@ -480,6 
+480,10 @@ Texture CreateTextureObject(Vulkan& vulkan, const CreateTexObjec ImageInfo.tiling = VK_IMAGE_TILING_OPTIMAL;// VK_IMAGE_TILING_LINEAR; // VK_IMAGE_TILING_OPTIMAL ImageInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT; break; + case TT_SAMPLED_TRANSFERDST: + ImageInfo.tiling = VK_IMAGE_TILING_OPTIMAL; + ImageInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; + break; case TT_RENDER_TARGET: // If using VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT then tiling MUST be VK_IMAGE_TILING_OPTIMAL ImageInfo.tiling = VK_IMAGE_TILING_OPTIMAL; @@ -581,6 +585,7 @@ Texture CreateTextureObject(Vulkan& vulkan, const CreateTexObjec vulkan.SetImageLayout(vmaImage.GetVkBuffer(), SetupCmdBuffer, VK_IMAGE_ASPECT_COLOR_BIT, ImageInfo.initialLayout, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, (VkPipelineStageFlags)0/*unused param*/, (VkPipelineStageFlags)0/*unused param*/, 0, ImageInfo.mipLevels, 0, ImageInfo.arrayLayers); retImageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; break; + case TT_SAMPLED_TRANSFERDST: case TT_NORMAL: vulkan.SetImageLayout(vmaImage.GetVkBuffer(), SetupCmdBuffer, VK_IMAGE_ASPECT_COLOR_BIT, ImageInfo.initialLayout, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, (VkPipelineStageFlags)0/*unused param*/, (VkPipelineStageFlags)0/*unused param*/, 0, ImageInfo.mipLevels, 0, ImageInfo.arrayLayers); retImageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; @@ -667,6 +672,7 @@ Texture CreateTextureObject(Vulkan& vulkan, const CreateTexObjec case TT_SHADING_RATE_IMAGE: case TT_CPU_UPDATE: case TT_NORMAL: + case TT_SAMPLED_TRANSFERDST: case TT_RENDER_TARGET: case TT_RENDER_TARGET_WITH_STORAGE: case TT_RENDER_TARGET_TRANSFERSRC: diff --git a/framework/code/vulkan/extensionLib.cpp b/framework/code/vulkan/extensionLib.cpp index 0c35d66..5c40e7a 100644 --- a/framework/code/vulkan/extensionLib.cpp +++ b/framework/code/vulkan/extensionLib.cpp @@ -452,6 +452,14 @@ namespace ExtensionLib } #endif // VK_ARM_data_graph +#if VK_QCOM_data_graph_model + void 
Ext_VK_QCOM_data_graph_model::PrintFeatures() const + { + LOGI("VK_QCOM_data_graph_model (VkPhysicalDeviceDataGraphModelFeaturesQCOM):"); + LOGI(" dataGraphModel: %s", this->AvailableFeatures.dataGraphModel ? "True" : "False"); + } +#endif // VK_QCOM_data_graph_model + #if VK_KHR_get_physical_device_properties2 void Ext_VK_KHR_get_physical_device_properties2::LookupFunctionPointers(VkInstance vkInstance) diff --git a/framework/code/vulkan/extensionLib.hpp b/framework/code/vulkan/extensionLib.hpp index 37055e6..3ed8f8a 100644 --- a/framework/code/vulkan/extensionLib.hpp +++ b/framework/code/vulkan/extensionLib.hpp @@ -564,6 +564,32 @@ namespace ExtensionLib #endif // VK_ARM_data_graph +#if VK_QCOM_data_graph_model + + struct Ext_VK_QCOM_data_graph_model : public VulkanFeaturesAndFunctionPointerExtensionHelper< + VkPhysicalDeviceDataGraphModelFeaturesQCOM, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DATA_GRAPH_MODEL_FEATURES_QCOM> + { + using tBase = VulkanFeaturesAndFunctionPointerExtensionHelper; + static constexpr auto Name = VK_QCOM_DATA_GRAPH_MODEL_EXTENSION_NAME; + + explicit Ext_VK_QCOM_data_graph_model(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : tBase(Name, status) + { + } + + void PopulateRequestedFeatures() override + { + RequestedFeatures.sType = AvailableFeatures.sType; + RequestedFeatures.dataGraphModel = AvailableFeatures.dataGraphModel; + } + + void PrintFeatures() const override; + void LookupFunctionPointers(VkInstance vkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_QCOM_data_graph_model + #if VK_QCOM_tile_properties struct Ext_VK_QCOM_tile_properties : public VulkanFeaturesAndFunctionPointerExtensionHelper< @@ -705,6 +731,146 @@ namespace ExtensionLib #endif // VK_KHR_get_physical_device_properties2 +#if VK_KHR_external_memory_capabilities + + // Instance extension: exposes capability queries (buffer/image external memory). 
+ struct Ext_VK_KHR_external_memory_capabilities : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME; + explicit Ext_VK_KHR_external_memory_capabilities(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + // Volk will resolve globally; we keep the overrides for consistency. + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_KHR_external_memory_capabilities + +#if VK_KHR_external_memory + + // Device extension: enables external memory types/flags. No commands to load. + struct Ext_VK_KHR_external_memory : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME; + explicit Ext_VK_KHR_external_memory(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_KHR_external_memory + +#if VK_KHR_external_memory_fd + + // Device extension: POSIX file descriptor handles for external memory. + struct Ext_VK_KHR_external_memory_fd : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME; + explicit Ext_VK_KHR_external_memory_fd(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_KHR_external_memory_fd + +#if defined(VK_USE_PLATFORM_WIN32_KHR) && VK_KHR_external_memory_win32 + + // Device extension: Win32 HANDLEs for external memory. 
+ struct Ext_VK_KHR_external_memory_win32 : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME; + explicit Ext_VK_KHR_external_memory_win32(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_USE_PLATFORM_WIN32_KHR && VK_KHR_external_memory_win32 + + +#if VK_KHR_external_semaphore_capabilities + + // Instance extension: exposes semaphore external capability queries. + struct Ext_VK_KHR_external_semaphore_capabilities : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME; + explicit Ext_VK_KHR_external_semaphore_capabilities(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_KHR_external_semaphore_capabilities + +#if VK_KHR_external_semaphore + + // Device extension: enables external semaphore types/flags. No commands to load. + struct Ext_VK_KHR_external_semaphore : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME; + explicit Ext_VK_KHR_external_semaphore(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_KHR_external_semaphore + + +#if VK_KHR_external_semaphore_fd + + // Device extension: POSIX file descriptor handles for external semaphores. 
+ struct Ext_VK_KHR_external_semaphore_fd : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME; + explicit Ext_VK_KHR_external_semaphore_fd(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_KHR_external_semaphore_fd + + +#if defined(VK_USE_PLATFORM_WIN32_KHR) && VK_KHR_external_semaphore_win32 + + // Device extension: Win32 HANDLEs for external semaphores. + struct Ext_VK_KHR_external_semaphore_win32 : public VulkanFunctionPointerExtensionHelper + { + static constexpr auto Name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME; + explicit Ext_VK_KHR_external_semaphore_win32(VulkanExtensionStatus status = VulkanExtensionStatus::eRequired) + : VulkanFunctionPointerExtensionHelper(Name, status) + { + } + + void LookupFunctionPointers(VkInstance) override {} + void LookupFunctionPointers(VkDevice, PFN_vkGetDeviceProcAddr) override {} + }; + +#endif // VK_USE_PLATFORM_WIN32_KHR && VK_KHR_external_semaphore_win32 + #if VK_KHR_surface // Instance extension diff --git a/framework/code/vulkan/renderContext.cpp b/framework/code/vulkan/renderContext.cpp index f3c0668..c0496b8 100644 --- a/framework/code/vulkan/renderContext.cpp +++ b/framework/code/vulkan/renderContext.cpp @@ -189,7 +189,7 @@ RenderContext::RenderPassContextData& RenderContext::RenderPassC RenderContext::RenderContext( RenderPass _renderPass, Pipeline _pipeline, Framebuffer _framebuffer, std::string _name ) noexcept //----------------------------------------------------------------------------- : v{std::move( RenderPassContextData { - std::move( _renderPass ), std::move( _pipeline ), std::move( _framebuffer ), _framebuffer.GetRenderPassClearData().Copy() + std::move( _renderPass ), std::move( _pipeline ), 
_framebuffer, _framebuffer.GetRenderPassClearData().Copy() } )} , name{std::move( _name )} { diff --git a/framework/code/vulkan/vulkan.cpp b/framework/code/vulkan/vulkan.cpp index 3a9e7d3..629a3d3 100644 --- a/framework/code/vulkan/vulkan.cpp +++ b/framework/code/vulkan/vulkan.cpp @@ -1224,6 +1224,8 @@ bool Vulkan::InitInstanceExtensions() return true; } + + //----------------------------------------------------------------------------- bool Vulkan::GetDataGraphProcessingEngine() //----------------------------------------------------------------------------- @@ -1233,105 +1235,149 @@ bool Vulkan::GetDataGraphProcessingEngine() return true; } - // If Ext_VK_ARM_data_graph->AvailableFeatures.dataGraph is supported, force graph pipeline support here while that - // isn't fully supported publicly by the driver -#if defined(OS_ANDROID) - { -#if 0 - auto* Ext_VK_ARM_tensors = static_cast(m_DeviceExtensions.GetExtension(VK_ARM_TENSORS_EXTENSION_NAME)); - auto* Ext_VK_ARM_data_graph = static_cast(m_DeviceExtensions.GetExtension(VK_ARM_DATA_GRAPH_EXTENSION_NAME)); - auto fpGetDeviceProcAddr = (PFN_vkGetDeviceProcAddr)vkGetInstanceProcAddr(GetVulkanInstance(), "vkGetDeviceProcAddr"); - if (Ext_VK_ARM_tensors - && Ext_VK_ARM_data_graph - && Ext_VK_ARM_data_graph->AvailableFeatures.dataGraph - && fpGetDeviceProcAddr) - { - LOGI("Forcing registering and enabling Graph Pipelines extensions for Android"); - - try - { - Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded; - Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr); - Ext_VK_ARM_tensors->LookupFunctionPointers(m_VulkanInstance); - - Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded; - Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanDevice, fpGetDeviceProcAddr); - Ext_VK_ARM_data_graph->LookupFunctionPointers(m_VulkanInstance); + const auto* Ext_VK_QCOM_data_graph_model = + static_cast(m_DeviceExtensions.GetExtension(VK_QCOM_DATA_GRAPH_MODEL_EXTENSION_NAME)); - 
LOGI("Forcing registering and enabling Graph Pipelines extensions for Android - Done"); - } - catch (...) - { - Ext_VK_ARM_tensors->Status = VulkanExtensionStatus::eLoaded; - Ext_VK_ARM_data_graph->Status = VulkanExtensionStatus::eLoaded; - - LOGI("Forcing registering and enabling Graph Pipelines extensions for Android - Failed, disabling EXT"); - } - } -#endif - } -#endif + const bool is_qcom_data_graph_model_supported = (Ext_VK_QCOM_data_graph_model != nullptr); LOGI("************************************"); LOGI("*** DATA GRAPH PROCESSING ENGINE ***"); LOGI("************************************"); - uint32_t propCount = 0; - vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM( + const uint32_t queue_family_index = m_VulkanQueues[Vulkan::eDataGraphQueue].QueueFamilyIndex; + + uint32_t prop_count = 0; + VkResult result = vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM( m_VulkanGpu, - m_VulkanQueues[Vulkan::eDataGraphQueue].QueueFamilyIndex, - &propCount, + queue_family_index, + &prop_count, nullptr); - std::vector dataGraphProps = std::vector(propCount); - vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM( + + if (result != VK_SUCCESS || prop_count == 0) + { + LOGW("*** No data graph properties returned (result=%d, propCount=%u). Disabling data graph.", static_cast(result), prop_count); + m_VulkanGraphicsQueueSupportsDataGraph = false; + return true; + } + + std::vector data_graph_props(prop_count); + for (auto& p : data_graph_props) + { + p.sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_DATA_GRAPH_PROPERTIES_ARM; + p.pNext = nullptr; + } + + result = vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM( m_VulkanGpu, - m_VulkanQueues[Vulkan::eDataGraphQueue].QueueFamilyIndex, - &propCount, - dataGraphProps.data()); + queue_family_index, + &prop_count, + data_graph_props.data()); + + if (result != VK_SUCCESS || prop_count == 0) + { + LOGW("*** Failed to query data graph properties (result=%d, propCount=%u). 
Disabling data graph.", static_cast(result), prop_count); + m_VulkanGraphicsQueueSupportsDataGraph = false; + return true; + } LOGI("*** Checking queue data graph props:"); - LOGI("*** \tpropCount: %d", propCount); - bool validEngineAvailable = false; - for (uint32_t j = 0; j < propCount; j++) + LOGI("*** \tpropCount: %u", prop_count); + + for (uint32_t j = 0; j < prop_count; ++j) { LOGI("*** \t\tEngine:"); - LOGI("*** \t\t\tType: 0x%x", dataGraphProps[j].engine.type); - LOGI("*** \t\t\tisForeign: %d", static_cast(dataGraphProps[j].engine.isForeign)); + LOGI("*** \t\t\tType: 0x%x", data_graph_props[j].engine.type); + LOGI("*** \t\t\tisForeign: %d", static_cast(data_graph_props[j].engine.isForeign)); LOGI("*** \t\tOperation:"); - LOGI("*** \t\t\toperationType: 0x%x", dataGraphProps[j].operation.operationType); - LOGI("*** \t\t\toperationType: %s", dataGraphProps[j].operation.name); - LOGI("*** \t\t\toperationType: %d", dataGraphProps[j].operation.version); - //if ((dataGraphProps[j].engine.type == VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_NEURAL_ARM) && - // (dataGraphProps[j].operation.operationType == VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_NEURAL_MODEL_ARM )) + LOGI("*** \t\t\toperationType: 0x%x", data_graph_props[j].operation.operationType); + LOGI("*** \t\t\tname: %s", data_graph_props[j].operation.name); + LOGI("*** \t\t\tversion: %u", data_graph_props[j].operation.version); + } + + auto Select_engine_and_op = [&]( + VkPhysicalDeviceDataGraphProcessingEngineTypeARM engine_type, + VkPhysicalDeviceDataGraphOperationTypeARM op_type) -> bool + { + + for (uint32_t j = 0; j < prop_count; ++j) { - // Should also verify operation name and version to ensure compatibility with offline compiler - m_VulkanDataGraphProcessingEngine = dataGraphProps[j].engine; - break; + auto& entry = data_graph_props[j]; + if (entry.engine.type == engine_type && entry.operation.operationType == op_type) + { + m_VulkanDataGraphProcessingEngine = entry.engine; + + LOGI("*** 
Selected engine/op:"); + LOGI("*** \tengine.type=0x%x isForeign=%d", + entry.engine.type, + static_cast(entry.engine.isForeign)); + LOGI("*** \top.type=0x%x name=%s version=%u", + entry.operation.operationType, + entry.operation.name, + entry.operation.version); + + return true; + } + } + + return false; + }; + + + bool selected = false; + + if (is_qcom_data_graph_model_supported) + { + // Prefer QCOM Neural engine + QCOM NeuralModel op, then QCOM Compute engine + QCOM NeuralModel op. + // If unavailable, try QCOM BuiltinModel op. + // + // QCOM op/engine enums are provided by VK_QCOM_data_graph_model + selected = + Select_engine_and_op(VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_NEURAL_QCOM, + VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_NEURAL_MODEL_QCOM) || + Select_engine_and_op(VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_COMPUTE_QCOM, + VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_NEURAL_MODEL_QCOM) || + Select_engine_and_op(VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_NEURAL_QCOM, + VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_BUILTIN_MODEL_QCOM) || + Select_engine_and_op(VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_COMPUTE_QCOM, + VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_BUILTIN_MODEL_QCOM); + + if (!selected) + { + LOGW("*** VK_QCOM_data_graph_model is active, but no QCOM engine/op pair was found on this queue family."); } } - LOGI("Ensuring Model <-> Device Capabilities support"); + if (!selected) { - // NOTE: Here you would normally compre the device limits with the graph you want to execute, you should - // make sure the tensor dimensions (VkPhysicalDeviceTensorPropertiesARM) are big enough to handle your model. 
+ // Fallback: first available + m_VulkanDataGraphProcessingEngine = data_graph_props[0].engine; + selected = true; + + LOGI("*** Fallback selected first engine:"); + LOGI("*** \tengine.type=0x%x isForeign=%d", + m_VulkanDataGraphProcessingEngine.type, + static_cast(m_VulkanDataGraphProcessingEngine.isForeign)); } LOGI("Checking for Tensor Storage Format Support"); { - if(HasLoadedVulkanDeviceExtension(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME)) + if (HasLoadedVulkanDeviceExtension(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME)) { - VkTensorFormatPropertiesARM tensorFmtProps = {}; - VkFormatProperties2 f32Props = {}; + VkTensorFormatPropertiesARM tensor_fmt_props = {}; + VkFormatProperties2 format_props = {}; - tensorFmtProps.sType = VK_STRUCTURE_TYPE_TENSOR_FORMAT_PROPERTIES_ARM; - f32Props.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2; - f32Props.pNext = &tensorFmtProps; + tensor_fmt_props.sType = VK_STRUCTURE_TYPE_TENSOR_FORMAT_PROPERTIES_ARM; + tensor_fmt_props.pNext = nullptr; - vkGetPhysicalDeviceFormatProperties2KHR(m_VulkanGpu, VK_FORMAT_R32_SFLOAT, &f32Props); - LOGI("*** \t\t\ttensorFmtProps.linearTilingTensorFeatures: %d", static_cast(tensorFmtProps.linearTilingTensorFeatures)); - LOGI("*** \t\t\ttensorFmtProps.optimalTilingTensorFeatures: %d", static_cast(tensorFmtProps.optimalTilingTensorFeatures)); + format_props.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2; + format_props.pNext = &tensor_fmt_props; - if ((tensorFmtProps.linearTilingTensorFeatures & VK_FORMAT_FEATURE_2_TENSOR_DATA_GRAPH_BIT_ARM) == 0) + vkGetPhysicalDeviceFormatProperties2KHR(m_VulkanGpu, VK_FORMAT_R32_SFLOAT, &format_props); + + LOGI("*** \t\t\ttensorFmtProps.linearTilingTensorFeatures: %lld", static_cast(tensor_fmt_props.linearTilingTensorFeatures)); + LOGI("*** \t\t\ttensorFmtProps.optimalTilingTensorFeatures: %lld", static_cast(tensor_fmt_props.optimalTilingTensorFeatures)); + + if ((tensor_fmt_props.linearTilingTensorFeatures & 
VK_FORMAT_FEATURE_2_TENSOR_DATA_GRAPH_BIT_ARM) == 0) { LOGI("*** \t\t\t - NOTE: Device doesn't support tensor storage format"); } @@ -1342,30 +1388,31 @@ bool Vulkan::GetDataGraphProcessingEngine() { VkPhysicalDeviceQueueFamilyDataGraphProcessingEngineInfoARM info = {}; info.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_QUEUE_FAMILY_DATA_GRAPH_PROCESSING_ENGINE_INFO_ARM; - info.queueFamilyIndex = m_VulkanQueues[Vulkan::eDataGraphQueue].QueueFamilyIndex; + info.pNext = nullptr; + info.queueFamilyIndex = queue_family_index; info.engineType = m_VulkanDataGraphProcessingEngine.type; - VkQueueFamilyDataGraphProcessingEnginePropertiesARM engineProps = {}; - engineProps.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_QUEUE_FAMILY_DATA_GRAPH_PROCESSING_ENGINE_INFO_ARM; - vkGetPhysicalDeviceQueueFamilyDataGraphProcessingEnginePropertiesARM(m_VulkanGpu, &info, &engineProps); + VkQueueFamilyDataGraphProcessingEnginePropertiesARM engine_props = {}; + engine_props.sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_DATA_GRAPH_PROCESSING_ENGINE_PROPERTIES_ARM; // sType of the output properties struct (previously set to the *_INFO_ARM value by mistake) + engine_props.pNext = nullptr; + + vkGetPhysicalDeviceQueueFamilyDataGraphProcessingEnginePropertiesARM(m_VulkanGpu, &info, &engine_props); - // NOTE: These are only needed if you are using external objects (memory, synchronization, etc.). For this sample we only - // care about Vulkan primitives, but if you are using e.g. Android buffers, you should ensure they are supported first. 
#if 0 - if ((engineProps.foreignSemaphoreHandleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) == 0) + if ((engine_props.foreignSemaphoreHandleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) == 0) { return false; } - if ((engineProps.foreignMemoryeHandleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) == 0) + if ((engine_props.foreignMemoryHandleTypes & VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) == 0) { return false; } #endif } - LOGI("************************************"); - LOGI("************************************"); - LOGI("************************************"); + LOGI("******************************************"); + LOGI("* DATA Graph Processing Engine Completed *"); + LOGI("******************************************"); return true; } diff --git a/framework/code/vulkan/vulkan.hpp b/framework/code/vulkan/vulkan.hpp index 3d74fa2..de170d0 100644 --- a/framework/code/vulkan/vulkan.hpp +++ b/framework/code/vulkan/vulkan.hpp @@ -77,11 +77,21 @@ namespace ExtensionLib { struct Ext_VK_KHR_get_memory_requirements2; struct Ext_VK_ARM_tensors; struct Ext_VK_ARM_data_graph; + struct Ext_VK_QCOM_data_graph_model; struct Vulkan_SubgroupPropertiesHook; struct Vulkan_StorageFeaturesHook; struct Ext_VK_KHR_mesh_shader; struct Ext_VK_KHR_dynamic_rendering; + struct Ext_VK_KHR_external_memory_capabilities; + struct Ext_VK_KHR_external_memory; + struct Ext_VK_KHR_external_semaphore_capabilities; + struct Ext_VK_KHR_external_semaphore; + struct Ext_VK_KHR_external_memory_fd; + struct Ext_VK_KHR_external_semaphore_fd; + struct Ext_VK_KHR_external_memory_win32; + struct Ext_VK_KHR_external_semaphore_win32; }; + namespace vk {}; class VulkanDebugCallback; enum class TextureFormat; diff --git a/samples/README.md b/samples/README.md index 17be11b..2008e3a 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,100 +1,33 @@ # Samples -Unless noted all samples run on Windows and Android. 
+Unless noted, all samples run on Windows and Android. -## [empty](empty) +## [Cooperative Matrix](cooperative_matrix) +Demonstrates **VK_KHR_cooperative_matrix** for high‑throughput matrix operations such as GEMM and convolution on Adreno™ GPUs. -Empty app. Minimal app linked against Framework. +## [Graph Pipelines](graph_pipelines) +Shows how to use **VK_ARM_tensors**, **VK_ARM_data_graph**, and **VK_QCOM_data_graph_model** to run ML‑backed image processing using Vulkan Data Graph pipelines. -## [hello-gltf](hello-gltf) +## [HDR Swapchain](hdr_swapchain) +Creates and presents to an **HDR‑capable** Vulkan swapchain, selecting HDR formats/color spaces and falling back to SDR when needed. -Scene (gltf) loading app. Implements a working scene with camera movement and minimal lightning. +## [Image Processing](image_processing) +Implements a bloom effect using **VK_QCOM_image_processing**, with a toggle to compare the extension path against a standard downsample/blur pipeline. -## [AODemo](AODemo) +## [Rotated Copy](rotated_copy) +Demonstrates **VK_QCOM_rotated_copy_commands** to perform rotated image copies on devices without rotated‑swapchain support. -Vulkan implementation of Neural Network Ambient Occlusion. +## [SGSR](sgsr) +Integrates **Snapdragon™ Game Super Resolution**, with toggles for activation and optional edge‑direction processing. -## [FrameworkTest](FrameworkTest) +## [SGSR 2](sgsr2) +Showcases **Snapdragon™ Game Super Resolution 2**, featuring the temporal upscaling **compute 3‑pass** variant optimized for Adreno. -Simple test project that initializes the Vulkan Framework and displays a textured sphere. +## [Sub Pass](sub_pass) +Highlights multi‑subpass rendering workflows, including MSAA resolve/tonemap performed inside a subpass. -## [MLClothApp](MLClothApp) - -Sample project using machine learning to lower cloth simulation cost. 
- -## [deferredLpac](deferredLpac) - -App that renders a (reasonably) complex scene using forward rendering and compute shaders. - -Where LPAC (Low Priority Asyncronous Compute) is available the Compute jobs will be done on a low priority queue during shadow pass z-buffer write. - -## [DspOffload](dspOffload) - -App illustrating how the Hexagon DSP can be used to run graphics tasks and write results to GPU accessable Android Hardware Buffers. - -## [forward](forward) - -App illustrating a resonably complex forward rendered scene. - -## [hdrSwapchain](hdrSwapchain) - -Demonstrates the use of different swapchain image formats and colorspaces. Has a gui dropdown that allows for switching buffer formats on the fly. - -Also demonstrates Qualcomm Vulkan render-pass transform extension [VK_QCOM_render_pass_transform](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_QCOM_render_pass_transform.html) - -## [rayQueryShadows](rayQueryShadows) - -Uses Vulkan Ray Tracing extension (VK_KHR_ray_tracing) to implement shadows using Ray Queries. - -Currently Windows only. - -## [rotatedCopy](rotatedCopy) - -Uses VK_QCOM_rotated_copy_commands (and VK_KHR_copy_commands2) extension to blit from a (lower resolution) intermediate render target to the device framebuffer rotated to match the devices native orientation (and thus avoiding the Android SurfaceFlinger doing an additional rotation/composition step). - -## [shaderResolve](shaderResolve) - -Uses VK_QCOM_render_pass_shader_resolve extension to implement MSAA and order-independent transparency in a deferred renderer. - -## [shaderResolveTonemap](shaderResolveTonemap) - -Uses VK_QCOM_render_pass_shader_resolve to perform a filmic tonemapping operator (on a simple forward rendered scene) as part of the MSAA resolve. - -Optionally runs the tonemap/resolve as a subpass of the main scene pass. 
Has onscreen UI controls to modify MSAA sample counts and to enable/disable the shader resolve and use of subpasses (for measuring GPU subpass/shader-resolve efficiency). - -## [atmospherics](atmospherics) - -Atmospheric lighting. - -# Configuration - -Each sample can be configured by adding an 'app_config.txt' file in the root of the relevant sample (ie samples/forward/app_config.txt). - -On Android the app_config.txt needs to be pushed to device, into /sdcard/Android/data/ANDROID_APP_ID/files/. , many samples have a batch file to do this (eg 07_InstallConfig.bat). - -If this file is missing or empty the sample application should run with 'reasonable' defaults. - -Samples share a set of common settings and can define additional settings specific to the sample's functionality. - -## Common config settings - -gFramesToRender = x - -Render a specific number of frames before exiting the app. x should be in integer. 0 (default) will render 'forever'. - -# File handling - -## Windows - -Executables are compiled to project\windows\solution\samples\APPLICATION\Debug\APPLICATION.exe - -Executables should be run from the samples\APPLICATION folder and data files (textures, models, shaders) are loaded from the Media subfolder. The Visual Studio solution is pre-configured to run the exe from the correct folder. - -## Android - -Apk application bundles are complied to build\android\APPLICATION\outputs\apk\debug\APPLICATION-debug.apk - -So long as the sample's Media files were prepared (02_PrepareMedia.bat) before building the apk, the apk is stand-alone and contains the application executable and Media files. - -If desired any files in the Media folder can be 'overridden' by copying the relevant file to /sdcard/Android/data/ANDROID_APP_ID/files/. with the expected folder path. Eg you can copy a shader file from Media\Shaders\. to /sdcard/Android/data/ANDROID_APP_ID/files/Media/Shaders/. and see your new shader code when the application is re-launched. 
+## [Tile Memory](tile_memory) +Explores tile‑local memory usage to reduce external bandwidth and improve on‑chip efficiency. +## [Tile Shading](tile_shading) +Implements tile‑friendly shading techniques designed to maximize performance on tile‑based GPU architectures. \ No newline at end of file diff --git a/samples/cooperative_matrix/README.md b/samples/cooperative_matrix/README.md index 6e44ffe..a964291 100644 --- a/samples/cooperative_matrix/README.md +++ b/samples/cooperative_matrix/README.md @@ -1,6 +1,7 @@ - # Cooperative Matrix Sample +![Screenshot](img/screenshot.png) + This sample demonstrates the use of the *VK_KHR_cooperative_matrix* extension in Vulkan to run matrix operations using GPU‑accelerated cooperative matrix arithmetic. The extension enables the application to query supported matrix tile sizes and data types, allocate the required buffers, and dispatch compute workloads that take advantage of hardware‑level cooperative matrix execution. diff --git a/samples/cooperative_matrix/code/main/application.cpp b/samples/cooperative_matrix/code/main/application.cpp index b4327fa..313ce86 100644 --- a/samples/cooperative_matrix/code/main/application.cpp +++ b/samples/cooperative_matrix/code/main/application.cpp @@ -1,15 +1,11 @@ //============================================================================================================ // // -// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
// SPDX-License-Identifier: BSD-3-Clause // //============================================================================================================ -/// -/// Sample app demonstrating the loading of a .gltf file (hello world) -/// - #include "application.hpp" #include "main/applicationEntrypoint.hpp" #include "camera/cameraController.hpp" diff --git a/samples/cooperative_matrix/code/main/application.hpp b/samples/cooperative_matrix/code/main/application.hpp index daa2400..35566dc 100644 --- a/samples/cooperative_matrix/code/main/application.hpp +++ b/samples/cooperative_matrix/code/main/application.hpp @@ -1,14 +1,10 @@ //============================================================================================================ // // -// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause // //============================================================================================================ - -/// -/// Sample app demonstrating the loading of a .gltf file (hello world) -/// #pragma once #include "main/applicationHelperBase.hpp" diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp b/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp deleted file mode 100644 index 21c2d97..0000000 --- a/samples/cooperative_matrix/code/main/cooperative_matrix_shaders.hpp +++ /dev/null @@ -1,306 +0,0 @@ -//============================================================================================================ -// -// -// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
-// SPDX-License-Identifier: BSD-3-Clause -// -//============================================================================================================ -#pragma once - -#include - -const char* Test01_MxM_Basic = R"( -#version 450 core -#pragma use_vulkan_memory_model -#extension GL_KHR_shader_subgroup_basic : enable -#extension GL_EXT_scalar_block_layout : enable -#extension GL_KHR_memory_scope_semantics : enable -#extension GL_KHR_cooperative_matrix : enable -#extension GL_EXT_buffer_reference : enable -#extension GL_EXT_control_flow_attributes : enable -#extension GL_KHR_shader_subgroup_basic : enable -#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader - -#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable - -// These specialized constants are set inside the host -layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0 -layout(constant_id = 1) const uint lsy = 2; // local_size_y set inside the host and map to constant_id = 1 -layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 -layout(constant_id = 3) const uint TOTAL_M = 1; -layout(constant_id = 4) const uint TOTAL_N = 1; -layout(constant_id = 5) const uint TOTAL_K = 1; -layout(constant_id = 6) const uint TILE_M = 1; -layout(constant_id = 7) const uint TILE_N = 1; -layout(constant_id = 8) const uint TILE_K = 1; -layout(constant_id = 9) const bool layoutA_Mfirst = false; -layout(constant_id = 10) const bool layoutB_Kfirst = false; -layout(constant_id = 11) const bool layoutC_Mfirst = false; -layout(constant_id = 12) const bool layoutR_Mfirst = false; -layout(constant_id = 13) const uint strideAinElements = 1; -layout(constant_id 
= 14) const uint strideBinElements = 1; -layout(constant_id = 15) const uint strideCinElements = 1; -layout(constant_id = 16) const uint strideRinElements = 1; - -// #defines set on compiler GLSL to SPIR-V command line: -// A_TYPE = e.g. float or float16_t -// R_TYPE = e.g. float or float16_t - -layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; -layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; -layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; -layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; - -//layout(set=0, binding=0, std430) uniform Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params; - -// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() -{ - const uint32_t block_id_m = gl_GlobalInvocationID.y; - const uint32_t block_id_n = gl_GlobalInvocationID.z; - if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; - - const uint32_t row = block_id_m * TILE_M; - const uint32_t col = block_id_n * TILE_N; - - // Initialize result matR to zero, not using matC in this shader - coopmat matR; - matR = coopmat(0.0); - - for (uint32_t step = 0; step < TOTAL_K; step += TILE_K) - { - // On each iteration, load a row of cooperative matrices from matrix A, - // load a column of cooperative matrices from matrix B, and multiply all - // pairs of those matrices. - uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step; - uint32_t subMatrixBStartInElements = layoutB_Kfirst ? 
col * strideBinElements + step : col + step * strideBinElements; - - coopmat matA; - coopMatLoad(matA, inputA.x, subMatrixAStartInElements, strideAinElements, int(layoutA_Mfirst)); - - coopmat matB; - coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst)); - - //for (int i = (gl_LocalInvocationID.x > 63 ? 20 : 0); i < 100; i++) // diable unroll, test gpu_freq, should around 1% - matR = coopMatMulAdd(matA, matB, matR); - } - - // Store results - uint32_t subMatrixRStartInElements = layoutR_Mfirst ? col * strideRinElements + row : row * strideRinElements + col; - - coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst)); -} -)"; - -const char* Test03_CONV = R"( -#version 450 core -#pragma use_vulkan_memory_model -#extension GL_KHR_shader_subgroup_basic : enable -#extension GL_EXT_scalar_block_layout : enable -#extension GL_KHR_memory_scope_semantics : enable -#extension GL_KHR_cooperative_matrix : enable -#extension GL_EXT_buffer_reference : enable -#extension GL_EXT_control_flow_attributes : enable -#extension GL_KHR_shader_subgroup_basic : enable -#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader - -#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable -#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable -#extension GL_QCOM_cooperative_matrix_conversion : enable - -// These specialized constants are set inside the host -layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0 -layout(constant_id = 1) const uint lsy = 2; // local_size_y set inside the host and map to constant_id = 1 -layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 -layout(constant_id = 3) 
const uint TOTAL_M = 1; -layout(constant_id = 4) const uint TOTAL_N = 1; -layout(constant_id = 5) const uint TOTAL_K = 1; -layout(constant_id = 6) const uint TILE_M = 1; -layout(constant_id = 7) const uint TILE_N = 1; -layout(constant_id = 8) const uint TILE_K = 1; -layout(constant_id = 9) const uint INPUT_W = 1; -layout(constant_id = 10) const uint INPUT_H = 1; -layout(constant_id = 11) const uint FILTER_W = 1; -layout(constant_id = 12) const uint FILTER_H = 1; -layout(constant_id = 13) const uint DILATION = 1; -layout(constant_id = 14) const uint STRIDE = 1; -layout(constant_id = 15) const uint strideAinElements = 1; -layout(constant_id = 16) const uint strideBinElements = 1; -layout(constant_id = 17) const uint strideCinElements = 1; -layout(constant_id = 18) const uint strideRinElements = 1; - -// #defines set on compiler GLSL to SPIR-V command line: -// A_TYPE = e.g. float or float16_t -// R_TYPE = e.g. float or float16_t - -layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; -layout(set=0, binding=0) readonly buffer InputAuint { uint32_t x[]; } inputAuint; -layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; -layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; -layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; - -// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() -{ - const uint32_t block_id_m = gl_GlobalInvocationID.y; - const uint32_t block_id_n = gl_GlobalInvocationID.z; - if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; - - const uint32_t row = block_id_m * TILE_M; - const uint32_t col = block_id_n * TILE_N; - - uint32_t gidx_m = gl_GlobalInvocationID.x + TILE_M * gl_GlobalInvocationID.y; // fibers along M - uint32_t out_col_id = gidx_m % INPUT_W; - uint32_t out_row_id = gidx_m / INPUT_W; - - uint32_t 
filter_offset_h = (FILTER_H % 2 == 0)? 0 : FILTER_H/2; - uint32_t filter_offset_w = (FILTER_W % 2 == 0)? 0 : FILTER_W/2; - - // Initialize result matR to zero, not using matC in this shader - coopmat matR; - matR = coopmat(0.0); - - for (uint32_t step = 0; step < TOTAL_K; step += TILE_K) - { - uint32_t subMatrixBStartInElements = col * FILTER_H * FILTER_W * strideBinElements + step; // B is Kfirst - for (uint32_t filter_row = 0; filter_row < FILTER_H; filter_row++) - { - for (uint32_t filter_col = 0; filter_col < FILTER_W; filter_col++) - { - coopmat matB; - coopmat matA; - - // load B matrix input data using coop_mat extension - coopMatLoad(matB, inputB.x, subMatrixBStartInElements, FILTER_H * FILTER_W * strideBinElements, int(true)); - - // load A matrix input data as vectors using regular vector load - uint32_t input_row_id = STRIDE * out_row_id + DILATION * (filter_row - filter_offset_h); - uint32_t input_col_id = STRIDE * out_col_id + DILATION * (filter_col - filter_offset_w); - - // load A vector data from memory - uint32_t vecA[TILE_K/NUM_PACK]; - for (int i=0; i= INPUT_H) || (input_col_id < 0) || (input_col_id >= INPUT_W)) - for (int i=0; i= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; - - const uint32_t row = block_id_m * TILE_M; - const uint32_t col = block_id_n * TILE_N; - - // Initialize result matR to zero, not using matC in this shader - coopmat matR; - matR = coopmat(0.0); - - for (uint32_t step = 0; step < TOTAL_K; step += 8) - { - // On each iteration, load a row of cooperative matrices from matrix A, - // load a column of cooperative matrices from matrix B, and multiply all - // pairs of those matrices. - uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step; - uint32_t subMatrixBStartInElements = layoutB_Kfirst ? 
col * strideBinElements + step : col + step * strideBinElements; - - coopmat matA; - - uint32_t uvecA[8]; - for (int i=0; i<8; i++) - uvecA[i] = floatBitsToInt(inputA.x[subMatrixAStartInElements + gl_GlobalInvocationID.x * strideAinElements + i]); - - // convert A vector to A matrix - vectorToCoopmatQCOM(uvecA, matA); - - coopmat matB; - coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst)); - - matR = coopMatMulAdd(matA, matB, matR); - } - - // Store results - uint32_t subMatrixRStartInElements = layoutR_Mfirst ? col * strideRinElements + row : row * strideRinElements + col; - - coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst)); -} -)"; \ No newline at end of file diff --git a/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp index 1573497..03a46cb 100644 --- a/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp +++ b/samples/cooperative_matrix/code/main/cooperative_matrix_tester.cpp @@ -7,12 +7,16 @@ //============================================================================================================ #include "cooperative_matrix_tester.hpp" -#include "cooperative_matrix_shaders.hpp" #include "vulkan/extensionHelpers.hpp" #include "vulkan/extensionLib.hpp" #include <../external/glslang/glslang/Include/glslang_c_interface.h> #include <../external/glslang/glslang/Public/resource_limits_c.h> +// Runtime shaders +#include "runtime_shaders/MxM_Basic.hpp" +#include "runtime_shaders/MxM_VecToMat.hpp" +#include "runtime_shaders/Conv.hpp" + #pragma push_macro("BOOL") #define BOOL HALF_BOOL #include "half/half.h" @@ -552,14 +556,14 @@ bool CooperativeMatrixRunner::InitializeRunner() VK_COMPONENT_TYPE_FLOAT32_KHR , VK_COMPONENT_TYPE_FLOAT32_KHR , { - {8, 6, 128, // SizeInBlocks - 0, 64, 0}, // Size (tile) + {8, 6, 128, // SizeInBlocks + 64, 64, 8}, // Size (tile) - {8, 12, 128, - 
0, 32, 0}, + {8, 12, 128, + 64, 32, 16}, - {8, 24, 128, - 0, 16, 0} + {8, 24, 128, + 64, 16, 32} } }); m_test_group_templates.push_back(TestGroupTemplateDescription{ diff --git a/samples/cooperative_matrix/code/main/runtime_shader.cpp b/samples/cooperative_matrix/code/main/runtime_shader.cpp index dec1696..f91b55d 100644 --- a/samples/cooperative_matrix/code/main/runtime_shader.cpp +++ b/samples/cooperative_matrix/code/main/runtime_shader.cpp @@ -1,15 +1,11 @@ //============================================================================================================ // // -// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause // //============================================================================================================ -/// -/// Sample app demonstrating the loading of a .gltf file (hello world) -/// - #include "runtime_shader.hpp" #include "main/applicationEntrypoint.hpp" #include "camera/cameraController.hpp" diff --git a/samples/cooperative_matrix/code/main/runtime_shader.hpp b/samples/cooperative_matrix/code/main/runtime_shader.hpp index 4f8ee83..dc4b29e 100644 --- a/samples/cooperative_matrix/code/main/runtime_shader.hpp +++ b/samples/cooperative_matrix/code/main/runtime_shader.hpp @@ -1,14 +1,10 @@ //============================================================================================================ // // -// Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
// SPDX-License-Identifier: BSD-3-Clause // //============================================================================================================ - -/// -/// Sample app demonstrating the loading of a .gltf file (hello world) -/// #pragma once #include diff --git a/samples/cooperative_matrix/code/main/runtime_shaders/Conv.hpp b/samples/cooperative_matrix/code/main/runtime_shaders/Conv.hpp new file mode 100644 index 0000000..53e44ec --- /dev/null +++ b/samples/cooperative_matrix/code/main/runtime_shaders/Conv.hpp @@ -0,0 +1,124 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include + +const char* Test03_CONV = R"( +#version 450 core +#pragma use_vulkan_memory_model +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader + +#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable +#extension GL_QCOM_cooperative_matrix_conversion : enable + +// These specialized constants are set inside the host +layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0 +layout(constant_id = 1) const 
uint lsy = 2; // local_size_y set inside the host and map to constant_id = 1 +layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 +layout(constant_id = 3) const uint TOTAL_M = 1; +layout(constant_id = 4) const uint TOTAL_N = 1; +layout(constant_id = 5) const uint TOTAL_K = 1; +layout(constant_id = 6) const uint TILE_M = 1; +layout(constant_id = 7) const uint TILE_N = 1; +layout(constant_id = 8) const uint TILE_K = 1; +layout(constant_id = 9) const uint INPUT_W = 1; +layout(constant_id = 10) const uint INPUT_H = 1; +layout(constant_id = 11) const uint FILTER_W = 1; +layout(constant_id = 12) const uint FILTER_H = 1; +layout(constant_id = 13) const uint DILATION = 1; +layout(constant_id = 14) const uint STRIDE = 1; +layout(constant_id = 15) const uint strideAinElements = 1; +layout(constant_id = 16) const uint strideBinElements = 1; +layout(constant_id = 17) const uint strideCinElements = 1; +layout(constant_id = 18) const uint strideRinElements = 1; + +// #defines set on compiler GLSL to SPIR-V command line: +// A_TYPE = e.g. float or float16_t +// R_TYPE = e.g. 
float or float16_t + +layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; +layout(set=0, binding=0) readonly buffer InputAuint { uint32_t x[]; } inputAuint; +layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; +layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; +layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; + +// Set work-group size at dispacth time using specialized constant_id 0,1,2, see host source code for detail +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() +{ + const uint32_t block_id_m = gl_GlobalInvocationID.y; + const uint32_t block_id_n = gl_GlobalInvocationID.z; + if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; + + const uint32_t row = block_id_m * TILE_M; + const uint32_t col = block_id_n * TILE_N; + + uint32_t gidx_m = gl_GlobalInvocationID.x + TILE_M * gl_GlobalInvocationID.y; // fibers along M + uint32_t out_col_id = gidx_m % INPUT_W; + uint32_t out_row_id = gidx_m / INPUT_W; + + uint32_t filter_offset_h = (FILTER_H % 2 == 0)? 0 : FILTER_H/2; + uint32_t filter_offset_w = (FILTER_W % 2 == 0)? 
0 : FILTER_W/2; + + // Initialize result matR to zero, not using matC in this shader + coopmat matR; + matR = coopmat(0.0); + + for (uint32_t step = 0; step < TOTAL_K; step += TILE_K) + { + uint32_t subMatrixBStartInElements = col * FILTER_H * FILTER_W * strideBinElements + step; // B is Kfirst + for (uint32_t filter_row = 0; filter_row < FILTER_H; filter_row++) + { + for (uint32_t filter_col = 0; filter_col < FILTER_W; filter_col++) + { + coopmat matB; + coopmat matA; + + // load B matrix input data using coop_mat extension + coopMatLoad(matB, inputB.x, subMatrixBStartInElements, FILTER_H * FILTER_W * strideBinElements, int(true)); + + // load A matrix input data as vectors using regular vector load + uint32_t input_row_id = STRIDE * out_row_id + DILATION * (filter_row - filter_offset_h); + uint32_t input_col_id = STRIDE * out_col_id + DILATION * (filter_col - filter_offset_w); + + // load A vector data from memory + uint32_t vecA[TILE_K/NUM_PACK]; + for (int i=0; i= INPUT_H) || (input_col_id < 0) || (input_col_id >= INPUT_W)) + for (int i=0; i + +const char* Test01_MxM_Basic = R"( +#version 450 core +#pragma use_vulkan_memory_model +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader + +#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable + +// These specialized constants are set inside the host +layout(constant_id = 0) const uint lsx = 64; // local_size_x 
set inside the host and map to constant_id = 0 +layout(constant_id = 1) const uint lsy = 2; // local_size_y set inside the host and map to constant_id = 1 +layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 +layout(constant_id = 3) const uint TOTAL_M = 1; +layout(constant_id = 4) const uint TOTAL_N = 1; +layout(constant_id = 5) const uint TOTAL_K = 1; +layout(constant_id = 6) const uint TILE_M = 1; +layout(constant_id = 7) const uint TILE_N = 1; +layout(constant_id = 8) const uint TILE_K = 1; +layout(constant_id = 9) const bool layoutA_Mfirst = false; +layout(constant_id = 10) const bool layoutB_Kfirst = false; +layout(constant_id = 11) const bool layoutC_Mfirst = false; +layout(constant_id = 12) const bool layoutR_Mfirst = false; +layout(constant_id = 13) const uint strideAinElements = 1; +layout(constant_id = 14) const uint strideBinElements = 1; +layout(constant_id = 15) const uint strideCinElements = 1; +layout(constant_id = 16) const uint strideRinElements = 1; + +// #defines set on compiler GLSL to SPIR-V command line: +// A_TYPE = e.g. float or float16_t +// R_TYPE = e.g. 
float or float16_t + +layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; +layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; +layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; +layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; + +//layout(set=0, binding=0, std430) uniform Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params; + +// Set work-group size at dispatch time using specialization constant_id 0,1,2, see host source code for details +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() +{ + const uint32_t block_id_m = gl_GlobalInvocationID.y; + const uint32_t block_id_n = gl_GlobalInvocationID.z; + if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; + + const uint32_t row = block_id_m * TILE_M; + const uint32_t col = block_id_n * TILE_N; + + // Initialize result matR to zero, not using matC in this shader + coopmat matR; + matR = coopmat(0.0); + + for (uint32_t step = 0; step < TOTAL_K; step += TILE_K) + { + // On each iteration, load a row of cooperative matrices from matrix A, + // load a column of cooperative matrices from matrix B, and multiply all + // pairs of those matrices. + uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step; + uint32_t subMatrixBStartInElements = layoutB_Kfirst ? col * strideBinElements + step : col + step * strideBinElements; + + coopmat matA; + coopMatLoad(matA, inputA.x, subMatrixAStartInElements, strideAinElements, int(layoutA_Mfirst)); + + coopmat matB; + coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst)); + + //for (int i = (gl_LocalInvocationID.x > 63 ? 20 : 0); i < 100; i++) // disable unroll, test gpu_freq, should be around 1% + matR = coopMatMulAdd(matA, matB, matR); + } + + // Store results + uint32_t subMatrixRStartInElements = layoutR_Mfirst ? 
col * strideRinElements + row : row * strideRinElements + col; + + coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst)); +} +)"; \ No newline at end of file diff --git a/samples/cooperative_matrix/code/main/runtime_shaders/MxM_VecToMat.hpp b/samples/cooperative_matrix/code/main/runtime_shaders/MxM_VecToMat.hpp new file mode 100644 index 0000000..f438f25 --- /dev/null +++ b/samples/cooperative_matrix/code/main/runtime_shaders/MxM_VecToMat.hpp @@ -0,0 +1,104 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include + +const char* Test02_MxM_VecToMat = R"( +#version 450 core +#pragma use_vulkan_memory_model +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_debug_printf : enable // Enable this extension if you want to use printf() inside the shader + +#extension GL_EXT_shader_explicit_arithmetic_types_float32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable +#extension GL_QCOM_cooperative_matrix_conversion : enable + +// These specialized constants are set inside the host +layout(constant_id = 0) const uint lsx = 64; // local_size_x set inside the host and map to constant_id = 0 +layout(constant_id = 1) const uint lsy = 2; // local_size_y 
set inside the host and map to constant_id = 1 +layout(constant_id = 2) const uint lsz = 2; // local_size_z set inside the host and map to constant_id = 2 +layout(constant_id = 3) const uint TOTAL_M = 1; +layout(constant_id = 4) const uint TOTAL_N = 1; +layout(constant_id = 5) const uint TOTAL_K = 1; +layout(constant_id = 6) const uint TILE_M = 1; +layout(constant_id = 7) const uint TILE_N = 1; +layout(constant_id = 8) const uint TILE_K = 1; +layout(constant_id = 9) const bool layoutA_Mfirst = false; +layout(constant_id = 10) const bool layoutB_Kfirst = false; +layout(constant_id = 11) const bool layoutC_Mfirst = false; +layout(constant_id = 12) const bool layoutR_Mfirst = false; +layout(constant_id = 13) const uint strideAinElements = 1; +layout(constant_id = 14) const uint strideBinElements = 1; +layout(constant_id = 15) const uint strideCinElements = 1; +layout(constant_id = 16) const uint strideRinElements = 1; + +// #defines set on compiler GLSL to SPIR-V command line: +// A_TYPE = e.g. float or float16_t +// R_TYPE = e.g. 
float or float16_t + +layout(set=0, binding=0) readonly buffer InputA { A_TYPE x[]; } inputA; +layout(set=0, binding=1) readonly buffer InputB { A_TYPE x[]; } inputB; +layout(set=0, binding=2) readonly buffer InputC { R_TYPE x[]; } inputC; +layout(set=0, binding=3) buffer Output { R_TYPE x[]; } outputO; + +//layout(set=0, binding=0, std430) uniform Params { InputA inputA; InputB inputB; InputC inputC; Output outputO; } params; + +// Set work-group size at dispatch time using specialization constant_id 0,1,2, see host source code for details +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() +{ + const uint32_t block_id_m = gl_GlobalInvocationID.y; + const uint32_t block_id_n = gl_GlobalInvocationID.z; + if ((block_id_m >= TOTAL_M/TILE_M) || (block_id_n >= TOTAL_N/TILE_N)) return; + + const uint32_t row = block_id_m * TILE_M; + const uint32_t col = block_id_n * TILE_N; + + // Initialize result matR to zero, not using matC in this shader + coopmat matR; + matR = coopmat(0.0); + + for (uint32_t step = 0; step < TOTAL_K; step += 8) + { + // On each iteration, load a row of cooperative matrices from matrix A, + // load a column of cooperative matrices from matrix B, and multiply all + // pairs of those matrices. + uint32_t subMatrixAStartInElements = layoutA_Mfirst ? row + step * strideAinElements : row * strideAinElements + step; + uint32_t subMatrixBStartInElements = layoutB_Kfirst ? col * strideBinElements + step : col + step * strideBinElements; + + coopmat matA; + + uint32_t uvecA[8]; + for (int i=0; i<8; i++) + uvecA[i] = floatBitsToInt(inputA.x[subMatrixAStartInElements + gl_GlobalInvocationID.x * strideAinElements + i]); + + // convert A vector to A matrix + vectorToCoopmatQCOM(uvecA, matA); + + coopmat matB; + coopMatLoad(matB, inputB.x, subMatrixBStartInElements, strideBinElements, int(layoutB_Kfirst)); + + matR = coopMatMulAdd(matA, matB, matR); + } + + // Store results + uint32_t subMatrixRStartInElements = layoutR_Mfirst ? 
col * strideRinElements + row : row * strideRinElements + col; + + coopMatStore(matR, outputO.x, subMatrixRStartInElements, strideRinElements, int(layoutR_Mfirst)); +} +)"; \ No newline at end of file diff --git a/samples/cooperative_matrix/img/screenshot.png b/samples/cooperative_matrix/img/screenshot.png new file mode 100644 index 0000000..db6a3a2 Binary files /dev/null and b/samples/cooperative_matrix/img/screenshot.png differ diff --git a/samples/graph_pipelines/CMakeLists.txt b/samples/graph_pipelines/CMakeLists.txt index e1e1eab..8b4af63 100644 --- a/samples/graph_pipelines/CMakeLists.txt +++ b/samples/graph_pipelines/CMakeLists.txt @@ -9,6 +9,15 @@ set(CMAKE_CXX_STANDARD 20) set(CPP_SRC code/main/application.cpp code/main/application.hpp + code/main/ml/GraphPipelineTypes.hpp + code/main/ml/DataGraphPipeline.hpp + code/main/ml/DataGraphPipeline.cpp + code/main/ml/GraphDispatch.hpp + code/main/ml/GraphDispatch.cpp + code/main/ml/QcomDataGraphModel.hpp + code/main/ml/QcomDataGraphModel.cpp + code/main/ml/TensorResources.hpp + code/main/ml/TensorResources.cpp ) # diff --git a/samples/graph_pipelines/README.md b/samples/graph_pipelines/README.md new file mode 100644 index 0000000..5fb5be6 --- /dev/null +++ b/samples/graph_pipelines/README.md @@ -0,0 +1,9 @@ +# Graph Pipelines + +![Screenshot](img/screenshot.png) + +This sample demonstrates how to use the **VK_ARM_tensors**, **VK_ARM_data_graph**, and **VK_QCOM_data_graph_model** Vulkan extensions to run an ML‑powered image‑upscaling pipeline on supported hardware. + +The app renders the scene at a lower resolution, copies the result into tensor objects, dispatches a **Data Graph** pipeline (optionally backed by Qualcomm’s model‑import path), and writes the upscaled output back into an image for display. When these extensions are unavailable or disabled, the sample falls back to a standard GPU blit. 
+ +The sample highlights how Vulkan applications can integrate tensor operations, graph pipelines, and Qualcomm™‑specific model execution paths to achieve real‑time upscaling performance on Adreno™ GPUs. diff --git a/samples/graph_pipelines/code/main/application.cpp b/samples/graph_pipelines/code/main/application.cpp index ec0d80c..905595c 100644 --- a/samples/graph_pipelines/code/main/application.cpp +++ b/samples/graph_pipelines/code/main/application.cpp @@ -1,7 +1,7 @@ //============================================================================================================ // // -// Copyright (c) 2025, Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause // //============================================================================================================ @@ -33,17 +33,6 @@ #include #include -/* -* # Major Keypoints: -* -* - For the Data Graph queue selection, see "Vulkan::InitDataGraph()". -* - For the Data Graph Processing Engine selection, see "bool Vulkan::GetDataGraphProcessingEngine()". -* - For command pool initialization, see "bool Vulkan::InitCommandPools()". -* -* -* -*/ - namespace { static constexpr std::array sRenderPassNames = { "RP_SCENE", "RP_HUD", "RP_BLIT" }; @@ -57,7 +46,7 @@ namespace float gNormalAmount = 0.3f; float gNormalMirrorReflectAmount = 0.05f; - const char* gSceneAssetGraphModel = "PipelineCache.bin"; + const char* gSceneAssetGraphModel = "PipelineCache.bin"; // What we expect to load the model via VK_QCOM_data_graph_model const char* gSceneAssetModel = "SteamPunkSauna.gltf"; static uint32_t FindMemoryType(VkPhysicalDevice& physicalDevice, uint32_t type_bits, VkMemoryPropertyFlags properties) @@ -75,11 +64,6 @@ namespace }; } -/// -/// @brief Implementation of the Application entrypoint (called by the framework) -/// @return Pointer to Application (derived from @FrameworkApplicationBase). 
-/// Creates the Application class. Ownership is passed to the calling (framework) function. -/// FrameworkApplicationBase* Application_ConstructApplication() { return new Application(); @@ -102,12 +86,9 @@ void Application::PreInitializeSetVulkanConfiguration(Vulkan::AppConfiguration& config.RequiredExtension(); config.RequiredExtension(); - // config.RequiredExtension(); - // config.RequiredExtension(); - // config.RequiredExtension(); - config.OptionalExtension(); config.OptionalExtension(); + config.OptionalExtension(); } //----------------------------------------------------------------------------- @@ -121,9 +102,9 @@ bool Application::Initialize(uintptr_t windowHandle, uintptr_t hInstance) m_IsGraphPipelinesSupported &= GetVulkan()->HasLoadedVulkanDeviceExtension(VK_ARM_TENSORS_EXTENSION_NAME) && GetVulkan()->HasLoadedVulkanDeviceExtension(VK_ARM_DATA_GRAPH_EXTENSION_NAME); - - // If Ext_VK_ARM_data_graph->AvailableFeatures.dataGraph is supported, force graph pipeline support here while that - // isn't fully supported publicly by the driver + + // If Ext_VK_ARM_data_graph->AvailableFeatures.dataGraph is supported, force graph pipeline support here in case the + // driver has support but it isn't exposed #if defined(OS_ANDROID) { auto* Ext_VK_ARM_tensors = static_cast(GetVulkan()->m_DeviceExtensions.GetExtension(VK_ARM_TENSORS_EXTENSION_NAME)); @@ -223,6 +204,7 @@ void Application::Destroy() // Uniform Buffers ReleaseUniformBuffer(pVulkan, &m_ObjectVertUniform); ReleaseUniformBuffer(pVulkan, &m_LightUniform); + ReleaseUniformBuffer(pVulkan, &m_BlitFragUniform); for (auto& [hash, objectUniform] : m_ObjectFragUniforms) { @@ -335,304 +317,25 @@ bool Application::CreateTensors() const int64_t componentsPerPixel = 3; // R8G8B8_UNORM and Model is RGB - m_InputTensor.strides = { componentsPerPixel * m_RenderResolution.x, componentsPerPixel, 1 }; - m_InputTensor.dimensions = { m_RenderResolution.y, m_RenderResolution.x, componentsPerPixel }; - 
m_InputTensor.portBindingIndex = m_QNNInputPortBinding; + m_InputTensor.strides = { componentsPerPixel * m_RenderResolution.x, componentsPerPixel, 1 }; + m_InputTensor.dimensions = { m_RenderResolution.y, m_RenderResolution.x, componentsPerPixel }; + m_InputTensor.port_binding_index = m_QNNInputPortBinding; - m_OutputTensor.strides = { componentsPerPixel * m_UpscaledResolution.x, componentsPerPixel, 1 }; - m_OutputTensor.dimensions = { m_UpscaledResolution.y, m_UpscaledResolution.x, componentsPerPixel }; - m_OutputTensor.portBindingIndex = m_QNNOutputPortBinding; + m_OutputTensor.strides = { componentsPerPixel * m_UpscaledResolution.x, componentsPerPixel, 1 }; + m_OutputTensor.dimensions = { m_UpscaledResolution.y, m_UpscaledResolution.x, componentsPerPixel }; + m_OutputTensor.port_binding_index = m_QNNOutputPortBinding; - auto CreateTensor = [&](GraphPipelineTensor& targetTensor) -> bool - { - const uint32_t bufferSize = targetTensor.dimensions[0] * targetTensor.dimensions[1] * targetTensor.dimensions[2]; - - // TENSOR OBJECT // - - LOGI("Creating Tensors Object"); - - targetTensor.tensorDescription = VkTensorDescriptionARM - { - .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, - .pNext = nullptr, - .tiling = VK_TENSOR_TILING_LINEAR_ARM, // VK_TENSOR_TILING_OPTIMAL_ARM TODO: Find out why it cannot be optimal - .format = VK_FORMAT_R8_UNORM, - .dimensionCount = static_cast(targetTensor.dimensions.size()), - .pDimensions = targetTensor.dimensions.data(), - .pStrides = nullptr, - .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM/* | VK_TENSOR_USAGE_SHADER_BIT_ARM*/ - }; - - VkExternalMemoryTensorCreateInfoARM externalInfo - { - .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_TENSOR_CREATE_INFO_ARM, - .pNext = nullptr, - .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, - }; - - VkTensorCreateInfoARM tensorInfo = - { - .sType = VK_STRUCTURE_TYPE_TENSOR_CREATE_INFO_ARM, - .pNext = &externalInfo, - .flags = 0, - .pDescription = 
&targetTensor.tensorDescription, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr - }; - - if (vkCreateTensorARM(vulkan.m_VulkanDevice, &tensorInfo, nullptr, &targetTensor.tensor) != VK_SUCCESS) - { - return false; - } - - // TENSOR MEMORY REQUIREMENTS // - -#if 0 - VkMemoryDedicatedAllocateInfoTensorARM dedicatedInfo = - { - .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_TENSOR_ARM, - .pNext = nullptr, - .tensor = targetTensor.tensor - }; -#endif - -#if 0 - VkTensorMemoryRequirementsInfoARM memReqInfo = - { - .sType = VK_STRUCTURE_TYPE_TENSOR_MEMORY_REQUIREMENTS_INFO_ARM, - .pNext = nullptr, - .tensor = targetTensor.tensor - }; - - VkMemoryRequirements2 memReq = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 }; - vkGetTensorMemoryRequirementsARM(vulkan.m_VulkanDevice, &memReqInfo, &memReq); -#else - - VkDeviceTensorMemoryRequirementsARM deviceMemReqInfo = - { - .sType = VK_STRUCTURE_TYPE_DEVICE_TENSOR_MEMORY_REQUIREMENTS_ARM, - .pNext = nullptr, - .pCreateInfo = &tensorInfo - }; - VkMemoryRequirements2 memReq = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 }; - vkGetDeviceTensorMemoryRequirementsARM(vulkan.m_VulkanDevice, &deviceMemReqInfo, &memReq); -#endif - - // TENSOR ALIASED BUFFER // - - LOGI("Creating Tensor Aliased Buffer - Tensor Size: %d - Buffer Size: %d", memReq.memoryRequirements.size, bufferSize); - - // Create buffer with aliasing usage - VkBufferCreateInfo bufferInfo = - { - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = bufferSize, - .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT/* | VK_BUFFER_USAGE_2_DATA_GRAPH_FOREIGN_DESCRIPTOR_BIT_ARM*/, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE - }; - - if (vkCreateBuffer(vulkan.m_VulkanDevice, &bufferInfo, nullptr, &targetTensor.aliasedBuffer) != VK_SUCCESS) - { - return false; - } - - // TENSOR MEMORY // - - LOGI("Allocating Tensor Memory"); - - VkExportMemoryAllocateInfo 
exportAllocInfo = - { - .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, - .pNext = nullptr, - .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, - }; - - VkMemoryRequirements bufferMemReq = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 }; - vkGetBufferMemoryRequirements(vulkan.m_VulkanDevice, targetTensor.aliasedBuffer, &bufferMemReq); - - VkMemoryAllocateInfo allocInfo = - { - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = &exportAllocInfo, - .allocationSize = bufferMemReq.size, - .memoryTypeIndex = FindMemoryType(vulkan.m_VulkanGpu, bufferMemReq.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) - }; - - if (vkAllocateMemory(vulkan.m_VulkanDevice, &allocInfo, nullptr, &targetTensor.tensorMemory) != VK_SUCCESS) - { - return false; - } - - VkBindTensorMemoryInfoARM bindInfo = - { - .sType = VK_STRUCTURE_TYPE_BIND_TENSOR_MEMORY_INFO_ARM, - .pNext = nullptr, - .tensor = targetTensor.tensor, - .memory = targetTensor.tensorMemory, - .memoryOffset = 0 - }; - - LOGI("Binding Tensor Memory"); - - if(vkBindTensorMemoryARM(vulkan.m_VulkanDevice, 1, &bindInfo) != VK_SUCCESS) - { - return false; - } - - LOGI("Binding Aliased Buffer Memory"); - - if (vkBindBufferMemory(vulkan.m_VulkanDevice, targetTensor.aliasedBuffer, targetTensor.tensorMemory, 0) != VK_SUCCESS) - { - return false; - } - - // TENSOR VIEW // - - LOGI("Creating Tensor View"); - - VkTensorViewCreateInfoARM viewInfo = - { - .sType = VK_STRUCTURE_TYPE_TENSOR_VIEW_CREATE_INFO_ARM, - .pNext = nullptr, - .flags = 0, - .tensor = targetTensor.tensor, - .format = targetTensor.tensorDescription.format - }; - - if (vkCreateTensorViewARM(vulkan.m_VulkanDevice, &viewInfo, nullptr, &targetTensor.tensorView) != VK_SUCCESS) - { - return false; - } - - return true; - }; - - if (!CreateTensor(m_InputTensor)) - { - return false; - } - - if (!CreateTensor(m_OutputTensor)) - { - return false; - } - - LOGI("Creating Tensors Descriptor Pool"); - - 
VkDataGraphProcessingEngineCreateInfoARM engineInfo = {}; - VkDescriptorPoolCreateInfo descPoolInfo = {}; - - engineInfo.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PROCESSING_ENGINE_CREATE_INFO_ARM; - engineInfo.processingEngineCount = 1; - engineInfo.pProcessingEngines = &vulkan.m_VulkanDataGraphProcessingEngine; - - VkDescriptorPoolSize pool = {}; - - pool.type = VK_DESCRIPTOR_TYPE_TENSOR_ARM; - pool.descriptorCount = m_QNNMaxPortIndex + 1; - - descPoolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - descPoolInfo.pNext = &engineInfo; - descPoolInfo.maxSets = 1; - descPoolInfo.poolSizeCount = 1; - descPoolInfo.pPoolSizes = &pool; - if (vkCreateDescriptorPool(vulkan.m_VulkanDevice, &descPoolInfo, NULL, &m_TensorDescriptorPool) != VK_SUCCESS) - { - return false; - } - - // TENSOR DESCRIPTOR SET LAYOUT // - - LOGI("Creating Tensor Descriptor Set Layout"); - - VkDescriptorSetLayoutCreateInfo descLayoutInfo = {}; - std::array< VkDescriptorSetLayoutBinding, 2> bindings = {}; - - bindings[0].binding = m_InputTensor.portBindingIndex; - bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; - bindings[0].descriptorCount = 1; - bindings[0].stageFlags = VK_SHADER_STAGE_ALL; - - bindings[1].binding = m_OutputTensor.portBindingIndex; - bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; - bindings[1].descriptorCount = 1; - bindings[1].stageFlags = VK_SHADER_STAGE_ALL; - - descLayoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - descLayoutInfo.pNext = NULL; - descLayoutInfo.flags = 0; - descLayoutInfo.bindingCount = static_cast(bindings.size()); - descLayoutInfo.pBindings = bindings.data(); - - if (vkCreateDescriptorSetLayout(vulkan.m_VulkanDevice, &descLayoutInfo, NULL, &m_TensorDescriptorSetLayout) != VK_SUCCESS) - { - return false; - } - - // TENSOR DESCRIPTOR SETS // - - LOGI("Creating Tensor Descriptor Sets"); - - // Allocate Descriptor Sets: - VkDescriptorSetAllocateInfo info = {}; - - info.sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - info.descriptorPool = m_TensorDescriptorPool; - info.descriptorSetCount = 1; - info.pSetLayouts = &m_TensorDescriptorSetLayout; - - if (vkAllocateDescriptorSets(vulkan.m_VulkanDevice, &info, &m_TensorDescriptorSet) != VK_SUCCESS) + if (!m_tensor_resources.Initialize( + vulkan.m_VulkanDevice, + vulkan.m_VulkanGpu, + vulkan.m_VulkanDataGraphProcessingEngine, + m_InputTensor, + m_OutputTensor, + m_QNNMaxPortIndex)) { return false; } - // UPDATE/BIND TENSOR DESCRIPTOR SETS // - - LOGI("Updating Tensor Descriptor Sets"); - - //Bind tensors to descriptor set - VkWriteDescriptorSet write[2]; - VkWriteDescriptorSetTensorARM tensorWrite[2]; - - tensorWrite[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM; - tensorWrite[0].pNext = NULL; - tensorWrite[0].tensorViewCount = 1; - tensorWrite[0].pTensorViews = &m_InputTensor.tensorView; - - tensorWrite[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM; - tensorWrite[1].pNext = NULL; - tensorWrite[1].tensorViewCount = 1; - tensorWrite[1].pTensorViews = &m_OutputTensor.tensorView; - - write[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - write[0].pNext = &tensorWrite[0]; - write[0].dstBinding = m_InputTensor.portBindingIndex; - write[0].descriptorCount = 1; - write[0].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; - write[0].dstSet = m_TensorDescriptorSet; - write[0].dstArrayElement = 0; - write[0].pBufferInfo = NULL; - write[0].pImageInfo = NULL; - write[0].pTexelBufferView = NULL; - - write[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - write[1].pNext = &tensorWrite[1]; - write[1].dstBinding = m_OutputTensor.portBindingIndex; - write[1].descriptorCount = 1; - write[1].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; - write[1].dstSet = m_TensorDescriptorSet; - write[1].dstArrayElement = 0; - write[1].pBufferInfo = NULL; - write[1].pImageInfo = NULL; - write[1].pTexelBufferView = NULL; - - vkUpdateDescriptorSets(vulkan.m_VulkanDevice, 2, write, 0, 
NULL); - - LOGI("Tensors Objects Created!"); - return true; } @@ -661,51 +364,47 @@ bool Application::CreateGraphPipeline() } } - LOGI("Creating Pipeline Cache from Model..."); - - VkPipelineCacheCreateInfo cacheInfo = - { - .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .initialDataSize = modelData.size(), - .pInitialData = modelData.data() - }; + LOGI("Validating model cache blob..."); - if (vkCreatePipelineCache(vulkan.m_VulkanDevice, &cacheInfo, nullptr, &m_GraphPipelineInstance.pipelineCache) != VK_SUCCESS) + uint32_t cache_version = 0; + if (!m_QCOM_data_graph_model.ValidateModelCacheBlob(modelData, cache_version)) { return false; } - LOGI("Creating Graph Pipeline Layout..."); + LOGI("QCOM data-graph cache validated. CacheVersion=%u", cache_version); - VkPipelineLayoutCreateInfo pipelineLayoutInfo = - { - .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .setLayoutCount = 1, - .pSetLayouts = &m_TensorDescriptorSetLayout, - .pushConstantRangeCount = 0, - .pPushConstantRanges = nullptr - }; + LOGI("Creating Pipeline Cache from Model..."); + + if(!m_data_graph_pipeline.CreatePipelineCacheFromBlob( + vulkan.m_VulkanDevice, + modelData, + m_GraphPipelineInstance.pipelineCache)) + { + return false; + } - if (vkCreatePipelineLayout(vulkan.m_VulkanDevice, &pipelineLayoutInfo, nullptr, &m_GraphPipelineInstance.pipelineLayout) != VK_SUCCESS) + LOGI("Creating Graph Pipeline Layout..."); + + if (!m_data_graph_pipeline.CreatePipelineLayout( + vulkan.m_VulkanDevice, + m_tensor_resources.GetResources().tensor_descriptor_set_layout, + m_GraphPipelineInstance.pipelineLayout)) { return false; } LOGI("Creating Graph Pipeline..."); - VkDataGraphPipelineResourceInfoARM resourceInfos[2]; + std::array< VkDataGraphPipelineResourceInfoARM, 2> resourceInfos; resourceInfos[0].sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM; resourceInfos[0].binding = m_QNNInputPortBinding; // Same as 
the input tensor - resourceInfos[0].pNext = &m_InputTensor.tensorDescription; + resourceInfos[0].pNext = &m_InputTensor.tensor_description; resourceInfos[1].sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM; resourceInfos[1].binding = m_QNNOutputPortBinding; // Same as the output tensor - resourceInfos[1].pNext = &m_OutputTensor.tensorDescription; + resourceInfos[1].pNext = &m_OutputTensor.tensor_description; //////////////////// // IMPORTANT NOTE // These values should be read from the file identifier!!! @@ -716,151 +415,40 @@ bool Application::CreateGraphPipeline() uint8_t qnnGraphId[32]; std::memcpy(qnnGraphId, &graphId, qnnGraphIdSize); - //////////////////// - //////////////////// - //////////////////// - - VkDataGraphProcessingEngineCreateInfoARM engineInfo = { VK_STRUCTURE_TYPE_DATA_GRAPH_PROCESSING_ENGINE_CREATE_INFO_ARM }; - engineInfo.processingEngineCount = 1; - engineInfo.pProcessingEngines = &vulkan.m_VulkanDataGraphProcessingEngine; - - VkDataGraphPipelineIdentifierCreateInfoARM identifier = { VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_IDENTIFIER_CREATE_INFO_ARM }; - identifier.pNext = &engineInfo; - identifier.identifierSize = qnnGraphIdSize; - identifier.pIdentifier = qnnGraphId; - - VkDataGraphPipelineShaderModuleCreateInfoARM moduleInfo = { VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM }; - moduleInfo.pNext = &identifier; - moduleInfo.module = VK_NULL_HANDLE; - moduleInfo.pName = ""; - - VkDataGraphPipelineCreateInfoARM pipelineInfo = - { - .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM, - .pNext = &moduleInfo, - .flags = 0, - .layout = m_GraphPipelineInstance.pipelineLayout, - .resourceInfoCount = 2, - .pResourceInfos = resourceInfos - }; - - if (vkCreateDataGraphPipelinesARM( - vulkan.m_VulkanDevice, - VK_NULL_HANDLE, - m_GraphPipelineInstance.pipelineCache, - 1, - &pipelineInfo, - nullptr, - &m_GraphPipelineInstance.graphPipeline) != VK_SUCCESS) + if 
(!m_data_graph_pipeline.CreateGraphPipelineArmIdentifierPath( + vulkan.m_VulkanDevice, + vulkan.m_VulkanDataGraphProcessingEngine, + m_GraphPipelineInstance.pipelineLayout, + m_GraphPipelineInstance.pipelineCache, + resourceInfos.data(), + static_cast(resourceInfos.size()), + qnnGraphId, + qnnGraphIdSize, + m_GraphPipelineInstance.graphPipeline)) { return false; } LOGI("Creating Graph Pipeline Session..."); - VkDataGraphPipelineSessionCreateInfoARM sessionInfo = - { - .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM, - .pNext = nullptr, - .flags = 0, - .dataGraphPipeline = m_GraphPipelineInstance.graphPipeline - }; - - if (vkCreateDataGraphPipelineSessionARM( - vulkan.m_VulkanDevice, - &sessionInfo, - nullptr, - &m_GraphPipelineInstance.graphSession) != VK_SUCCESS) + if (!m_data_graph_pipeline.CreateSession( + vulkan.m_VulkanDevice, + m_GraphPipelineInstance.graphPipeline, + m_GraphPipelineInstance.graphSession)) { return false; } - LOGI("Getting Graph Session Binding Points Requirements..."); - - VkDataGraphPipelineSessionBindPointRequirementsInfoARM bindReqsInfo = {}; - uint32_t bindReqsCount = 0; - - bindReqsInfo.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENTS_INFO_ARM; - bindReqsInfo.session = m_GraphPipelineInstance.graphSession; - - if (vkGetDataGraphPipelineSessionBindPointRequirementsARM(vulkan.m_VulkanDevice, &bindReqsInfo, &bindReqsCount, NULL) != VK_SUCCESS) - { - return false; - } - - std::vector bindReqs(bindReqsCount); - for (uint32_t i = 0; i < bindReqsCount; i++) - { - bindReqs[i].sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENT_ARM; - } + LOGI("Binding Graph Session Memory..."); - if (vkGetDataGraphPipelineSessionBindPointRequirementsARM(vulkan.m_VulkanDevice, &bindReqsInfo, &bindReqsCount, bindReqs.data()) != VK_SUCCESS) + if (!m_data_graph_pipeline.AllocateAndBindSessionMemory( + vulkan.m_VulkanDevice, + vulkan.m_VulkanGpu, + m_GraphPipelineInstance.graphSession, 
m_GraphPipelineInstance.sessionMemory)) { return false; } - LOGI("Binding Graph Session Memory..."); - - uint32_t memCount = 0; - for (uint32_t i = 0; i < bindReqsCount; i++) - { - m_GraphPipelineInstance.sessionMemory.resize(m_GraphPipelineInstance.sessionMemory.size() + bindReqs[i].numObjects); - switch (bindReqs[i].bindPointType) - { - case(VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM): - { - LOGI("*** Bind Point (VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM) with %d objects", bindReqs[i].numObjects); - for (uint32_t j = 0; j < bindReqs[i].numObjects; j++) - { - VkDataGraphPipelineSessionMemoryRequirementsInfoARM memReqsInfo = { VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_MEMORY_REQUIREMENTS_INFO_ARM }; - memReqsInfo.session = m_GraphPipelineInstance.graphSession; - memReqsInfo.bindPoint = bindReqs[i].bindPoint; - memReqsInfo.objectIndex = j; - - VkMemoryRequirements2 memReqs = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 }; - vkGetDataGraphPipelineSessionMemoryRequirementsARM(vulkan.m_VulkanDevice, &memReqsInfo, &memReqs); - - VkMemoryAllocateInfo info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; - info.pNext = nullptr; - info.allocationSize = memReqs.memoryRequirements.size; - // info.memoryTypeIndex = 0; // should query the indices to find most appropiate one - info.memoryTypeIndex = FindMemoryType(vulkan.m_VulkanGpu, memReqs.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); - - LOGI("*** Bind Object [%d]", j); - LOGI("*** Binding Point [%d]", bindReqs[i].bindPoint); - LOGI("*** Allocation Size [%d]", info.allocationSize); - LOGI("*** Memory Type Index [%d]", info.memoryTypeIndex); - - if (vkAllocateMemory(vulkan.m_VulkanDevice, &info, nullptr, &m_GraphPipelineInstance.sessionMemory[memCount]) != VK_SUCCESS) - { - return false; - } - - VkBindDataGraphPipelineSessionMemoryInfoARM bindMem; - bindMem.sType = VK_STRUCTURE_TYPE_BIND_DATA_GRAPH_PIPELINE_SESSION_MEMORY_INFO_ARM; - bindMem.session = 
m_GraphPipelineInstance.graphSession; - bindMem.bindPoint = bindReqs[i].bindPoint; - bindMem.objectIndex = j; - bindMem.memory = m_GraphPipelineInstance.sessionMemory[memCount]; - - if (vkBindDataGraphPipelineSessionMemoryARM(vulkan.m_VulkanDevice, 1, &bindMem) != VK_SUCCESS) - { - return false; - } - - memCount++; - } - - break; - } - default: - { - // Error unhandled / unexpected memory type - return false; - } - } - } - LOGI("Graph Pipeline Created!"); return true; @@ -871,68 +459,70 @@ void Application::CopyImageToTensor( CommandListVulkan& cmdList, const TextureVulkan& srcImage, VkImageLayout currentLayout, - const GraphPipelineTensor& tensorBinding) + const Ml::GraphPipelineTensor& tensorBinding) //----------------------------------------------------------------------------- { - VkImageMemoryBarrier2 imageBarrierToTransfer = - { - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, - .srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - .srcAccessMask = VK_ACCESS_2_MEMORY_READ_BIT, - .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, - .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT, - .oldLayout = currentLayout, - .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = srcImage.GetVkImage(), - .subresourceRange = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1 - } - }; - - VkDependencyInfo depInfo = - { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .imageMemoryBarrierCount = 1, - .pImageMemoryBarriers = &imageBarrierToTransfer - }; - - vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &depInfo); + // Transition image -> TRANSFER_SRC + VkImageMemoryBarrier2KHR toTransfer = {}; + toTransfer.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR; + toTransfer.srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR; + toTransfer.srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT_KHR | 
VK_ACCESS_2_MEMORY_READ_BIT_KHR; + toTransfer.dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR; + toTransfer.dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT_KHR; + toTransfer.oldLayout = currentLayout; + toTransfer.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + toTransfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + toTransfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + toTransfer.image = srcImage.GetVkImage(); + toTransfer.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + toTransfer.subresourceRange.baseMipLevel = 0; + toTransfer.subresourceRange.levelCount = 1; + toTransfer.subresourceRange.baseArrayLayer = 0; + toTransfer.subresourceRange.layerCount = 1; + + VkDependencyInfoKHR dep = {}; + dep.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR; + dep.imageMemoryBarrierCount = 1; + dep.pImageMemoryBarriers = &toTransfer; + + vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &dep); + + // Copy image -> buffer + VkBufferImageCopy region = {}; + region.bufferOffset = 0; + region.bufferRowLength = 0; + region.bufferImageHeight = 0; + region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + region.imageSubresource.mipLevel = 0; + region.imageSubresource.baseArrayLayer = 0; + region.imageSubresource.layerCount = 1; + region.imageOffset = { 0, 0, 0 }; + region.imageExtent = { srcImage.Width, srcImage.Height, 1 }; - VkBufferImageCopy copyRegion = - { - .bufferOffset = 0, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1 - }, - .imageOffset = {0, 0, 0}, - .imageExtent = {srcImage.Width, srcImage.Height, 1} - }; - vkCmdCopyImageToBuffer( - cmdList.m_VkCommandBuffer, - srcImage.GetVkImage(), + cmdList.m_VkCommandBuffer, + srcImage.GetVkImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, - tensorBinding.aliasedBuffer, - 1, - &copyRegion); - - // Transition image back to original layout - std::swap(imageBarrierToTransfer.oldLayout, 
imageBarrierToTransfer.newLayout); - std::swap(imageBarrierToTransfer.srcAccessMask, imageBarrierToTransfer.dstAccessMask); - std::swap(imageBarrierToTransfer.srcStageMask, imageBarrierToTransfer.dstStageMask); - - vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &depInfo); + tensorBinding.aliased_buffer, + 1, + &region); + + // Transition image back to original layout for future reads (typically SHADER_READ_ONLY) + VkImageMemoryBarrier2KHR fromTransfer = {}; + fromTransfer.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR; + fromTransfer.srcStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR; + fromTransfer.srcAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT_KHR; + fromTransfer.dstStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR; + fromTransfer.dstAccessMask = VK_ACCESS_2_MEMORY_READ_BIT_KHR | VK_ACCESS_2_MEMORY_WRITE_BIT_KHR; + fromTransfer.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + fromTransfer.newLayout = currentLayout; + fromTransfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + fromTransfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + fromTransfer.image = srcImage.GetVkImage(); + fromTransfer.subresourceRange = toTransfer.subresourceRange; + + dep.pImageMemoryBarriers = &fromTransfer; + vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &dep); } //----------------------------------------------------------------------------- @@ -940,70 +530,70 @@ void Application::CopyTensorToImage( CommandListVulkan& cmdList, const TextureVulkan& dstImage, VkImageLayout currentLayout, - const GraphPipelineTensor& tensorBinding) + const Ml::GraphPipelineTensor& tensorBinding) //----------------------------------------------------------------------------- { - const auto& synchronization2_extension = GetVulkan()->GetExtension(); - assert(synchronization2_extension != nullptr); - - VkImageMemoryBarrier2 imageBarrierToTransfer = - { - .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, - .srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - .srcAccessMask = 
VK_ACCESS_2_MEMORY_WRITE_BIT, - .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, - .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT, - .oldLayout = currentLayout, - .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .image = dstImage.GetVkImage(), - .subresourceRange = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1 - } - }; - - VkDependencyInfo depInfo = - { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .imageMemoryBarrierCount = 1, - .pImageMemoryBarriers = &imageBarrierToTransfer - }; - - vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &depInfo); - - VkBufferImageCopy copyRegion = - { - .bufferOffset = 0, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1 - }, - .imageOffset = {0, 0, 0}, - .imageExtent = {dstImage.Width, dstImage.Height, 1} - }; + // Transition image -> TRANSFER_DST + VkImageMemoryBarrier2KHR toTransfer = {}; + toTransfer.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR; + toTransfer.srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR; + toTransfer.srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT_KHR | VK_ACCESS_2_MEMORY_READ_BIT_KHR; + toTransfer.dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR; + toTransfer.dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT_KHR; + toTransfer.oldLayout = currentLayout; + toTransfer.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + toTransfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + toTransfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + toTransfer.image = dstImage.GetVkImage(); + toTransfer.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + toTransfer.subresourceRange.baseMipLevel = 0; + toTransfer.subresourceRange.levelCount = 1; + toTransfer.subresourceRange.baseArrayLayer = 0; + 
toTransfer.subresourceRange.layerCount = 1; + + VkDependencyInfoKHR dep = {}; + dep.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR; + dep.imageMemoryBarrierCount = 1; + dep.pImageMemoryBarriers = &toTransfer; + + vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &dep); + + // Copy buffer -> image + VkBufferImageCopy region = {}; + region.bufferOffset = 0; + region.bufferRowLength = 0; + region.bufferImageHeight = 0; + region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + region.imageSubresource.mipLevel = 0; + region.imageSubresource.baseArrayLayer = 0; + region.imageSubresource.layerCount = 1; + region.imageOffset = { 0, 0, 0 }; + region.imageExtent = { dstImage.Width, dstImage.Height, 1 }; vkCmdCopyBufferToImage( - cmdList.m_VkCommandBuffer, - tensorBinding.aliasedBuffer, - dstImage.GetVkImage(), + cmdList.m_VkCommandBuffer, + tensorBinding.aliased_buffer, + dstImage.GetVkImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - 1, - &copyRegion); - - std::swap(imageBarrierToTransfer.oldLayout, imageBarrierToTransfer.newLayout); - std::swap(imageBarrierToTransfer.srcAccessMask, imageBarrierToTransfer.dstAccessMask); - std::swap(imageBarrierToTransfer.srcStageMask, imageBarrierToTransfer.dstStageMask); - - vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &depInfo); + 1, + &region); + + // Transition image back (shader read for blit sampling) + VkImageMemoryBarrier2KHR fromTransfer = {}; + fromTransfer.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR; + fromTransfer.srcStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR; + fromTransfer.srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT_KHR; + fromTransfer.dstStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR; + fromTransfer.dstAccessMask = VK_ACCESS_2_MEMORY_READ_BIT_KHR | VK_ACCESS_2_MEMORY_WRITE_BIT_KHR; + fromTransfer.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + fromTransfer.newLayout = currentLayout; + fromTransfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + fromTransfer.dstQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; + fromTransfer.image = dstImage.GetVkImage(); + fromTransfer.subresourceRange = toTransfer.subresourceRange; + + dep.pImageMemoryBarriers = &fromTransfer; + vkCmdPipelineBarrier2KHR(cmdList.m_VkCommandBuffer, &dep); } //----------------------------------------------------------------------------- @@ -1205,6 +795,11 @@ bool Application::InitUniforms() return false; } + if (!CreateUniformBuffer(pVulkan, m_BlitFragUniform)) + { + return false; + } + return true; } @@ -1214,7 +809,7 @@ bool Application::InitAllRenderPasses() { Vulkan& vulkan = *GetVulkan(); - // ColorInputUsage | ClearDepthRenderPass | ColorOutputUsage | DepthOutputUsage | ClearColor + // ColorInputUsage | ClearDepthRenderPass | ColorOutputUsage | DepthOutputUsage | ClearColor m_RenderPassData[RP_SCENE].RenderPassSetup = { RenderPassInputUsage::Clear, true, RenderPassOutputUsage::StoreReadOnly, RenderPassOutputUsage::Store, {}}; m_RenderPassData[RP_HUD].RenderPassSetup = { RenderPassInputUsage::Clear, false, RenderPassOutputUsage::StoreReadOnly, RenderPassOutputUsage::Discard, {}}; m_RenderPassData[RP_BLIT].RenderPassSetup = { RenderPassInputUsage::DontCare, true, RenderPassOutputUsage::Present, RenderPassOutputUsage::Discard, {}}; @@ -1224,7 +819,7 @@ bool Application::InitAllRenderPasses() auto swapChainDepthFormat = vulkan.m_SwapchainDepth.format; LOGI("******************************"); - LOGI("Initializing Render Passes... "); + LOGI("Initializing Render Passes %d - %d... 
", static_cast(swapChainColorFormat[0]), static_cast(vulkan.m_SurfaceColorSpace)); LOGI("******************************"); for (uint32_t whichPass = 0; whichPass < RP_BLIT; whichPass++) @@ -1403,7 +998,6 @@ bool Application::LoadMeshObjects() return shaderMaterial; }; - const auto loaderFlags = 0; // No instancing const bool ignoreTransforms = (loaderFlags & DrawableLoader::LoaderFlags::IgnoreHierarchy) != 0; @@ -1421,9 +1015,8 @@ bool Application::LoadMeshObjects() m_SceneDrawables, loaderFlags)) { - LOGE("Error Loading the museum gltf file"); - LOGI("Please verify if you have all required assets on the sample media folder"); - LOGI("If you are running on Android, don't forget to run the `02_CopyMediaToDevice.bat` script to copy all media files into the device memory"); + LOGE("Error Loading the gltf file"); + LOGI("Please verify if you have all required assets on the media folder"); return false; } @@ -1433,7 +1026,6 @@ bool Application::LoadMeshObjects() m_Camera.SetPosition(camera.Position, camera.Orientation); } - LOGI("*********************"); LOGI("Creating Quad mesh..."); LOGI("*********************"); @@ -1457,6 +1049,10 @@ bool Application::LoadMeshObjects() }, [this](const std::string& bufferName) -> PerFrameBufferVulkan { + if (bufferName == "Params") + { + return { m_BlitFragUniform.buf.GetVkBuffer() }; + } return {}; } ); @@ -1664,35 +1260,37 @@ void Application::UpdateGui() ImGui::Checkbox("Upscaling Enabled", &m_ShouldUpscale); ImGui::EndDisabled(); - ImGui::Separator(); - - ImGui::DragFloat3("Sun Dir", &m_LightUniformData.LightDirection.x, 0.01f, -1.0f, 1.0f); - ImGui::DragFloat3("Sun Color", &m_LightUniformData.LightColor.x, 0.01f, 0.0f, 1.0f); - ImGui::DragFloat("Sun Intensity", &m_LightUniformData.LightColor.w, 0.1f, 0.0f, 100.0f); - ImGui::DragFloat3("Ambient Color", &m_LightUniformData.AmbientColor.x, 0.01f, 0.0f, 1.0f); - - for (int i = 0; i < NUM_SPOT_LIGHTS; i++) + if (ImGui::CollapsingHeader("Sun Light", ImGuiTreeNodeFlags_Framed)) { - 
std::string childName = std::string("Spot Light ").append(std::to_string(i+1)); - ImGui::TextColored(ImVec4(1, 1, 0, 1), "%s", childName.c_str()); + ImGui::DragFloat3("Sun Dir", &m_LightUniformData.LightDirection.x, 0.01f, -1.0f, 1.0f); + ImGui::DragFloat3("Sun Color", &m_LightUniformData.LightColor.x, 0.01f, 0.0f, 1.0f); + ImGui::DragFloat("Sun Intensity", &m_LightUniformData.LightColor.w, 0.1f, 0.0f, 100.0f); + ImGui::DragFloat3("Ambient Color", &m_LightUniformData.AmbientColor.x, 0.01f, 0.0f, 1.0f); + } - if (ImGui::CollapsingHeader(childName.c_str(), ImGuiTreeNodeFlags_DefaultOpen | ImGuiTreeNodeFlags_Framed)) + if (ImGui::CollapsingHeader("Spot Lights", ImGuiTreeNodeFlags_Framed)) + { + for (int i = 0; i < NUM_SPOT_LIGHTS; i++) { - ImGui::PushID(i); + std::string childName = std::string("Spot Light ").append(std::to_string(i + 1)); + ImGui::TextColored(ImVec4(1, 1, 0, 1), "%s", childName.c_str()); - ImGui::DragFloat3("Pos", &m_LightUniformData.SpotLights_pos[i].x, 0.1f); - ImGui::DragFloat3("Dir", &m_LightUniformData.SpotLights_dir[i].x, 0.01f, -1.0f, 1.0f); - ImGui::DragFloat3("Color", &m_LightUniformData.SpotLights_color[i].x, 0.01f, 0.0f, 1.0f); - ImGui::DragFloat("Intensity", &m_LightUniformData.SpotLights_color[i].w, 0.1f, 0.0f, 100.0f); + if (ImGui::CollapsingHeader(childName.c_str(), ImGuiTreeNodeFlags_DefaultOpen | ImGuiTreeNodeFlags_Framed)) + { + ImGui::PushID(i); - ImGui::PopID(); - } + ImGui::DragFloat3("Pos", &m_LightUniformData.SpotLights_pos[i].x, 0.1f); + ImGui::DragFloat3("Dir", &m_LightUniformData.SpotLights_dir[i].x, 0.01f, -1.0f, 1.0f); + ImGui::DragFloat3("Color", &m_LightUniformData.SpotLights_color[i].x, 0.01f, 0.0f, 1.0f); + ImGui::DragFloat("Intensity", &m_LightUniformData.SpotLights_color[i].w, 0.1f, 0.0f, 100.0f); - ImDrawList* list = ImGui::GetWindowDrawList(); + ImGui::PopID(); + } - glm::vec3 LightDirNotNormalized = m_LightUniformData.SpotLights_dir[i]; - LightDirNotNormalized = glm::normalize(LightDirNotNormalized); - 
m_LightUniformData.SpotLights_dir[i] = glm::vec4(LightDirNotNormalized, 0.0f); + glm::vec3 LightDirNotNormalized = m_LightUniformData.SpotLights_dir[i]; + LightDirNotNormalized = glm::normalize(LightDirNotNormalized); + m_LightUniformData.SpotLights_dir[i] = glm::vec4(LightDirNotNormalized, 0.0f); + } } glm::vec3 LightDirNotNormalized = m_LightUniformData.LightDirection; @@ -1744,6 +1342,12 @@ bool Application::UpdateUniforms(uint32_t whichBuffer) UpdateUniformBuffer(pVulkan, m_LightUniform, m_LightUniformData); } + // Blit data + { + m_BlitFragUniformData.IsUpscalingActive = m_ShouldUpscale; + UpdateUniformBuffer(pVulkan, m_BlitFragUniform, m_BlitFragUniformData); + } + return true; } @@ -1806,7 +1410,7 @@ void Application::Render(float fltDiffTime) // Submit the commands to the queue. SubmitRenderPass(whichBuffer, RP_SCENE, pWaitSemaphores, waitDstStageMasks, { &m_RenderPassData[RP_SCENE].PassCompleteSemaphore,1 }); pWaitSemaphores = { &m_RenderPassData[RP_SCENE].PassCompleteSemaphore, 1 }; - waitDstStageMasks[0] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT }; + waitDstStageMasks[0] = { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT }; } // Data Graph preparation + dispatch for Upscaling @@ -1814,32 +1418,26 @@ void Application::Render(float fltDiffTime) { m_GraphPipelineCommandLists[whichBuffer].Begin(); - vkCmdBindPipeline( + m_data_graph_dispatch.RecordDispatch( m_GraphPipelineCommandLists[whichBuffer].m_VkCommandBuffer, - VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, - m_GraphPipelineInstance.graphPipeline); - - vkCmdBindDescriptorSets( - m_GraphPipelineCommandLists[whichBuffer].m_VkCommandBuffer, - VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, + m_GraphPipelineInstance.graphPipeline, m_GraphPipelineInstance.pipelineLayout, - 0, - 1, - &m_TensorDescriptorSet, - 0, - NULL); + m_tensor_resources.GetResources().tensor_descriptor_set, + m_GraphPipelineInstance.graphSession); - VkDataGraphPipelineDispatchInfoARM dispatchInfo; - dispatchInfo.sType = 
VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_DISPATCH_INFO_ARM; - dispatchInfo.flags = 0; + m_GraphPipelineCommandLists[whichBuffer].End(); - vkCmdDispatchDataGraphARM( - m_GraphPipelineCommandLists[whichBuffer].m_VkCommandBuffer, - m_GraphPipelineInstance.graphSession, - &dispatchInfo); + if (!m_data_graph_dispatch.Submit( + pVulkan->m_VulkanQueues[m_GraphPipelineCommandLists[whichBuffer].m_QueueIndex].Queue, + m_GraphPipelineCommandLists[whichBuffer].m_VkCommandBuffer, + pWaitSemaphores[0], + waitDstStageMasks[0], + m_GraphPipelinePassCompleteSemaphore, + VK_NULL_HANDLE)) + { + LOGE("Data Graph dispatch failed"); + } - m_GraphPipelineCommandLists[whichBuffer].End(); - m_GraphPipelineCommandLists[whichBuffer].QueueSubmit(pWaitSemaphores[0], waitDstStageMasks[0], m_GraphPipelinePassCompleteSemaphore); pWaitSemaphores = { &m_GraphPipelinePassCompleteSemaphore, 1 }; waitDstStageMasks[0] = { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT }; // Should be VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, but need to update framework // to support VK_PIPELINE_STAGE_2. diff --git a/samples/graph_pipelines/code/main/application.hpp b/samples/graph_pipelines/code/main/application.hpp index 9d769a1..6306ec4 100644 --- a/samples/graph_pipelines/code/main/application.hpp +++ b/samples/graph_pipelines/code/main/application.hpp @@ -1,7 +1,7 @@ //============================================================================================================ // // -// Copyright (c) 2025, Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
// SPDX-License-Identifier: BSD-3-Clause // //============================================================================================================ @@ -12,6 +12,12 @@ #include "vulkan/commandBuffer.hpp" #include +#include "ml/GraphPipelineTypes.hpp" +#include "ml/DataGraphPipeline.hpp" +#include "ml/GraphDispatch.hpp" +#include "ml/TensorResources.hpp" +#include "ml/QcomDataGraphModel.hpp" + #define NUM_SPOT_LIGHTS 4 enum RENDER_PASS @@ -38,6 +44,11 @@ struct ObjectFragUB glm::vec4 ORM; }; +struct BlitFragUB +{ + bool IsUpscalingActive; +}; + struct LightUB { glm::mat4 ProjectionInv; @@ -59,18 +70,6 @@ struct LightUB int Height; }; -struct GraphPipelineTensor -{ - std::array dimensions; - std::array strides; - VkBuffer aliasedBuffer = VK_NULL_HANDLE; - VkTensorDescriptionARM tensorDescription; - VkTensorARM tensor = VK_NULL_HANDLE; - VkTensorViewARM tensorView = VK_NULL_HANDLE; - VkDeviceMemory tensorMemory = VK_NULL_HANDLE; - uint32_t portBindingIndex = -1; -}; - struct GraphPipelineInstance { VkPipelineLayout pipelineLayout = VK_NULL_HANDLE; @@ -189,13 +188,13 @@ class Application : public ApplicationHelperBase CommandListVulkan& cmdList, const TextureVulkan& srcImage, VkImageLayout currentLayout, - const GraphPipelineTensor& tensorBinding); + const Ml::GraphPipelineTensor& tensorBinding); void CopyTensorToImage( CommandListVulkan& cmdList, const TextureVulkan& dstImage, VkImageLayout currentLayout, - const GraphPipelineTensor& tensorBinding); + const Ml::GraphPipelineTensor& tensorBinding); void CopyImageToImageBlit( CommandListVulkan& cmdList, @@ -214,6 +213,8 @@ class Application : public ApplicationHelperBase ObjectVertUB m_ObjectVertUniformData; UniformT m_LightUniform; LightUB m_LightUniformData; + UniformT m_BlitFragUniform; + BlitFragUB m_BlitFragUniformData; std::unordered_map m_ObjectFragUniforms; // Drawables @@ -231,13 +232,16 @@ class Application : public ApplicationHelperBase glm::ivec2 m_RenderResolution; glm::ivec2 m_UpscaledResolution; + 
// ML types + Ml::DataGraphPipeline m_data_graph_pipeline; + Ml::GraphDispatch m_data_graph_dispatch; + Ml::TensorResources m_tensor_resources; + Ml::QcomDataGraphModel m_QCOM_data_graph_model; + // Graph Pipelines bool m_IsGraphPipelinesSupported = false; // Enables/disable the whole graph pipeline functionality - VkDescriptorPool m_TensorDescriptorPool = VK_NULL_HANDLE; - VkDescriptorSetLayout m_TensorDescriptorSetLayout = VK_NULL_HANDLE; - VkDescriptorSet m_TensorDescriptorSet = VK_NULL_HANDLE; - GraphPipelineTensor m_InputTensor; - GraphPipelineTensor m_OutputTensor; + Ml::GraphPipelineTensor m_InputTensor; + Ml::GraphPipelineTensor m_OutputTensor; GraphPipelineInstance m_GraphPipelineInstance; std::vector< CommandListVulkan> m_GraphPipelineCommandLists; // Cmd buffer allocated from the Data Graph queue VkSemaphore m_GraphPipelinePassCompleteSemaphore = VK_NULL_HANDLE; diff --git a/samples/graph_pipelines/code/main/ml/DataGraphPipeline.cpp b/samples/graph_pipelines/code/main/ml/DataGraphPipeline.cpp new file mode 100644 index 0000000..baa72e6 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/DataGraphPipeline.cpp @@ -0,0 +1,230 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ + +#include "DataGraphPipeline.hpp" + +#include + +bool Ml::DataGraphPipeline::FindMemoryType( + VkPhysicalDevice physical_device, + uint32_t type_bits, + VkMemoryPropertyFlags properties, + uint32_t& out_index) +{ + VkPhysicalDeviceMemoryProperties mem_properties = {}; + vkGetPhysicalDeviceMemoryProperties(physical_device, &mem_properties); + + for (uint32_t i = 0; i < mem_properties.memoryTypeCount; ++i) + { + const bool has_properties = (mem_properties.memoryTypes[i].propertyFlags & properties) == properties; + const bool has_bit = (type_bits & (1u << i)) != 0; + + if (has_properties && has_bit) + { + out_index = i; + return true; + } + } + + return false; +} + +bool Ml::DataGraphPipeline::CreatePipelineCacheFromBlob( + VkDevice device, + const std::vector& model_data, + VkPipelineCache& out_cache) +{ + out_cache = VK_NULL_HANDLE; + + VkPipelineCacheCreateInfo cache_info = {}; + cache_info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + cache_info.pNext = nullptr; + cache_info.flags = 0; + cache_info.initialDataSize = model_data.size(); + cache_info.pInitialData = model_data.data(); + + return vkCreatePipelineCache(device, &cache_info, nullptr, &out_cache) == VK_SUCCESS; +} + +bool Ml::DataGraphPipeline::CreatePipelineLayout( + VkDevice device, + VkDescriptorSetLayout tensor_set_layout, + VkPipelineLayout& out_pipeline_layout) +{ + out_pipeline_layout = VK_NULL_HANDLE; + + VkPipelineLayoutCreateInfo layout_info = {}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.pNext = nullptr; + layout_info.flags = 0; + layout_info.setLayoutCount = 1; + layout_info.pSetLayouts = &tensor_set_layout; + layout_info.pushConstantRangeCount = 0; + layout_info.pPushConstantRanges = nullptr; + + return vkCreatePipelineLayout(device, &layout_info, nullptr, &out_pipeline_layout) == 
VK_SUCCESS; +} + +bool Ml::DataGraphPipeline::CreateGraphPipelineArmIdentifierPath( + VkDevice device, + VkPhysicalDeviceDataGraphProcessingEngineARM& data_graph_engine, + VkPipelineLayout pipeline_layout, + VkPipelineCache pipeline_cache, + const VkDataGraphPipelineResourceInfoARM* resource_infos, + uint32_t resource_info_count, + const uint8_t* identifier_bytes, + uint32_t identifier_size, + VkPipeline& out_pipeline) +{ + out_pipeline = VK_NULL_HANDLE; + + VkDataGraphProcessingEngineCreateInfoARM engine_info = {}; + engine_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PROCESSING_ENGINE_CREATE_INFO_ARM; + engine_info.pNext = nullptr; + engine_info.processingEngineCount = 1; + engine_info.pProcessingEngines = &data_graph_engine; + + VkDataGraphPipelineIdentifierCreateInfoARM identifier_info = {}; + identifier_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_IDENTIFIER_CREATE_INFO_ARM; + identifier_info.pNext = &engine_info; + identifier_info.identifierSize = identifier_size; + identifier_info.pIdentifier = identifier_bytes; + + VkDataGraphPipelineShaderModuleCreateInfoARM module_info = {}; + module_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM; + module_info.pNext = &identifier_info; + module_info.module = VK_NULL_HANDLE; + module_info.pName = ""; + + VkDataGraphPipelineCreateInfoARM pipeline_info = {}; + pipeline_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM; + pipeline_info.pNext = &module_info; + pipeline_info.flags = 0; + pipeline_info.layout = pipeline_layout; + pipeline_info.resourceInfoCount = resource_info_count; + pipeline_info.pResourceInfos = resource_infos; + + return vkCreateDataGraphPipelinesARM( + device, + VK_NULL_HANDLE, + pipeline_cache, + 1, + &pipeline_info, + nullptr, + &out_pipeline) == VK_SUCCESS; +} + +bool Ml::DataGraphPipeline::CreateSession( + VkDevice device, + VkPipeline pipeline, + VkDataGraphPipelineSessionARM& out_session) +{ + out_session = VK_NULL_HANDLE; + + 
VkDataGraphPipelineSessionCreateInfoARM session_info = {}; + session_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM; + session_info.pNext = nullptr; + session_info.flags = 0; + session_info.dataGraphPipeline = pipeline; + + return vkCreateDataGraphPipelineSessionARM(device, &session_info, nullptr, &out_session) == VK_SUCCESS; +} + +bool Ml::DataGraphPipeline::AllocateAndBindSessionMemory( + VkDevice device, + VkPhysicalDevice physical_device, + VkDataGraphPipelineSessionARM session, + std::vector& out_session_mem) +{ + out_session_mem.clear(); + + VkDataGraphPipelineSessionBindPointRequirementsInfoARM bind_reqs_info = {}; + bind_reqs_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENTS_INFO_ARM; + bind_reqs_info.pNext = nullptr; + bind_reqs_info.session = session; + + uint32_t bind_reqs_count = 0; + if (vkGetDataGraphPipelineSessionBindPointRequirementsARM(device, &bind_reqs_info, &bind_reqs_count, nullptr) != VK_SUCCESS) + { + return false; + } + + std::vector bind_reqs(bind_reqs_count); + for (uint32_t i = 0; i < bind_reqs_count; ++i) + { + bind_reqs[i].sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENT_ARM; + bind_reqs[i].pNext = nullptr; + } + + if (vkGetDataGraphPipelineSessionBindPointRequirementsARM(device, &bind_reqs_info, &bind_reqs_count, bind_reqs.data()) != VK_SUCCESS) + { + return false; + } + + uint32_t mem_count = 0; + for (uint32_t i = 0; i < bind_reqs_count; ++i) + { + if (bind_reqs[i].bindPointType != VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM) + { + return false; + } + + const uint32_t old_size = static_cast(out_session_mem.size()); + out_session_mem.resize(old_size + bind_reqs[i].numObjects); + + for (uint32_t j = 0; j < bind_reqs[i].numObjects; ++j) + { + VkDataGraphPipelineSessionMemoryRequirementsInfoARM mem_reqs_info = {}; + mem_reqs_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_MEMORY_REQUIREMENTS_INFO_ARM; + mem_reqs_info.pNext = 
nullptr; + mem_reqs_info.session = session; + mem_reqs_info.bindPoint = bind_reqs[i].bindPoint; + mem_reqs_info.objectIndex = j; + + VkMemoryRequirements2 mem_reqs = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 }; + vkGetDataGraphPipelineSessionMemoryRequirementsARM(device, &mem_reqs_info, &mem_reqs); + + uint32_t memory_type_index = 0; + if (!FindMemoryType(physical_device, mem_reqs.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, memory_type_index)) + { + return false; + } + + VkMemoryAllocateInfo alloc_info = {}; + alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + alloc_info.pNext = nullptr; + alloc_info.allocationSize = mem_reqs.memoryRequirements.size; + alloc_info.memoryTypeIndex = memory_type_index; + + VkDeviceMemory memory = VK_NULL_HANDLE; + if (vkAllocateMemory(device, &alloc_info, nullptr, &memory) != VK_SUCCESS) + { + return false; + } + + VkBindDataGraphPipelineSessionMemoryInfoARM bind_info = {}; + bind_info.sType = VK_STRUCTURE_TYPE_BIND_DATA_GRAPH_PIPELINE_SESSION_MEMORY_INFO_ARM; + bind_info.pNext = nullptr; + bind_info.session = session; + bind_info.bindPoint = bind_reqs[i].bindPoint; + bind_info.objectIndex = j; + bind_info.memory = memory; + + if (vkBindDataGraphPipelineSessionMemoryARM(device, 1, &bind_info) != VK_SUCCESS) + { + vkFreeMemory(device, memory, nullptr); + return false; + } + + out_session_mem[old_size + mem_count] = memory; + ++mem_count; + } + } + + return true; +} \ No newline at end of file diff --git a/samples/graph_pipelines/code/main/ml/DataGraphPipeline.hpp b/samples/graph_pipelines/code/main/ml/DataGraphPipeline.hpp new file mode 100644 index 0000000..8aee9c1 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/DataGraphPipeline.hpp @@ -0,0 +1,69 @@ + +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include +#include + +#include "main/applicationHelperBase.hpp" +#include "GraphPipelineTypes.hpp" + +namespace Ml +{ + class DataGraphPipeline + { + public: + DataGraphPipeline() = default; + ~DataGraphPipeline() = default; + + DataGraphPipeline(const DataGraphPipeline&) = delete; + DataGraphPipeline& operator=(const DataGraphPipeline&) = delete; + + static bool CreatePipelineCacheFromBlob( + VkDevice device, + const std::vector& model_data, + VkPipelineCache& out_cache); + + static bool CreatePipelineLayout( + VkDevice device, + VkDescriptorSetLayout tensor_set_layout, + VkPipelineLayout& out_pipeline_layout); + + static bool CreateGraphPipelineArmIdentifierPath( + VkDevice device, + VkPhysicalDeviceDataGraphProcessingEngineARM& data_graph_engine, + VkPipelineLayout pipeline_layout, + VkPipelineCache pipeline_cache, + const VkDataGraphPipelineResourceInfoARM* resource_infos, + uint32_t resource_info_count, + const uint8_t* identifier_bytes, + uint32_t identifier_size, + VkPipeline& out_pipeline); + + static bool CreateSession( + VkDevice device, + VkPipeline pipeline, + VkDataGraphPipelineSessionARM& out_session); + + static bool AllocateAndBindSessionMemory( + VkDevice device, + VkPhysicalDevice physical_device, + VkDataGraphPipelineSessionARM session, + std::vector& out_session_mem); + + inline bool IsValid() const { return true; } + + private: + static bool FindMemoryType( + VkPhysicalDevice physical_device, + uint32_t type_bits, + VkMemoryPropertyFlags properties, + uint32_t& out_index); + }; +} // namespace Ml \ No newline at end of file diff --git a/samples/graph_pipelines/code/main/ml/GraphDispatch.cpp b/samples/graph_pipelines/code/main/ml/GraphDispatch.cpp new file mode 100644 index 0000000..54e00d7 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/GraphDispatch.cpp @@ -0,0 
+1,67 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#include "GraphDispatch.hpp" + +void Ml::GraphDispatch::RecordDispatch( + VkCommandBuffer cmd_buffer, + VkPipeline pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + VkDataGraphPipelineSessionARM session) +{ + vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, pipeline); + + vkCmdBindDescriptorSets( + cmd_buffer, + VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, + pipeline_layout, + 0, + 1, + &descriptor_set, + 0, + nullptr); + + VkDataGraphPipelineDispatchInfoARM dispatch_info = {}; + dispatch_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_DISPATCH_INFO_ARM; + dispatch_info.pNext = nullptr; + dispatch_info.flags = 0; + + vkCmdDispatchDataGraphARM(cmd_buffer, session, &dispatch_info); +} + +bool Ml::GraphDispatch::Submit( + VkQueue queue, + VkCommandBuffer cmd_buffer, + VkSemaphore wait_semaphore, + VkPipelineStageFlags wait_stage_mask, + VkSemaphore signal_semaphore, + VkFence fence) +{ + VkSubmitInfo submit_info = {}; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.pNext = nullptr; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &cmd_buffer; + + VkPipelineStageFlags stage_mask = wait_stage_mask; + + if (wait_semaphore != VK_NULL_HANDLE) + { + submit_info.waitSemaphoreCount = 1; + submit_info.pWaitSemaphores = &wait_semaphore; + submit_info.pWaitDstStageMask = &stage_mask; + } + + if (signal_semaphore != VK_NULL_HANDLE) + { + submit_info.signalSemaphoreCount = 1; + submit_info.pSignalSemaphores = &signal_semaphore; + } + + return vkQueueSubmit(queue, 1, &submit_info, fence) == VK_SUCCESS; +} \ No newline at end of file 
diff --git a/samples/graph_pipelines/code/main/ml/GraphDispatch.hpp b/samples/graph_pipelines/code/main/ml/GraphDispatch.hpp new file mode 100644 index 0000000..fc176ba --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/GraphDispatch.hpp @@ -0,0 +1,37 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include "main/applicationHelperBase.hpp" + +namespace Ml +{ + class GraphDispatch + { + public: + GraphDispatch() = default; + ~GraphDispatch() = default; + + static void RecordDispatch( + VkCommandBuffer cmd_buffer, + VkPipeline pipeline, + VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, + VkDataGraphPipelineSessionARM session); + + static bool Submit( + VkQueue queue, + VkCommandBuffer cmd_buffer, + VkSemaphore wait_semaphore, + VkPipelineStageFlags wait_stage_mask, + VkSemaphore signal_semaphore, + VkFence fence); + + inline bool IsValid() const { return true; } + }; +} // namespace Ml diff --git a/samples/graph_pipelines/code/main/ml/GraphPipelineTypes.hpp b/samples/graph_pipelines/code/main/ml/GraphPipelineTypes.hpp new file mode 100644 index 0000000..cf41908 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/GraphPipelineTypes.hpp @@ -0,0 +1,48 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include +#include +#include +#include "main/applicationHelperBase.hpp" + +namespace Ml +{ + struct GraphPipelineTensor + { + std::vector strides; + std::vector dimensions; + uint32_t port_binding_index = 0; + + VkTensorDescriptionARM tensor_description = {}; + VkTensorARM tensor = VK_NULL_HANDLE; + VkTensorViewARM tensor_view = VK_NULL_HANDLE; + + VkBuffer aliased_buffer = VK_NULL_HANDLE; + VkDeviceMemory tensor_memory = VK_NULL_HANDLE; + }; + + struct GraphPipelineResources + { + VkDescriptorPool tensor_descriptor_pool = VK_NULL_HANDLE; + VkDescriptorSetLayout tensor_descriptor_set_layout = VK_NULL_HANDLE; + VkDescriptorSet tensor_descriptor_set = VK_NULL_HANDLE; + }; + + struct DataGraphPipelineInstance + { + VkPipelineCache pipeline_cache = VK_NULL_HANDLE; + VkPipelineLayout pipeline_layout = VK_NULL_HANDLE; + + VkPipeline graph_pipeline = VK_NULL_HANDLE; + VkDataGraphPipelineSessionARM graph_session = VK_NULL_HANDLE; + + std::vector session_memory; + }; +} // namespace Ml \ No newline at end of file diff --git a/samples/graph_pipelines/code/main/ml/QcomDataGraphModel.cpp b/samples/graph_pipelines/code/main/ml/QcomDataGraphModel.cpp new file mode 100644 index 0000000..1921a64 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/QcomDataGraphModel.cpp @@ -0,0 +1,195 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#include "QcomDataGraphModel.hpp" +#include +#include + +bool Ml::QcomDataGraphModel::IsExtensionSupported(VkPhysicalDevice physical_device) +{ + uint32_t extension_count = 0; + if (vkEnumerateDeviceExtensionProperties(physical_device, nullptr, &extension_count, nullptr) != VK_SUCCESS) + { + return false; + } + + std::vector props(extension_count); + if (vkEnumerateDeviceExtensionProperties(physical_device, nullptr, &extension_count, props.data()) != VK_SUCCESS) + { + return false; + } + + for (const auto& p : props) + { + if (std::strcmp(p.extensionName, VK_QCOM_DATA_GRAPH_MODEL_EXTENSION_NAME) == 0) + { + return true; + } + } + + return false; +} + +bool Ml::QcomDataGraphModel::QueryFeatures( + VkPhysicalDevice physical_device, + VkPhysicalDeviceDataGraphModelFeaturesQCOM& out_features) +{ + out_features = {}; + out_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DATA_GRAPH_MODEL_FEATURES_QCOM; + out_features.pNext = nullptr; + + VkPhysicalDeviceFeatures2 features2 = {}; + features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + features2.pNext = &out_features; + + vkGetPhysicalDeviceFeatures2(physical_device, &features2); // fills out_features + return true; +} + +bool Ml::QcomDataGraphModel::ValidateModelCacheBlob( + const std::vector& model_blob, + uint32_t& out_cache_version) +{ + out_cache_version = 0; + + // Header is defined as VkPipelineCacheHeaderVersionDataGraphQCOM and intended size is 28 bytes. + // It includes headerSize, headerVersion, cacheType, cacheVersion, toolchainVersion[]. 
+ // https://docs.vulkan.org/refpages/latest/refpages/source/VkPipelineCacheHeaderVersionDataGraphQCOM.html + if (model_blob.size() < 28) + { + return false; + } + + const unsigned char* p = model_blob.data(); + + const uint32_t header_size = ReadU32LE(p + 0); + const uint32_t header_version = ReadU32LE(p + 4); + const uint32_t cache_type = ReadU32LE(p + 8); + const uint32_t cache_version = ReadU32LE(p + 12); + + // Verify header identifies a QCOM data-graph model cache. + // headerVersion must be VK_PIPELINE_CACHE_HEADER_VERSION_DATA_GRAPH_QCOM. + // https://docs.vulkan.org/refpages/latest/refpages/source/VkPipelineCacheHeaderVersion.html + // https://docs.vulkan.org/refpages/latest/refpages/source/VkPipelineCacheHeaderVersionDataGraphQCOM.html + if (header_version != uint32_t(VK_PIPELINE_CACHE_HEADER_VERSION_DATA_GRAPH_QCOM)) + { + return false; + } + + // cacheType must match VkDataGraphModelCacheTypeQCOM. Generic binary is currently the defined value. + // https://docs.vulkan.org/refpages/latest/refpages/source/VkDataGraphModelCacheTypeQCOM.html + // https://docs.vulkan.org/refpages/latest/refpages/source/VkPipelineCacheHeaderVersionDataGraphQCOM.html + if (cache_type != uint32_t(VK_DATA_GRAPH_MODEL_CACHE_TYPE_GENERIC_BINARY_QCOM)) + { + return false; + } + + // headerSize should be 28 per intended layout, but we accept >= 28 to be tolerant. 
+ // https://docs.vulkan.org/refpages/latest/refpages/source/VkPipelineCacheHeaderVersionDataGraphQCOM.html + if (header_size < 28) + { + return false; + } + + out_cache_version = cache_version; + return true; +} + +bool Ml::QcomDataGraphModel::SelectQcomEngineAndOperationForQueueFamily( + VkPhysicalDevice physical_device, + uint32_t queue_family_index, + QCOM_MODEL_OPERATION preferred_operation, + QcomSelectedDataGraphSupport& out_selection) +{ + out_selection = {}; + + uint32_t count = 0; + VkResult r = vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM( + physical_device, + queue_family_index, + &count, + nullptr); + + if (r != VK_SUCCESS || count == 0) + { + return false; + } + + std::vector props(count); + for (auto& p : props) + { + p.sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_DATA_GRAPH_PROPERTIES_ARM; + p.pNext = nullptr; + } + + r = vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM( + physical_device, + queue_family_index, + &count, + props.data()); + + if (r != VK_SUCCESS || count == 0) + { + return false; + } + + const VkPhysicalDeviceDataGraphOperationTypeARM wanted_op_type = + (preferred_operation == QCOM_MODEL_OPERATION::NEURAL_MODEL) + ? VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_NEURAL_MODEL_QCOM + : VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_BUILTIN_MODEL_QCOM; + + // Find a QCOM engine type (NEURAL_QCOM preferred, COMPUTE_QCOM fallback) paired with the wanted operation. + // QCOM engine types and operation types are defined by VK_QCOM_data_graph_model. 
+ // https://docs.vulkan.org/refpages/latest/refpages/source/VK_QCOM_data_graph_model.html + // https://github.khronos.org/Vulkan-Site/refpages/latest/refpages/source/VkPhysicalDeviceDataGraphOperationTypeARM.html + // https://docs.vulkan.org/refpages/latest/refpages/source/vkGetPhysicalDeviceQueueFamilyDataGraphPropertiesARM.html + auto Match = [&](VkPhysicalDeviceDataGraphProcessingEngineTypeARM engine_type) -> bool + { + for (const auto& p : props) + { + if (p.engine.type == engine_type && p.operation.operationType == wanted_op_type) + { + out_selection.is_valid = true; + out_selection.engine = p.engine; + out_selection.operation = p.operation; + return true; + } + } + return false; + }; + + if (Match(VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_NEURAL_QCOM)) + { + return true; + } + + if (Match(VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_COMPUTE_QCOM)) + { + return true; + } + + // If we couldn't match preferred op, try the other op type (some stacks publish only one). + const VkPhysicalDeviceDataGraphOperationTypeARM alt_op_type = + (wanted_op_type == VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_NEURAL_MODEL_QCOM) + ? 
VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_BUILTIN_MODEL_QCOM + : VK_PHYSICAL_DEVICE_DATA_GRAPH_OPERATION_TYPE_NEURAL_MODEL_QCOM; + + for (const auto& p : props) + { + if ((p.engine.type == VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_NEURAL_QCOM || + p.engine.type == VK_PHYSICAL_DEVICE_DATA_GRAPH_PROCESSING_ENGINE_TYPE_COMPUTE_QCOM) && + p.operation.operationType == alt_op_type) + { + out_selection.is_valid = true; + out_selection.engine = p.engine; + out_selection.operation = p.operation; + return true; + } + } + + return false; +} diff --git a/samples/graph_pipelines/code/main/ml/QcomDataGraphModel.hpp b/samples/graph_pipelines/code/main/ml/QcomDataGraphModel.hpp new file mode 100644 index 0000000..6d2e3e6 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/QcomDataGraphModel.hpp @@ -0,0 +1,61 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include +#include + +#include "main/applicationHelperBase.hpp" + +namespace Ml +{ + enum class QCOM_MODEL_OPERATION + { + NEURAL_MODEL, + BUILTIN_MODEL + }; + + struct QcomSelectedDataGraphSupport + { + bool is_valid = false; + VkPhysicalDeviceDataGraphProcessingEngineARM engine = {}; + VkPhysicalDeviceDataGraphOperationSupportARM operation = {}; + }; + + class QcomDataGraphModel + { + public: + QcomDataGraphModel() = default; + ~QcomDataGraphModel() = default; + + static bool IsExtensionSupported(VkPhysicalDevice physical_device); + + static bool QueryFeatures( + VkPhysicalDevice physical_device, + VkPhysicalDeviceDataGraphModelFeaturesQCOM& out_features); + + static bool ValidateModelCacheBlob( + const std::vector& model_blob, + uint32_t& out_cache_version); + + static bool SelectQcomEngineAndOperationForQueueFamily( + VkPhysicalDevice physical_device, + uint32_t queue_family_index, + QCOM_MODEL_OPERATION preferred_operation, + QcomSelectedDataGraphSupport& out_selection); + + private: + static inline uint32_t ReadU32LE(const unsigned char* ptr) + { + return (uint32_t(ptr[0])) | + (uint32_t(ptr[1]) << 8) | + (uint32_t(ptr[2]) << 16) | + (uint32_t(ptr[3]) << 24); + } + }; +} // namespace Ml diff --git a/samples/graph_pipelines/code/main/ml/TensorResources.cpp b/samples/graph_pipelines/code/main/ml/TensorResources.cpp new file mode 100644 index 0000000..ee7927e --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/TensorResources.cpp @@ -0,0 +1,288 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#include +#include "TensorResources.hpp" + +bool Ml::TensorResources::FindMemoryType( + VkPhysicalDevice physical_device, + uint32_t type_bits, + VkMemoryPropertyFlags properties, + uint32_t& out_index) +{ + VkPhysicalDeviceMemoryProperties mem_properties = {}; + vkGetPhysicalDeviceMemoryProperties(physical_device, &mem_properties); + + for (uint32_t i = 0; i < mem_properties.memoryTypeCount; ++i) + { + const bool has_properties = (mem_properties.memoryTypes[i].propertyFlags & properties) == properties; + const bool has_bit = (type_bits & (1u << i)) != 0; + + if (has_properties && has_bit) + { + out_index = i; + return true; + } + } + + return false; +} + +bool Ml::TensorResources::CreateTensorInternal( + VkDevice device, + VkPhysicalDevice physical_device, + GraphPipelineTensor& target_tensor) +{ + // TENSOR DESCRIPTION + target_tensor.tensor_description = VkTensorDescriptionARM + { + .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, + .pNext = nullptr, + .tiling = VK_TENSOR_TILING_LINEAR_ARM, + .format = VK_FORMAT_R8_UNORM, + .dimensionCount = static_cast(target_tensor.dimensions.size()), + .pDimensions = target_tensor.dimensions.data(), + .pStrides = nullptr, + .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM/* | VK_TENSOR_USAGE_SHADER_BIT_ARM*/ + }; + + // TENSOR OBJECT + VkExternalMemoryTensorCreateInfoARM external_info = {}; + external_info.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_TENSOR_CREATE_INFO_ARM; + external_info.pNext = nullptr; + external_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + + VkTensorCreateInfoARM tensor_info = {}; + tensor_info.sType = VK_STRUCTURE_TYPE_TENSOR_CREATE_INFO_ARM; + tensor_info.pNext = &external_info; + tensor_info.flags = 0; + tensor_info.pDescription = &target_tensor.tensor_description; + tensor_info.sharingMode = 
VK_SHARING_MODE_EXCLUSIVE; + tensor_info.queueFamilyIndexCount = 0; + tensor_info.pQueueFamilyIndices = nullptr; + + if (vkCreateTensorARM(device, &tensor_info, nullptr, &target_tensor.tensor) != VK_SUCCESS) + { + return false; + } + + // MEMORY REQUIREMENTS + VkDeviceTensorMemoryRequirementsARM device_mem_req_info = {}; + device_mem_req_info.sType = VK_STRUCTURE_TYPE_DEVICE_TENSOR_MEMORY_REQUIREMENTS_ARM; + device_mem_req_info.pNext = nullptr; + device_mem_req_info.pCreateInfo = &tensor_info; + + VkMemoryRequirements2 tensor_mem_req = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 }; + vkGetDeviceTensorMemoryRequirementsARM(device, &device_mem_req_info, &tensor_mem_req); + + // ALIASED BUFFER + const uint32_t buffer_size = + static_cast(target_tensor.dimensions[0] * target_tensor.dimensions[1] * target_tensor.dimensions[2]); + + VkBufferCreateInfo buffer_info = {}; + buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + buffer_info.pNext = nullptr; + buffer_info.flags = 0; + buffer_info.size = buffer_size; + buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(device, &buffer_info, nullptr, &target_tensor.aliased_buffer) != VK_SUCCESS) + { + return false; + } + + VkMemoryRequirements buffer_mem_req = {}; + vkGetBufferMemoryRequirements(device, target_tensor.aliased_buffer, &buffer_mem_req); + + uint32_t memory_type_index = 0; + if (!FindMemoryType(physical_device, buffer_mem_req.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, memory_type_index)) + { + return false; + } + + VkExportMemoryAllocateInfo export_alloc_info = {}; + export_alloc_info.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; + export_alloc_info.pNext = nullptr; + export_alloc_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + + VkMemoryAllocateInfo alloc_info = {}; + alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + 
alloc_info.pNext = &export_alloc_info; + alloc_info.allocationSize = buffer_mem_req.size; + alloc_info.memoryTypeIndex = memory_type_index; + + if (vkAllocateMemory(device, &alloc_info, nullptr, &target_tensor.tensor_memory) != VK_SUCCESS) + { + return false; + } + + VkBindTensorMemoryInfoARM bind_tensor_info = {}; + bind_tensor_info.sType = VK_STRUCTURE_TYPE_BIND_TENSOR_MEMORY_INFO_ARM; + bind_tensor_info.pNext = nullptr; + bind_tensor_info.tensor = target_tensor.tensor; + bind_tensor_info.memory = target_tensor.tensor_memory; + bind_tensor_info.memoryOffset = 0; + + if (vkBindTensorMemoryARM(device, 1, &bind_tensor_info) != VK_SUCCESS) + { + return false; + } + + if (vkBindBufferMemory(device, target_tensor.aliased_buffer, target_tensor.tensor_memory, 0) != VK_SUCCESS) + { + return false; + } + + // TENSOR VIEW + VkTensorViewCreateInfoARM view_info = {}; + view_info.sType = VK_STRUCTURE_TYPE_TENSOR_VIEW_CREATE_INFO_ARM; + view_info.pNext = nullptr; + view_info.flags = 0; + view_info.tensor = target_tensor.tensor; + view_info.format = target_tensor.tensor_description.format; + + if (vkCreateTensorViewARM(device, &view_info, nullptr, &target_tensor.tensor_view) != VK_SUCCESS) + { + return false; + } + + return true; +} + +bool Ml::TensorResources::Initialize( + VkDevice device, + VkPhysicalDevice physical_device, + VkPhysicalDeviceDataGraphProcessingEngineARM& data_graph_engine, + GraphPipelineTensor& input_tensor, + GraphPipelineTensor& output_tensor, + uint32_t max_port_index) +{ + m_is_valid = false; + + if (!CreateTensorInternal(device, physical_device, input_tensor)) + { + return false; + } + + if (!CreateTensorInternal(device, physical_device, output_tensor)) + { + return false; + } + + // DESCRIPTOR POOL + VkDataGraphProcessingEngineCreateInfoARM engine_info = {}; + engine_info.sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PROCESSING_ENGINE_CREATE_INFO_ARM; + engine_info.pNext = nullptr; + engine_info.processingEngineCount = 1; + engine_info.pProcessingEngines = 
&data_graph_engine; + + VkDescriptorPoolSize pool_size = {}; + pool_size.type = VK_DESCRIPTOR_TYPE_TENSOR_ARM; + pool_size.descriptorCount = max_port_index + 1; + + VkDescriptorPoolCreateInfo pool_info = {}; + pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + pool_info.pNext = &engine_info; + pool_info.flags = 0; + pool_info.maxSets = 1; + pool_info.poolSizeCount = 1; + pool_info.pPoolSizes = &pool_size; + + if (vkCreateDescriptorPool(device, &pool_info, nullptr, &m_resources.tensor_descriptor_pool) != VK_SUCCESS) + { + return false; + } + + // DESCRIPTOR SET LAYOUT + VkDescriptorSetLayoutBinding bindings[2] = {}; + bindings[0].binding = input_tensor.port_binding_index; + bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = VK_SHADER_STAGE_ALL; + + bindings[1].binding = output_tensor.port_binding_index; + bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; + bindings[1].descriptorCount = 1; + bindings[1].stageFlags = VK_SHADER_STAGE_ALL; + + VkDescriptorSetLayoutCreateInfo layout_info = {}; + layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layout_info.pNext = nullptr; + layout_info.flags = 0; + layout_info.bindingCount = 2; + layout_info.pBindings = bindings; + + if (vkCreateDescriptorSetLayout(device, &layout_info, nullptr, &m_resources.tensor_descriptor_set_layout) != VK_SUCCESS) + { + return false; + } + + // DESCRIPTOR SET ALLOC + VkDescriptorSetAllocateInfo alloc_ds_info = {}; + alloc_ds_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + alloc_ds_info.descriptorPool = m_resources.tensor_descriptor_pool; + alloc_ds_info.descriptorSetCount = 1; + alloc_ds_info.pSetLayouts = &m_resources.tensor_descriptor_set_layout; + + if (vkAllocateDescriptorSets(device, &alloc_ds_info, &m_resources.tensor_descriptor_set) != VK_SUCCESS) + { + return false; + } + + // DESCRIPTOR SET UPDATE (tensor view writes) + VkWriteDescriptorSetTensorARM 
tensor_writes[2] = {}; + tensor_writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM; + tensor_writes[0].pNext = nullptr; + tensor_writes[0].tensorViewCount = 1; + tensor_writes[0].pTensorViews = &input_tensor.tensor_view; + + tensor_writes[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM; + tensor_writes[1].pNext = nullptr; + tensor_writes[1].tensorViewCount = 1; + tensor_writes[1].pTensorViews = &output_tensor.tensor_view; + + VkWriteDescriptorSet writes[2] = {}; + writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[0].pNext = &tensor_writes[0]; + writes[0].dstSet = m_resources.tensor_descriptor_set; + writes[0].dstBinding = input_tensor.port_binding_index; + writes[0].dstArrayElement = 0; + writes[0].descriptorCount = 1; + writes[0].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; + + writes[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[1].pNext = &tensor_writes[1]; + writes[1].dstSet = m_resources.tensor_descriptor_set; + writes[1].dstBinding = output_tensor.port_binding_index; + writes[1].dstArrayElement = 0; + writes[1].descriptorCount = 1; + writes[1].descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM; + + vkUpdateDescriptorSets(device, 2, writes, 0, nullptr); + + m_is_valid = true; + return true; +} + +void Ml::TensorResources::Destroy(VkDevice device) +{ + if (m_resources.tensor_descriptor_pool != VK_NULL_HANDLE) + { + vkDestroyDescriptorPool(device, m_resources.tensor_descriptor_pool, nullptr); + m_resources.tensor_descriptor_pool = VK_NULL_HANDLE; + } + + if (m_resources.tensor_descriptor_set_layout != VK_NULL_HANDLE) + { + vkDestroyDescriptorSetLayout(device, m_resources.tensor_descriptor_set_layout, nullptr); + m_resources.tensor_descriptor_set_layout = VK_NULL_HANDLE; + } + + m_resources.tensor_descriptor_set = VK_NULL_HANDLE; + m_is_valid = false; +} \ No newline at end of file diff --git a/samples/graph_pipelines/code/main/ml/TensorResources.hpp 
b/samples/graph_pipelines/code/main/ml/TensorResources.hpp new file mode 100644 index 0000000..b28dfe3 --- /dev/null +++ b/samples/graph_pipelines/code/main/ml/TensorResources.hpp @@ -0,0 +1,57 @@ +//============================================================================================================ +// +// +// Copyright (c) 2026, Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// +//============================================================================================================ +#pragma once + +#include +#include +#include "main/applicationHelperBase.hpp" +#include "GraphPipelineTypes.hpp" + +namespace Ml +{ + class TensorResources + { + public: + TensorResources() = default; + ~TensorResources() = default; + + TensorResources(const TensorResources&) = delete; + TensorResources& operator=(const TensorResources&) = delete; + + bool Initialize( + VkDevice device, + VkPhysicalDevice physical_device, + VkPhysicalDeviceDataGraphProcessingEngineARM& data_graph_engine, + GraphPipelineTensor& input_tensor, + GraphPipelineTensor& output_tensor, + uint32_t max_port_index); + + void Destroy(VkDevice device); + + inline bool IsValid() const { return m_is_valid; } + + inline const GraphPipelineResources& GetResources() const { return m_resources; } + + private: + + static bool FindMemoryType( + VkPhysicalDevice physical_device, + uint32_t type_bits, + VkMemoryPropertyFlags properties, + uint32_t& out_index); + + bool CreateTensorInternal( + VkDevice device, + VkPhysicalDevice physical_device, + GraphPipelineTensor& target_tensor); + + private: + bool m_is_valid = false; + GraphPipelineResources m_resources = {}; + }; +} // namespace Ml diff --git a/samples/graph_pipelines/img/screenshot.png b/samples/graph_pipelines/img/screenshot.png new file mode 100644 index 0000000..5f2660f Binary files /dev/null and b/samples/graph_pipelines/img/screenshot.png differ diff --git a/samples/graph_pipelines/shaders/Blit.frag 
b/samples/graph_pipelines/shaders/Blit.frag index 839bc24..b2e989d 100644 --- a/samples/graph_pipelines/shaders/Blit.frag +++ b/samples/graph_pipelines/shaders/Blit.frag @@ -14,6 +14,7 @@ // Buffer binding locations #define SHADER_DIFFUSE_TEXTURE_LOC 0 #define SHADER_OVERLAY_TEXTURE_LOC 1 +#define SHADER_FRAG_UBO_LOCATION 2 layout(set = 0, binding = SHADER_DIFFUSE_TEXTURE_LOC) uniform sampler2D u_DiffuseTex; layout(set = 0, binding = SHADER_OVERLAY_TEXTURE_LOC) uniform sampler2D u_OverlayTex; @@ -25,6 +26,13 @@ layout (location = 1) in vec4 v_VertColor; // Finally, the output color layout (location = 0) out vec4 FragColor; +// Uniform Constant Buffer +layout(std140, set = 0, binding = SHADER_FRAG_UBO_LOCATION) uniform Params +{ + bool IsUpscalingActive; + +} FragCB; + //----------------------------------------------------------------------------- void main() diff --git a/samples/graph_pipelines/shaders/Blit.json b/samples/graph_pipelines/shaders/Blit.json index d7d32c6..7c6404d 100644 --- a/samples/graph_pipelines/shaders/Blit.json +++ b/samples/graph_pipelines/shaders/Blit.json @@ -21,6 +21,11 @@ "Stages": [ "Fragment" ], "Count": 1, "Names": [ "Overlay" ] + }, + { + "Type": "UniformBuffer", + "Stages": [ "Fragment" ], + "Names": [ "Params" ] } ] } diff --git a/samples/hdr_swapchain/README.md b/samples/hdr_swapchain/README.md new file mode 100644 index 0000000..0e9a703 --- /dev/null +++ b/samples/hdr_swapchain/README.md @@ -0,0 +1,7 @@ +# HDR Swapchain + +![Screenshot](img/screenshot.png) + +This sample demonstrates creating and presenting to an **HDR‑capable Vulkan swapchain**, showing how to select HDR formats/color spaces, render in HDR, and present with correct metadata for wide‑gamut, high‑luminance displays. + +The app queries the surface for supported HDR formats and color spaces, builds an HDR swapchain, and renders with appropriate transfer functions and output transforms. 
If HDR is unavailable, it falls back to an SDR swapchain while preserving visual consistency across devices, including Adreno™ GPUs. \ No newline at end of file diff --git a/samples/hdr_swapchain/img/screenshot.png b/samples/hdr_swapchain/img/screenshot.png new file mode 100644 index 0000000..67828b9 Binary files /dev/null and b/samples/hdr_swapchain/img/screenshot.png differ diff --git a/samples/image_processing/README.md b/samples/image_processing/README.md index 073f3be..c925248 100644 --- a/samples/image_processing/README.md +++ b/samples/image_processing/README.md @@ -1,47 +1,4 @@ # VK_QCOM_Image_Processing Bloom Sample -## Overview - -This demonstrates how to use the VK_QCOM_Image_Processing extension in a simple bloom shader. -Setting gUseExtension to false will use a standard method, setting to true will use the extension in the downsample and blur passes. - -## Building - -### Dependencies - -The following dependencies must be installed and the appropriate locations should be referenced in the `PATH` environment variable. - -* Android SDK -* Andorid NDK -* Gradle -* CMake -* Android Studio - -### Build - -Once the dependencies are installed and shaders compiled, building this sample .apk/.exe is as simple as running any of the batch files from the framework root directory, accordingly to your target system: - -``` -01_BuildAndroid.bat -02_BuildWindows.bat -``` - -### Deploy (android-only) - -To deploy the media files and the .apk to a connected device, run the batch file below: - -``` -01_Install_APK.bat -``` - -Optionally you can change the default configurations for this sample by upating the file **app_config.txt** and running the batch file below: - -``` -02_InstallConfig.bat -``` - -## Android Studio - -This sample can also be easily imported to Android Studio and be used within the Android Studio ecosystem including building, deploying, and native code debugging. 
- -To do this, open Android Studio and go to `File->New->Import Project...` and select the `project\android` folder as the source for the import. This will load up the gradle configuration and once finalized, the sample can be used within Android Studio. +This demonstrates how to use the **VK_QCOM_Image_Processing** extension in a simple bloom shader. +Setting gUseExtension to false will use a standard method, setting to true will use the extension in the downsample and blur passes. \ No newline at end of file diff --git a/samples/rotated_copy/README.md b/samples/rotated_copy/README.md index 1b6a661..05713a5 100644 --- a/samples/rotated_copy/README.md +++ b/samples/rotated_copy/README.md @@ -1,6 +1,8 @@ # VK_QCOM_rotated_copy_commands Extension Sample -Sample to initialize and use the 'VK_QCOM_rotated_copy_commands' Vulkan extension. +![Screenshot](img/screenshot.png) -Extension may/will need enabling on older Qualcomm Vulkan drivers. Sample does nothing useful on non Qualcomm hardware. +Sample to initialize and use the +**VK_QCOM_rotated_copy_commands** Vulkan extension. +This extension adds rotated copy capabilities to older devices (where rotated swapchain alternative wasn't available). \ No newline at end of file diff --git a/samples/rotated_copy/img/screenshot.png b/samples/rotated_copy/img/screenshot.png new file mode 100644 index 0000000..ea2b7c3 Binary files /dev/null and b/samples/rotated_copy/img/screenshot.png differ diff --git a/samples/sgsr/README.md b/samples/sgsr/README.md index b192c7f..654d3f3 100644 --- a/samples/sgsr/README.md +++ b/samples/sgsr/README.md @@ -2,64 +2,7 @@ ![Screenshot](img/screenshot.png) -## Overview - -This sample demonstrates how to use the [Snapdragon™ Game Super Resolution](https://github.com/quic/snapdragon-gsr) and offers a comparison with the traditional *Bilinear Interpolation*. 
+This sample demonstrates how to use the [Snapdragon™ Game Super Resolution](https://github.com/SnapdragonGameStudios/snapdragon-gsr) and offers a comparison with the traditional *Bilinear Interpolation*. - Toggle **SGSR Active** to activate Snapdragon™ GSR -- Toggle **SGSR Edge Direction** to use an optional edge direction calculation on the SGSR shader that helps reducing banding - -## Building - -### Dependencies - -The following dependencies must be installed and the appropriate locations should be referenced in the `PATH` environment variable. - -* Android SDK -* Andorid NDK -* Gradle -* CMake -* Android Studio - -### Pre-Build - -Compile the underlying shaders to .spv by running the batch file below: - -``` -01_CompileShaders.bat -``` - -And convert the needed textures to the correct format using the batch file below: - -``` -02_PrepareMedia.bat -``` - -### Build - -Once the dependencies are installed and shaders compiled, building this sample .apk/.exe is as simple as running any of the batch files from the framework root directory, accordingly to your target system: - -``` -01_BuildAndroid.bat -02_BuildWindows.bat -``` - -### Deploy (android-only) - -To deploy the media files and the .apk to a connected device, run the batch file below: - -``` -03_Install_APK.bat -``` - -Optionally you can change the default configurations for this sample by upating the file **app_config.txt** and running the batch file below: - -``` -04_InstallConfig.bat -``` - -## Android Studio - -This sample can also be easily imported to Android Studio and be used within the Android Studio ecosystem including building, deploying, and native code debugging. - -To do this, open Android Studio and go to `File->New->Import Project...` and select the `project\android` folder as the source for the import. This will load up the gradle configuration and once finalized, the sample can be used within Android Studio. 
+- Toggle **SGSR Edge Direction** to use an optional edge direction calculation in the SGSR shader that helps reduce banding \ No newline at end of file diff --git a/samples/sgsr/img/screenshot.png b/samples/sgsr/img/screenshot.png new file mode 100644 index 0000000..b77b563 Binary files /dev/null and b/samples/sgsr/img/screenshot.png differ diff --git a/samples/sgsr2/README.md b/samples/sgsr2/README.md index 321d025..62f78e4 100644 --- a/samples/sgsr2/README.md +++ b/samples/sgsr2/README.md @@ -2,41 +2,6 @@ ![Screenshot](img/screenshot.png) -## Overview +This sample demonstrates how to use [Snapdragon™ Game Super Resolution 2](https://github.com/SnapdragonGameStudios/snapdragon-gsr). -This sample demonstrates how to use [Snapdragon™ Game Super Resolution 2](https://github.com/SnapdragonStudios/snapdragon-gsr). - -Snapdragon™ Game Super Resolution 2 (Snapdragon™ GSR 2 or SGSR 2) was developed by Qualcomm Snapdragon Game Studios. It's a temporal upscaling solution optimized for Adreno devices. It comes with 3 different variants (compute 3-pass is the one being demonstrated on this sample). - -## Building - -### Dependencies - -The following dependencies must be installed and the appropriate locations should be referenced in the `PATH` environment variable. - -* Android SDK -* Andorid NDK -* Gradle -* CMake -* Android Studio - -Build the tools program, which is used to conver textures, meshes and compile shaders. - -``` -03_BuildTools.bat -``` - -### Build - -Once the dependencies are installed, building this sample .apk/.exe is as simple as running any of the batch files from the framework root directory, accordingly to your target system: - -``` -01_BuildAndroid.bat -02_BuildWindows.bat -``` - -## Android Studio - -This sample can also be easily imported to Android Studio and be used within the Android Studio ecosystem including building, deploying, and native code debugging.
- -To do this, open Android Studio and go to `File->New->Import Project...` and select the `project\android` folder as the source for the import. This will load up the gradle configuration and once finalized, the sample can be used within Android Studio. +Snapdragon™ Game Super Resolution 2 (Snapdragon™ GSR 2 or SGSR 2) was developed by Qualcomm Snapdragon Game Studios. It's a temporal upscaling solution optimized for Adreno devices. It comes with 3 different variants (compute 3-pass is the one being demonstrated on this sample). \ No newline at end of file diff --git a/samples/sgsr2/img/screenshot.png b/samples/sgsr2/img/screenshot.png new file mode 100644 index 0000000..4e50503 Binary files /dev/null and b/samples/sgsr2/img/screenshot.png differ diff --git a/samples/sub_pass/README.md b/samples/sub_pass/README.md index 5d7a12b..1075a38 100644 --- a/samples/sub_pass/README.md +++ b/samples/sub_pass/README.md @@ -1,71 +1,9 @@ # SubPass Sample -![Screenshot](img/screenshot.PNG) - -## Overview - For mobile tile-based GPUs, subpasses is one of the most important ways to save memory bandwidth, therefore, to improve power efficiency and help performance at the meantime. -SubPass sample demos the use of vulkan subpasses to perform a filmic tonemapping operator (on a simple forward rendered scene) and the impact on bandwidth and performance with subpass. -Optionally runs the tonemap as a subpass of the main scene pass. Has onscreen UI controls to enable/disable the use of subpasses (for measuring GPU subpass efficiency). -Bandwidth savings are meaused with Snapdragon Profiler. 60% of the bandwidth can be saved in this sample when enabling subpass. - -From SnapdragonProfiler, we can see that there are four surfaces when subpass is disabled and three surfaces otherwise. There is extra GMEM stores when there is no subpass. 
- -![NoSubPassSurfaces](img/nosubpassstage.PNG) - -![SubPassSurfaces](img/subpassstage.PNG) - -Read total and write total have both been reduced when subpass is enable. - -![NoSubPassBandwidth](img/nosubpass.PNG) - -![SubPassBandwidth](img/subpass.PNG) - -## Building - -### Dependencies - -The following dependencies must be installed and the appropriate locations should be referenced in the `PATH` environment variable. - -* Android SDK -* Andorid NDK -* Gradle -* CMake -* Android Studio +SubPass sample demonstrates the use of Vulkan subpasses to perform a filmic tonemapping operator (on a simple forward rendered scene) and the impact on bandwidth and performance with subpass. -### Pre-Build - -Compile the underlying shaders to .spv by running the batch file below: - -``` -01_CompileShaders.bat -``` - -Note: The sample assumes the existence of supporting assets under the **'Media'** folder. These assets are not currently distributed with the framework. -The framework team is working to build a centralized asset repository that should minimize these requirements in the near future. - -### Build - -Once the dependencies are installed and shaders compiled, building this sample .apk/.exe is as simple as running any of the batch files from the framework root directory, accordingly to your target system: - -``` -01_BuildAndroid.bat -02_BuildWindows.bat -``` - -### Deploy (android-only) - -To deploy the media files and the .apk to a connected device, run the batch files below: - -``` -02_Install_APK.bat -``` - -If desired, you can keep track of any logging by running one of the logcat batch files (which you can find on the current directory). - -## Android Studio - -This sample can also be easily imported to Android Studio and be used within the Android Studio ecosystem including building, deploying, and native code debugging. +Optionally runs the tonemap as a subpass of the main scene pass. 
Has onscreen UI controls to enable/disable the use of subpasses (for measuring GPU subpass efficiency). -To do this, open Android Studio and go to `File->New->Import Project...` and select the `project\android` folder as the source for the import. This will load up the gradle configuration and once finalized, the sample can be used within Android Studio. +Bandwidth savings are measured with Snapdragon Profiler. 60% of the bandwidth can be saved in this sample when enabling subpass. \ No newline at end of file diff --git a/samples/tile_memory/README.md b/samples/tile_memory/README.md index 243f385..050e0ef 100644 --- a/samples/tile_memory/README.md +++ b/samples/tile_memory/README.md @@ -1,5 +1,7 @@ # Tile Memory Heap Sample +![Screenshot](img/screenshot.png) + This sample demonstrates a light clustering algorithm using Vulkan, with specific support for the *VK_QCOM_tile_memory_heap* extension. This extension allows the application to allocate and manage tile memory, which is used for efficient memory management within a command buffer submission batch. diff --git a/samples/tile_memory/img/screenshot.png b/samples/tile_memory/img/screenshot.png new file mode 100644 index 0000000..d60af4a Binary files /dev/null and b/samples/tile_memory/img/screenshot.png differ diff --git a/samples/tile_shading/README.md b/samples/tile_shading/README.md index c787a00..925b521 100644 --- a/samples/tile_shading/README.md +++ b/samples/tile_shading/README.md @@ -1,5 +1,7 @@ # Tile Shading Sample +![Screenshot](img/screenshot.png) + This sample demonstrates a tile-based shading technique using Vulkan, with support for the *VK_QCOM_tile_memory_heap* extension. The extension enables the application to allocate and manage tile-local memory, which is scoped to the duration of a command buffer submission and optimized for high-bandwidth, low-latency access within a tile. 
diff --git a/samples/tile_shading/code/main/application.cpp b/samples/tile_shading/code/main/application.cpp index a263276..1f7c9de 100644 --- a/samples/tile_shading/code/main/application.cpp +++ b/samples/tile_shading/code/main/application.cpp @@ -404,7 +404,8 @@ bool Application::CreateRenderTargets() gRenderWidth, gRenderHeight, SceneColorType, - desiredDepthFormat, + desiredDepthFormat, + nullptr, {}, // std::span }, "Scene RT")) @@ -421,6 +422,7 @@ bool Application::CreateRenderTargets() gRenderHeight, DeferredLightColorType, TextureFormat::UNDEFINED, + nullptr, {}, // std::span }, "Particles RT")) @@ -437,6 +439,7 @@ bool Application::CreateRenderTargets() gSurfaceHeight, HudColorType, TextureFormat::UNDEFINED, + nullptr, {}, // std::span }, "HUD RT")) diff --git a/samples/tile_shading/img/screenshot.png b/samples/tile_shading/img/screenshot.png new file mode 100644 index 0000000..0d4cfab Binary files /dev/null and b/samples/tile_shading/img/screenshot.png differ