From 4f628a2a79052f37fab602f2ebe9676f27fda3af Mon Sep 17 00:00:00 2001 From: Damien Dooley Date: Thu, 4 Jun 2026 15:35:39 +0100 Subject: [PATCH 1/4] Add SME2 shape heuristic Signed-off-by: Damien Dooley --- src/armnn/Network.cpp | 369 +++++++++++++++++- src/backends/neon/NeonBackend.cpp | 8 +- src/backends/neon/NeonBackendModelContext.cpp | 23 +- src/backends/neon/NeonBackendModelContext.hpp | 9 +- src/backends/neon/NeonWorkloadFactory.cpp | 6 +- 5 files changed, 399 insertions(+), 16 deletions(-) diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index e81674bf29..48276cd9e8 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017-2025 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -14,6 +14,8 @@ #include "armnnUtils/Filesystem.hpp" #include "armnn/utility/Timer.hpp" +#include + #include #include #include @@ -35,12 +37,372 @@ #include #include +#include #include #include #include namespace armnn { +namespace +{ + +struct Sme2ShapeProfile +{ + unsigned int m_GemmLikeOps = 0; + bool m_HasFp16 = false; + bool m_HasQuantized = false; + bool m_HasSegmentationShape = false; + bool m_HasStyleTransferShape = false; + bool m_HasPoseShape = false; + bool m_HasSmallMLargeNProjection = false; +}; + +bool IsQuantizedDataType(DataType dataType) +{ + switch (dataType) + { + case DataType::QAsymmU8: + case DataType::QAsymmS8: + case DataType::QSymmS8: + case DataType::QSymmS16: + return true; + default: + return false; + } +} + +void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo) +{ + const DataType dataType = tensorInfo.GetDataType(); + profile.m_HasFp16 |= dataType == DataType::Float16; + profile.m_HasQuantized |= IsQuantizedDataType(dataType); +} + +bool HasSpecifiedShape(const TensorInfo& tensorInfo) +{ + const TensorShape& shape = tensorInfo.GetShape(); + return 
shape.GetDimensionality() == Dimensionality::Specified && + shape.AreAllDimensionsSpecified(); +} + +int64_t NumElements(const TensorShape& shape) +{ + if (shape.GetDimensionality() != Dimensionality::Specified || + !shape.AreAllDimensionsSpecified()) + { + return 0; + } + + int64_t elements = 1; + for (unsigned int i = 0; i < shape.GetNumDimensions(); ++i) + { + elements *= static_cast(std::max(shape[i], 1U)); + } + return elements; +} + +int64_t Dimension(const TensorShape& shape, unsigned int index) +{ + if (shape.GetDimensionality() != Dimensionality::Specified || + !shape.AreAllDimensionsSpecified() || + index >= shape.GetNumDimensions()) + { + return 0; + } + return static_cast(shape[index]); +} + +int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset) +{ + if (offset == 0 || shape.GetNumDimensions() < offset) + { + return 0; + } + return Dimension(shape, shape.GetNumDimensions() - offset); +} + +void RecordGemmShape(Sme2ShapeProfile& profile, + int64_t m, + int64_t n, + int64_t k, + int64_t kernelH, + int64_t kernelW) +{ + if (m <= 0 || n <= 0 || k <= 0) + { + return; + } + + ++profile.m_GemmLikeOps; + + const bool is1x1 = kernelH == 1 && kernelW == 1; + + if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900))) + { + profile.m_HasSegmentationShape = true; + } + + if (!is1x1 && ((m >= 25000 && m <= 30000 && n == 64 && k >= 2000) || + (m >= 60000 && m <= 75000 && n == 32 && k >= 500))) + { + profile.m_HasStyleTransferShape = true; + } + + if (!is1x1 && m >= 100000 && n >= 64 && k >= 500) + { + profile.m_HasPoseShape = true; + } + + if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024) + { + profile.m_HasSmallMLargeNProjection = true; + } +} + +void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + 
{ + return; + } + + const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& filterInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, inputInfo); + RecordTensorType(profile, filterInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(filterInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const Convolution2dDescriptor& descriptor = + static_cast(layer.GetParameters()); + const TensorShape& filterShape = filterInfo.GetShape(); + const TensorShape& outputShape = outputInfo.GetShape(); + const armnnUtils::DataLayoutIndexed dataLayoutIndex(descriptor.m_DataLayout); + + if (filterShape.GetNumDimensions() != 4 || outputShape.GetNumDimensions() != 4) + { + return; + } + + const int64_t n = Dimension(filterShape, 0); + const int64_t kernelH = Dimension(filterShape, dataLayoutIndex.GetHeightIndex()); + const int64_t kernelW = Dimension(filterShape, dataLayoutIndex.GetWidthIndex()); + const int64_t filterElements = NumElements(filterShape); + const int64_t inputChannels = n > 0 && kernelH > 0 && kernelW > 0 ? + filterElements / (n * kernelH * kernelW) : 0; + const int64_t k = kernelH * kernelW * inputChannels; + const int64_t outputElements = NumElements(outputShape); + const int64_t m = n > 0 ? 
outputElements / n : 0; + + RecordGemmShape(profile, m, n, k, kernelH, kernelW); +} + +void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& weightsInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, inputInfo); + RecordTensorType(profile, weightsInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(inputInfo) || !HasSpecifiedShape(weightsInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const TensorShape& weightsShape = weightsInfo.GetShape(); + if (weightsShape.GetNumDimensions() < 2) + { + return; + } + + const FullyConnectedDescriptor& descriptor = + static_cast(layer.GetParameters()); + const unsigned int nIndex = descriptor.m_TransposeWeightMatrix ? 0U : 1U; + const unsigned int kIndex = descriptor.m_TransposeWeightMatrix ? 1U : 0U; + const int64_t n = Dimension(weightsShape, nIndex); + const int64_t k = Dimension(weightsShape, kIndex); + const int64_t outputElements = NumElements(outputInfo.GetShape()); + const int64_t m = n > 0 ? 
outputElements / n : 0; + + RecordGemmShape(profile, m, n, k, 1, 1); +} + +void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, lhsInfo); + RecordTensorType(profile, rhsInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const TensorShape& lhsShape = lhsInfo.GetShape(); + const TensorShape& outputShape = outputInfo.GetShape(); + const int64_t n = DimensionFromEnd(outputShape, 1); + const int64_t m = n > 0 ? 
NumElements(outputShape) / n : 0; + int64_t k = DimensionFromEnd(lhsShape, 1); + if (k == n) + { + k = DimensionFromEnd(lhsShape, 2); + } + + RecordGemmShape(profile, m, n, k, 1, 1); +} + +Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16) +{ + Sme2ShapeProfile profile; + profile.m_HasFp16 = reduceFp32ToFp16; + + for (const Layer* layer : graph) + { + switch (layer->GetType()) + { + case LayerType::Convolution2d: + RecordConvolution2d(profile, *layer); + break; + case LayerType::FullyConnected: + RecordFullyConnected(profile, *layer); + break; + case LayerType::BatchMatMul: + RecordBatchMatMul(profile, *layer); + break; + default: + for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i) + { + if (layer->GetInputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer->GetInputSlot(i).GetTensorInfo()); + } + } + for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i) + { + if (layer->GetOutputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer->GetOutputSlot(i).GetTensorInfo()); + } + } + break; + } + } + + return profile; +} + +unsigned int CapWorkerCount(unsigned int workers, unsigned int cap) +{ + if (workers == 0 || cap == 0 || cap >= workers) + { + return workers; + } + return cap; +} + +unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions) +{ + unsigned int numberOfThreads = 0; + ParseOptions(modelOptions, "CpuAcc", [&](std::string name, const BackendOptions::Var& value) + { + if (name == "NumberOfThreads") + { + if (value.IsUnsignedInt()) + { + numberOfThreads = value.AsUnsignedInt(); + } + else if (value.IsInt() && value.AsInt() > 0) + { + numberOfThreads = static_cast(value.AsInt()); + } + } + }); + return numberOfThreads; +} + +bool ShouldDisableSme(const Sme2ShapeProfile& profile) +{ + if (profile.m_GemmLikeOps == 0) + { + return false; + } + + return (profile.m_HasFp16 || profile.m_HasQuantized) && !profile.m_HasSmallMLargeNProjection; +} + +unsigned int 
SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads) +{ + if (!profile.m_HasQuantized || ShouldDisableSme(profile)) + { + return requestedThreads; + } + + if (profile.m_GemmLikeOps == 0) + { + return CapWorkerCount(requestedThreads, 1); + } + + if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape) + { + return requestedThreads; + } + + if (profile.m_HasPoseShape) + { + return CapWorkerCount(requestedThreads, 4); + } + + return CapWorkerCount(requestedThreads, 1); +} + +void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions) +{ + const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16); + const bool smeEnabled = !ShouldDisableSme(profile); + const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions); + const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads); + + modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}})); + if (selectedThreads != requestedThreads) + { + modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}})); + } +} + +} // namespace INetwork::INetwork(NetworkOptions networkOptions) : pNetworkImpl(new NetworkImpl(networkOptions)) {} @@ -2096,6 +2458,9 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph, optGraph.InferTensorInfos(); } + ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions); + optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions; + // Initialize backend settings BackendSettings backendSettings(backendPreferences, deviceSpec); auto availablePreferredBackends = backendSettings.GetAvailablePreferredBackends(); @@ -2207,7 +2572,7 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph, OptimizationResult backendOptimizationResult = ApplyBackendOptimizations(optNetObjPtr->pOptimizedNetworkImpl.get(), backendSettings, backends, - options.GetModelOptions(), + optimizedOptions, 
messages); if (backendOptimizationResult.m_Error) { diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp index 9cd9d18d47..feb19e79e7 100644 --- a/src/backends/neon/NeonBackend.cpp +++ b/src/backends/neon/NeonBackend.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017-2025 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -134,11 +134,7 @@ IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport() const IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport(const ModelOptions& modelOptions) const { - static ILayerSupportSharedPtr layerSupport - { - new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions)) - }; - return layerSupport; + return ILayerSupportSharedPtr{new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions))}; } OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph, diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp index 270592e94d..acc2eecc07 100644 --- a/src/backends/neon/NeonBackendModelContext.cpp +++ b/src/backends/neon/NeonBackendModelContext.cpp @@ -1,10 +1,12 @@ // -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #include "NeonBackendModelContext.hpp" +#include + namespace { @@ -32,7 +34,7 @@ namespace armnn { NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOptions) - : m_IsFastMathEnabled(false), m_NumberOfThreads(0) + : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSmeEnabled(true) { if (!modelOptions.empty()) { @@ -40,14 +42,20 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption { if (name == "FastMathEnabled") { - m_IsFastMathEnabled |= ParseBool(value, false); + m_IsFastMathEnabled = ParseBool(value, m_IsFastMathEnabled); } if (name == "NumberOfThreads") { - m_NumberOfThreads |= ParseUnsignedInt(value, 0); + m_NumberOfThreads = ParseUnsignedInt(value, m_NumberOfThreads); + } + if (name == "SmeEnabled") + { + m_IsSmeEnabled = ParseBool(value, m_IsSmeEnabled); } }); } + + arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled); } bool NeonBackendModelContext::IsFastMathEnabled() const @@ -60,4 +68,9 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const return m_NumberOfThreads; } -} // namespace armnn \ No newline at end of file +bool NeonBackendModelContext::IsSmeEnabled() const +{ + return m_IsSmeEnabled; +} + +} // namespace armnn diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp index e736efc1d8..7ca9d0cf0d 100644 --- a/src/backends/neon/NeonBackendModelContext.hpp +++ b/src/backends/neon/NeonBackendModelContext.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -16,6 +16,8 @@ namespace armnn /// results with reduced or different precision. The fast_math flag will not have any effect on int8 performance. /// - "NumberOfThreads"\n /// Specify the number of threads used by the CpuAcc backend. 
+/// - "SmeEnabled"\n +/// Specify whether SME/SME2 implementations may be selected by the CpuAcc backend. class NeonBackendModelContext : public IBackendModelContext { public: @@ -25,9 +27,12 @@ class NeonBackendModelContext : public IBackendModelContext unsigned int GetNumberOfThreads() const; + bool IsSmeEnabled() const; + private: bool m_IsFastMathEnabled; unsigned int m_NumberOfThreads; + bool m_IsSmeEnabled; }; -} // namespace armnn \ No newline at end of file +} // namespace armnn diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index f75f84d2d8..2126bf954e 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017-2024 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -63,6 +65,8 @@ void NeonWorkloadFactory::SetNumberOfThreads() auto modelOptions = dynamic_cast(m_ModelContextPtr.get()); auto numberOfThreads = modelOptions->GetNumberOfThreads(); + arm_compute::CPUInfo::get().set_sme_allowed(modelOptions->IsSmeEnabled()); + if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS) { arm_compute::Scheduler::get().set_num_threads(numberOfThreads); From 3515aac50f7d00f19f49ffc5fc5bb5a4278dd410 Mon Sep 17 00:00:00 2001 From: Damien Dooley Date: Mon, 8 Jun 2026 14:53:12 +0100 Subject: [PATCH 2/4] Updated heuristic for better coverage of FP16 and FP32 use cases --- delegate/cmake/Modules/FindTfLite.cmake | 7 +- include/armnnUtils/QuantizeHelper.hpp | 2 +- src/armnn/Network.cpp | 85 +++++++++++++++++-- src/armnnUtils/Half.hpp | 24 ++---- src/backends/neon/NeonBackendModelContext.cpp | 12 ++- src/backends/neon/NeonBackendModelContext.hpp | 5 ++ src/backends/neon/NeonWorkloadFactory.cpp | 1 + 7 files changed, 109 insertions(+), 27 
deletions(-) diff --git a/delegate/cmake/Modules/FindTfLite.cmake b/delegate/cmake/Modules/FindTfLite.cmake index bdb4df7ce0..d5c0e53606 100644 --- a/delegate/cmake/Modules/FindTfLite.cmake +++ b/delegate/cmake/Modules/FindTfLite.cmake @@ -150,7 +150,7 @@ if (TfLite_LIB MATCHES .a$) TfLite_ruy_prepare_packed_matrices_LIB TfLite_ruy_system_aligned_alloc_LIB TfLite_ruy_threadpool_LIB TfLite_ruy_trmul_LIB TfLite_ruy_tune_LIB TfLite_ruy_wait_LIB TfLite_ruy_profiler_LIB TfLite_cpuinfo_LIB TfLite_abseil_synchronization_LIB TfLite_abseil_graphCycle_internal_LIB TfLite_abseil_raw_logging_internal_LIB - TfLite_abseil_kernel_timeout_LIB TfLite_abseil_internal_strings_LIB) + TfLite_abseil_internal_strings_LIB) # Set external variables for usage in CMakeLists.txt if (TFLITE_FOUND) # WARNING! The order of these libraries is critical. Moving them @@ -166,7 +166,10 @@ if (TfLite_LIB MATCHES .a$) ${TfLite_ruy_prepare_packed_matrices_LIB} ${TfLite_ruy_system_aligned_alloc_LIB} ${TfLite_ruy_tune_LIB} ${TfLite_ruy_wait_LIB} ${TfLite_ruy_profiler_LIB} ${TfLite_cpuinfo_LIB} ${TfLite_abseil_synchronization_LIB} ${TfLite_abseil_graphCycle_internal_LIB} - ${TfLite_abseil_raw_logging_internal_LIB} ${TfLite_abseil_kernel_timeout_LIB} ${TfLite_abseil_internal_strings_LIB}) + ${TfLite_abseil_raw_logging_internal_LIB} ${TfLite_abseil_internal_strings_LIB}) + if(TfLite_abseil_kernel_timeout_LIB) + list(APPEND TfLite_LIB ${TfLite_abseil_kernel_timeout_LIB}) + endif() endif () elseif (TfLite_LIB MATCHES .so$) message("-- Dynamic tensorflow lite library found, using for ArmNN build") diff --git a/include/armnnUtils/QuantizeHelper.hpp b/include/armnnUtils/QuantizeHelper.hpp index 231b8411cb..5868d13fc8 100644 --- a/include/armnnUtils/QuantizeHelper.hpp +++ b/include/armnnUtils/QuantizeHelper.hpp @@ -96,7 +96,7 @@ float SelectiveDequantize(T value, float scale, int32_t offset) template struct IsFloatingPointIterator { - static constexpr bool value=std::is_floating_point::value_type>::value; + static 
constexpr bool value=armnn::IsFloatingPoint::value_type>::value; }; template = 900 && k <= 384) || (n <= 384 && k >= 900))) { @@ -204,7 +219,7 @@ void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) const int64_t outputElements = NumElements(outputShape); const int64_t m = n > 0 ? outputElements / n : 0; - RecordGemmShape(profile, m, n, k, kernelH, kernelW); + RecordGemmShape(profile, m, n, k, kernelH, kernelW, false); } void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer) @@ -245,7 +260,7 @@ void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer) const int64_t outputElements = NumElements(outputInfo.GetShape()); const int64_t m = n > 0 ? outputElements / n : 0; - RecordGemmShape(profile, m, n, k, 1, 1); + RecordGemmShape(profile, m, n, k, 1, 1, true); } void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer) @@ -281,7 +296,27 @@ void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer) k = DimensionFromEnd(lhsShape, 2); } - RecordGemmShape(profile, m, n, k, 1, 1); + RecordGemmShape(profile, m, n, k, 1, 1, true); +} + +void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) +{ + ++profile.m_DepthwiseConvolution2dOps; + + for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) + { + if (layer.GetInputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer.GetInputSlot(i).GetTensorInfo()); + } + } + for (unsigned int i = 0; i < layer.GetNumOutputSlots(); ++i) + { + if (layer.GetOutputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer.GetOutputSlot(i).GetTensorInfo()); + } + } } Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16) @@ -302,6 +337,9 @@ Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16 case LayerType::BatchMatMul: RecordBatchMatMul(profile, *layer); break; + case LayerType::DepthwiseConvolution2d: + RecordDepthwiseConvolution2d(profile, *layer); + break; default: 
for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i) { @@ -353,6 +391,30 @@ unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions) return numberOfThreads; } +bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile) +{ + const bool isFloatOnly = !profile.m_HasFp16 && !profile.m_HasQuantized; + if (!isFloatOnly) + { + return false; + } + + const bool hasHeavySpatialConvolution = + profile.m_GemmMacs > 0 && + profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs && + !profile.m_HasSegmentationShape; + + const bool hasSmallDenseGraph = + profile.m_DepthwiseConvolution2dOps == 0 && + profile.m_SmallDenseProjectionOps >= 4 && + !profile.m_HasSmallMLargeNProjection; + + return profile.m_HasPoseShape || + profile.m_HasStyleTransferShape || + hasHeavySpatialConvolution || + hasSmallDenseGraph; +} + bool ShouldDisableSme(const Sme2ShapeProfile& profile) { if (profile.m_GemmLikeOps == 0) @@ -360,7 +422,17 @@ bool ShouldDisableSme(const Sme2ShapeProfile& profile) return false; } - return (profile.m_HasFp16 || profile.m_HasQuantized) && !profile.m_HasSmallMLargeNProjection; + if (profile.m_HasFp16) + { + return true; + } + + if (profile.m_HasQuantized) + { + return !profile.m_HasSmallMLargeNProjection; + } + + return HasFloatSmeRegressionRisk(profile); } unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads) @@ -392,10 +464,11 @@ void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOption { const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16); const bool smeEnabled = !ShouldDisableSme(profile); + const bool sveEnabled = smeEnabled || profile.m_HasQuantized; const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions); const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads); - modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}})); + modelOptions.push_back(BackendOptions("CpuAcc", 
{{"SmeEnabled", smeEnabled}, {"SveEnabled", sveEnabled}})); if (selectedThreads != requestedThreads) { modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}})); diff --git a/src/armnnUtils/Half.hpp b/src/armnnUtils/Half.hpp index e39968f54d..ce867725dd 100644 --- a/src/armnnUtils/Half.hpp +++ b/src/armnnUtils/Half.hpp @@ -20,25 +20,15 @@ namespace armnn { using Half = half_float::half; //import half float implementation -} //namespace armnn - - -namespace std -{ -template<> -struct is_floating_point - : integral_constant< bool, true > +template +struct IsArmnnHalf + : std::is_same::type, Half> {}; -template<> -struct is_floating_point - : integral_constant< bool, true > +template +struct IsFloatingPoint + : std::integral_constant::value || IsArmnnHalf::value> {}; -template<> -struct is_floating_point - : integral_constant< bool, true > -{}; - -} //namespace std +} //namespace armnn diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp index acc2eecc07..20b44795dd 100644 --- a/src/backends/neon/NeonBackendModelContext.cpp +++ b/src/backends/neon/NeonBackendModelContext.cpp @@ -34,7 +34,7 @@ namespace armnn { NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOptions) - : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSmeEnabled(true) + : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSveEnabled(true), m_IsSmeEnabled(true) { if (!modelOptions.empty()) { @@ -52,9 +52,14 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption { m_IsSmeEnabled = ParseBool(value, m_IsSmeEnabled); } + if (name == "SveEnabled") + { + m_IsSveEnabled = ParseBool(value, m_IsSveEnabled); + } }); } + arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled); arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled); } @@ -68,6 +73,11 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const return m_NumberOfThreads; } +bool 
NeonBackendModelContext::IsSveEnabled() const +{ + return m_IsSveEnabled; +} + bool NeonBackendModelContext::IsSmeEnabled() const { return m_IsSmeEnabled; diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp index 7ca9d0cf0d..60d3157471 100644 --- a/src/backends/neon/NeonBackendModelContext.hpp +++ b/src/backends/neon/NeonBackendModelContext.hpp @@ -16,6 +16,8 @@ namespace armnn /// results with reduced or different precision. The fast_math flag will not have any effect on int8 performance. /// - "NumberOfThreads"\n /// Specify the number of threads used by the CpuAcc backend. +/// - "SveEnabled"\n +/// Specify whether SVE/SVE2 implementations may be selected by the CpuAcc backend. /// - "SmeEnabled"\n /// Specify whether SME/SME2 implementations may be selected by the CpuAcc backend. class NeonBackendModelContext : public IBackendModelContext @@ -27,11 +29,14 @@ class NeonBackendModelContext : public IBackendModelContext unsigned int GetNumberOfThreads() const; + bool IsSveEnabled() const; + bool IsSmeEnabled() const; private: bool m_IsFastMathEnabled; unsigned int m_NumberOfThreads; + bool m_IsSveEnabled; bool m_IsSmeEnabled; }; diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index 2126bf954e..e90fe60488 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -65,6 +65,7 @@ void NeonWorkloadFactory::SetNumberOfThreads() auto modelOptions = dynamic_cast(m_ModelContextPtr.get()); auto numberOfThreads = modelOptions->GetNumberOfThreads(); + arm_compute::CPUInfo::get().set_sve_allowed(modelOptions->IsSveEnabled()); arm_compute::CPUInfo::get().set_sme_allowed(modelOptions->IsSmeEnabled()); if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS) From 00fca53fb3defff227a45dea0770c8eec59f91d9 Mon Sep 17 00:00:00 2001 From: Damien Dooley Date: Fri, 12 Jun 2026 11:49:04 +0100 
Subject: [PATCH 3/4] Separated heuristic definition and tightened other logic --- CMakeLists.txt | 2 + src/armnn/Network.cpp | 437 +---------------- src/armnn/Sme2ShapePolicy.cpp | 453 ++++++++++++++++++ src/armnn/Sme2ShapePolicy.hpp | 17 + src/backends/neon/NeonBackendModelContext.cpp | 13 +- src/backends/neon/NeonBackendModelContext.hpp | 4 +- src/backends/neon/NeonWorkloadFactory.cpp | 5 +- 7 files changed, 479 insertions(+), 452 deletions(-) create mode 100644 src/armnn/Sme2ShapePolicy.cpp create mode 100644 src/armnn/Sme2ShapePolicy.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ca3bb69d5..08ed3ea135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,6 +385,8 @@ list(APPEND armnn_sources src/armnn/Logging.cpp src/armnn/Network.cpp src/armnn/Network.hpp + src/armnn/Sme2ShapePolicy.cpp + src/armnn/Sme2ShapePolicy.hpp src/armnn/NetworkUtils.cpp src/armnn/NetworkUtils.hpp src/armnn/Observable.cpp diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index abd9ab3154..f83be3b864 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -4,6 +4,7 @@ // #include "Network.hpp" +#include "Sme2ShapePolicy.hpp" #include "Graph.hpp" #include "Layer.hpp" #include "DeviceSpec.hpp" @@ -14,8 +15,6 @@ #include "armnnUtils/Filesystem.hpp" #include "armnn/utility/Timer.hpp" -#include - #include #include #include @@ -36,446 +35,12 @@ #include #include -#include -#include #include #include #include namespace armnn { -namespace -{ - -struct Sme2ShapeProfile -{ - unsigned int m_GemmLikeOps = 0; - unsigned int m_DepthwiseConvolution2dOps = 0; - unsigned int m_SmallDenseProjectionOps = 0; - int64_t m_GemmMacs = 0; - int64_t m_NonPointwiseGemmMacs = 0; - bool m_HasFp16 = false; - bool m_HasQuantized = false; - bool m_HasSegmentationShape = false; - bool m_HasStyleTransferShape = false; - bool m_HasPoseShape = false; - bool m_HasSmallMLargeNProjection = false; -}; - -bool IsQuantizedDataType(DataType dataType) -{ - switch (dataType) - { - case 
DataType::QAsymmU8: - case DataType::QAsymmS8: - case DataType::QSymmS8: - case DataType::QSymmS16: - return true; - default: - return false; - } -} - -void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo) -{ - const DataType dataType = tensorInfo.GetDataType(); - profile.m_HasFp16 |= dataType == DataType::Float16; - profile.m_HasQuantized |= IsQuantizedDataType(dataType); -} - -bool HasSpecifiedShape(const TensorInfo& tensorInfo) -{ - const TensorShape& shape = tensorInfo.GetShape(); - return shape.GetDimensionality() == Dimensionality::Specified && - shape.AreAllDimensionsSpecified(); -} - -int64_t NumElements(const TensorShape& shape) -{ - if (shape.GetDimensionality() != Dimensionality::Specified || - !shape.AreAllDimensionsSpecified()) - { - return 0; - } - - int64_t elements = 1; - for (unsigned int i = 0; i < shape.GetNumDimensions(); ++i) - { - elements *= static_cast(std::max(shape[i], 1U)); - } - return elements; -} - -int64_t Dimension(const TensorShape& shape, unsigned int index) -{ - if (shape.GetDimensionality() != Dimensionality::Specified || - !shape.AreAllDimensionsSpecified() || - index >= shape.GetNumDimensions()) - { - return 0; - } - return static_cast(shape[index]); -} - -int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset) -{ - if (offset == 0 || shape.GetNumDimensions() < offset) - { - return 0; - } - return Dimension(shape, shape.GetNumDimensions() - offset); -} - -void RecordGemmShape(Sme2ShapeProfile& profile, - int64_t m, - int64_t n, - int64_t k, - int64_t kernelH, - int64_t kernelW, - bool isDenseProjection) -{ - if (m <= 0 || n <= 0 || k <= 0) - { - return; - } - - ++profile.m_GemmLikeOps; - - const bool is1x1 = kernelH == 1 && kernelW == 1; - const int64_t macs = m * n * k; - profile.m_GemmMacs += macs; - if (!is1x1) - { - profile.m_NonPointwiseGemmMacs += macs; - } - if (isDenseProjection && is1x1 && m <= 256 && n <= 1024 && k <= 1024) - { - ++profile.m_SmallDenseProjectionOps; - } - - if 
(is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900))) - { - profile.m_HasSegmentationShape = true; - } - - if (!is1x1 && ((m >= 25000 && m <= 30000 && n == 64 && k >= 2000) || - (m >= 60000 && m <= 75000 && n == 32 && k >= 500))) - { - profile.m_HasStyleTransferShape = true; - } - - if (!is1x1 && m >= 100000 && n >= 64 && k >= 500) - { - profile.m_HasPoseShape = true; - } - - if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024) - { - profile.m_HasSmallMLargeNProjection = true; - } -} - -void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) -{ - if (layer.GetNumInputSlots() < 2 || - layer.GetNumOutputSlots() == 0 || - !layer.GetInputSlot(0).IsTensorInfoSet() || - !layer.GetInputSlot(1).IsTensorInfoSet() || - !layer.GetOutputSlot(0).IsTensorInfoSet()) - { - return; - } - - const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); - const TensorInfo& filterInfo = layer.GetInputSlot(1).GetTensorInfo(); - const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); - RecordTensorType(profile, inputInfo); - RecordTensorType(profile, filterInfo); - RecordTensorType(profile, outputInfo); - - if (!HasSpecifiedShape(filterInfo) || !HasSpecifiedShape(outputInfo)) - { - return; - } - - const Convolution2dDescriptor& descriptor = - static_cast(layer.GetParameters()); - const TensorShape& filterShape = filterInfo.GetShape(); - const TensorShape& outputShape = outputInfo.GetShape(); - const armnnUtils::DataLayoutIndexed dataLayoutIndex(descriptor.m_DataLayout); - - if (filterShape.GetNumDimensions() != 4 || outputShape.GetNumDimensions() != 4) - { - return; - } - - const int64_t n = Dimension(filterShape, 0); - const int64_t kernelH = Dimension(filterShape, dataLayoutIndex.GetHeightIndex()); - const int64_t kernelW = Dimension(filterShape, dataLayoutIndex.GetWidthIndex()); - const int64_t filterElements = NumElements(filterShape); - const int64_t inputChannels = n > 0 && kernelH > 0 && kernelW > 0 ? 
- filterElements / (n * kernelH * kernelW) : 0; - const int64_t k = kernelH * kernelW * inputChannels; - const int64_t outputElements = NumElements(outputShape); - const int64_t m = n > 0 ? outputElements / n : 0; - - RecordGemmShape(profile, m, n, k, kernelH, kernelW, false); -} - -void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer) -{ - if (layer.GetNumInputSlots() < 2 || - layer.GetNumOutputSlots() == 0 || - !layer.GetInputSlot(0).IsTensorInfoSet() || - !layer.GetInputSlot(1).IsTensorInfoSet() || - !layer.GetOutputSlot(0).IsTensorInfoSet()) - { - return; - } - - const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); - const TensorInfo& weightsInfo = layer.GetInputSlot(1).GetTensorInfo(); - const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); - RecordTensorType(profile, inputInfo); - RecordTensorType(profile, weightsInfo); - RecordTensorType(profile, outputInfo); - - if (!HasSpecifiedShape(inputInfo) || !HasSpecifiedShape(weightsInfo) || !HasSpecifiedShape(outputInfo)) - { - return; - } - - const TensorShape& weightsShape = weightsInfo.GetShape(); - if (weightsShape.GetNumDimensions() < 2) - { - return; - } - - const FullyConnectedDescriptor& descriptor = - static_cast(layer.GetParameters()); - const unsigned int nIndex = descriptor.m_TransposeWeightMatrix ? 0U : 1U; - const unsigned int kIndex = descriptor.m_TransposeWeightMatrix ? 1U : 0U; - const int64_t n = Dimension(weightsShape, nIndex); - const int64_t k = Dimension(weightsShape, kIndex); - const int64_t outputElements = NumElements(outputInfo.GetShape()); - const int64_t m = n > 0 ? 
outputElements / n : 0; - - RecordGemmShape(profile, m, n, k, 1, 1, true); -} - -void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer) -{ - if (layer.GetNumInputSlots() < 2 || - layer.GetNumOutputSlots() == 0 || - !layer.GetInputSlot(0).IsTensorInfoSet() || - !layer.GetInputSlot(1).IsTensorInfoSet() || - !layer.GetOutputSlot(0).IsTensorInfoSet()) - { - return; - } - - const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo(); - const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo(); - const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); - RecordTensorType(profile, lhsInfo); - RecordTensorType(profile, rhsInfo); - RecordTensorType(profile, outputInfo); - - if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo)) - { - return; - } - - const TensorShape& lhsShape = lhsInfo.GetShape(); - const TensorShape& outputShape = outputInfo.GetShape(); - const int64_t n = DimensionFromEnd(outputShape, 1); - const int64_t m = n > 0 ? 
NumElements(outputShape) / n : 0; - int64_t k = DimensionFromEnd(lhsShape, 1); - if (k == n) - { - k = DimensionFromEnd(lhsShape, 2); - } - - RecordGemmShape(profile, m, n, k, 1, 1, true); -} - -void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) -{ - ++profile.m_DepthwiseConvolution2dOps; - - for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) - { - if (layer.GetInputSlot(i).IsTensorInfoSet()) - { - RecordTensorType(profile, layer.GetInputSlot(i).GetTensorInfo()); - } - } - for (unsigned int i = 0; i < layer.GetNumOutputSlots(); ++i) - { - if (layer.GetOutputSlot(i).IsTensorInfoSet()) - { - RecordTensorType(profile, layer.GetOutputSlot(i).GetTensorInfo()); - } - } -} - -Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16) -{ - Sme2ShapeProfile profile; - profile.m_HasFp16 = reduceFp32ToFp16; - - for (const Layer* layer : graph) - { - switch (layer->GetType()) - { - case LayerType::Convolution2d: - RecordConvolution2d(profile, *layer); - break; - case LayerType::FullyConnected: - RecordFullyConnected(profile, *layer); - break; - case LayerType::BatchMatMul: - RecordBatchMatMul(profile, *layer); - break; - case LayerType::DepthwiseConvolution2d: - RecordDepthwiseConvolution2d(profile, *layer); - break; - default: - for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i) - { - if (layer->GetInputSlot(i).IsTensorInfoSet()) - { - RecordTensorType(profile, layer->GetInputSlot(i).GetTensorInfo()); - } - } - for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i) - { - if (layer->GetOutputSlot(i).IsTensorInfoSet()) - { - RecordTensorType(profile, layer->GetOutputSlot(i).GetTensorInfo()); - } - } - break; - } - } - - return profile; -} - -unsigned int CapWorkerCount(unsigned int workers, unsigned int cap) -{ - if (workers == 0 || cap == 0 || cap >= workers) - { - return workers; - } - return cap; -} - -unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions) -{ - unsigned int 
numberOfThreads = 0; - ParseOptions(modelOptions, "CpuAcc", [&](std::string name, const BackendOptions::Var& value) - { - if (name == "NumberOfThreads") - { - if (value.IsUnsignedInt()) - { - numberOfThreads = value.AsUnsignedInt(); - } - else if (value.IsInt() && value.AsInt() > 0) - { - numberOfThreads = static_cast(value.AsInt()); - } - } - }); - return numberOfThreads; -} - -bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile) -{ - const bool isFloatOnly = !profile.m_HasFp16 && !profile.m_HasQuantized; - if (!isFloatOnly) - { - return false; - } - - const bool hasHeavySpatialConvolution = - profile.m_GemmMacs > 0 && - profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs && - !profile.m_HasSegmentationShape; - - const bool hasSmallDenseGraph = - profile.m_DepthwiseConvolution2dOps == 0 && - profile.m_SmallDenseProjectionOps >= 4 && - !profile.m_HasSmallMLargeNProjection; - - return profile.m_HasPoseShape || - profile.m_HasStyleTransferShape || - hasHeavySpatialConvolution || - hasSmallDenseGraph; -} - -bool ShouldDisableSme(const Sme2ShapeProfile& profile) -{ - if (profile.m_GemmLikeOps == 0) - { - return false; - } - - if (profile.m_HasFp16) - { - return true; - } - - if (profile.m_HasQuantized) - { - return !profile.m_HasSmallMLargeNProjection; - } - - return HasFloatSmeRegressionRisk(profile); -} - -unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads) -{ - if (!profile.m_HasQuantized || ShouldDisableSme(profile)) - { - return requestedThreads; - } - - if (profile.m_GemmLikeOps == 0) - { - return CapWorkerCount(requestedThreads, 1); - } - - if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape) - { - return requestedThreads; - } - - if (profile.m_HasPoseShape) - { - return CapWorkerCount(requestedThreads, 4); - } - - return CapWorkerCount(requestedThreads, 1); -} - -void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions) -{ - const 
Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16); - const bool smeEnabled = !ShouldDisableSme(profile); - const bool sveEnabled = smeEnabled || profile.m_HasQuantized; - const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions); - const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads); - - modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}, {"SveEnabled", sveEnabled}})); - if (selectedThreads != requestedThreads) - { - modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}})); - } -} - -} // namespace INetwork::INetwork(NetworkOptions networkOptions) : pNetworkImpl(new NetworkImpl(networkOptions)) {} diff --git a/src/armnn/Sme2ShapePolicy.cpp b/src/armnn/Sme2ShapePolicy.cpp new file mode 100644 index 0000000000..a94ac733e2 --- /dev/null +++ b/src/armnn/Sme2ShapePolicy.cpp @@ -0,0 +1,453 @@ +// +// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "Sme2ShapePolicy.hpp" + +#include "Graph.hpp" +#include "Layer.hpp" +#include "armnnUtils/DataLayoutIndexed.hpp" + +#include +#include + +#include +#include + +namespace armnn +{ +namespace +{ + +struct Sme2ShapeProfile +{ + unsigned int m_GemmLikeOps = 0; + unsigned int m_DepthwiseConvolution2dOps = 0; + unsigned int m_SmallDenseProjectionOps = 0; + int64_t m_GemmMacs = 0; + int64_t m_NonPointwiseGemmMacs = 0; + bool m_HasFp16 = false; + bool m_HasQuantized = false; + bool m_HasSegmentationShape = false; + bool m_HasStyleTransferShape = false; + bool m_HasPoseShape = false; + bool m_HasSmallMLargeNProjection = false; +}; + +bool IsQuantizedDataType(DataType dataType) +{ + switch (dataType) + { + case DataType::QAsymmU8: + case DataType::QAsymmS8: + case DataType::QSymmS8: + case DataType::QSymmS16: + return true; + default: + return false; + } +} + +void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo) +{ + const DataType dataType = tensorInfo.GetDataType(); + profile.m_HasFp16 |= dataType == DataType::Float16; + profile.m_HasQuantized |= IsQuantizedDataType(dataType); +} + +bool HasSpecifiedShape(const TensorInfo& tensorInfo) +{ + const TensorShape& shape = tensorInfo.GetShape(); + return shape.GetDimensionality() == Dimensionality::Specified && + shape.AreAllDimensionsSpecified(); +} + +int64_t NumElements(const TensorShape& shape) +{ + if (shape.GetDimensionality() != Dimensionality::Specified || + !shape.AreAllDimensionsSpecified()) + { + return 0; + } + + int64_t elements = 1; + for (unsigned int i = 0; i < shape.GetNumDimensions(); ++i) + { + elements *= static_cast(std::max(shape[i], 1U)); + } + return elements; +} + +int64_t Dimension(const TensorShape& shape, unsigned int index) +{ + if (shape.GetDimensionality() != Dimensionality::Specified || + !shape.AreAllDimensionsSpecified() || + index >= shape.GetNumDimensions()) + { + return 0; + } + return static_cast(shape[index]); +} + 
+int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset) +{ + if (offset == 0 || shape.GetNumDimensions() < offset) + { + return 0; + } + return Dimension(shape, shape.GetNumDimensions() - offset); +} + +void RecordGemmShape(Sme2ShapeProfile& profile, + int64_t m, + int64_t n, + int64_t k, + int64_t kernelH, + int64_t kernelW, + bool isDenseProjection) +{ + if (m <= 0 || n <= 0 || k <= 0) + { + return; + } + + ++profile.m_GemmLikeOps; + + const bool is1x1 = kernelH == 1 && kernelW == 1; + const int64_t macs = m * n * k; + profile.m_GemmMacs += macs; + if (!is1x1) + { + profile.m_NonPointwiseGemmMacs += macs; + } + if (isDenseProjection && is1x1 && m <= 256 && n <= 1024 && k <= 1024) + { + ++profile.m_SmallDenseProjectionOps; + } + + if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900))) + { + profile.m_HasSegmentationShape = true; + } + + if (!is1x1 && ((m >= 25000 && m <= 30000 && n == 64 && k >= 2000) || + (m >= 60000 && m <= 75000 && n == 32 && k >= 500))) + { + profile.m_HasStyleTransferShape = true; + } + + if (!is1x1 && m >= 100000 && n >= 64 && k >= 500) + { + profile.m_HasPoseShape = true; + } + + if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024) + { + profile.m_HasSmallMLargeNProjection = true; + } +} + +void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& filterInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, inputInfo); + RecordTensorType(profile, filterInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(filterInfo) || !HasSpecifiedShape(outputInfo)) + { + 
return; + } + + const Convolution2dDescriptor& descriptor = + static_cast(layer.GetParameters()); + const TensorShape& filterShape = filterInfo.GetShape(); + const TensorShape& outputShape = outputInfo.GetShape(); + const armnnUtils::DataLayoutIndexed dataLayoutIndex(descriptor.m_DataLayout); + + if (filterShape.GetNumDimensions() != 4 || outputShape.GetNumDimensions() != 4) + { + return; + } + + const int64_t n = Dimension(filterShape, 0); + const int64_t kernelH = Dimension(filterShape, dataLayoutIndex.GetHeightIndex()); + const int64_t kernelW = Dimension(filterShape, dataLayoutIndex.GetWidthIndex()); + const int64_t filterElements = NumElements(filterShape); + const int64_t inputChannels = n > 0 && kernelH > 0 && kernelW > 0 ? + filterElements / (n * kernelH * kernelW) : 0; + const int64_t k = kernelH * kernelW * inputChannels; + const int64_t outputElements = NumElements(outputShape); + const int64_t m = n > 0 ? outputElements / n : 0; + + RecordGemmShape(profile, m, n, k, kernelH, kernelW, false); +} + +void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& weightsInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, inputInfo); + RecordTensorType(profile, weightsInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(inputInfo) || !HasSpecifiedShape(weightsInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const TensorShape& weightsShape = weightsInfo.GetShape(); + if (weightsShape.GetNumDimensions() < 2) + { + return; + } + + const FullyConnectedDescriptor& descriptor = + 
static_cast(layer.GetParameters()); + const unsigned int nIndex = descriptor.m_TransposeWeightMatrix ? 0U : 1U; + const unsigned int kIndex = descriptor.m_TransposeWeightMatrix ? 1U : 0U; + const int64_t n = Dimension(weightsShape, nIndex); + const int64_t k = Dimension(weightsShape, kIndex); + const int64_t outputElements = NumElements(outputInfo.GetShape()); + const int64_t m = n > 0 ? outputElements / n : 0; + + RecordGemmShape(profile, m, n, k, 1, 1, true); +} + +void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, lhsInfo); + RecordTensorType(profile, rhsInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const TensorShape& lhsShape = lhsInfo.GetShape(); + const TensorShape& outputShape = outputInfo.GetShape(); + const int64_t n = DimensionFromEnd(outputShape, 1); + const int64_t m = n > 0 ? 
NumElements(outputShape) / n : 0; + int64_t k = DimensionFromEnd(lhsShape, 1); + if (k == n) + { + k = DimensionFromEnd(lhsShape, 2); + } + + RecordGemmShape(profile, m, n, k, 1, 1, true); +} + +void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) +{ + ++profile.m_DepthwiseConvolution2dOps; + + for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) + { + if (layer.GetInputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer.GetInputSlot(i).GetTensorInfo()); + } + } + for (unsigned int i = 0; i < layer.GetNumOutputSlots(); ++i) + { + if (layer.GetOutputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer.GetOutputSlot(i).GetTensorInfo()); + } + } +} + +Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16) +{ + Sme2ShapeProfile profile; + profile.m_HasFp16 = reduceFp32ToFp16; + + for (const Layer* layer : graph) + { + switch (layer->GetType()) + { + case LayerType::Convolution2d: + RecordConvolution2d(profile, *layer); + break; + case LayerType::FullyConnected: + RecordFullyConnected(profile, *layer); + break; + case LayerType::BatchMatMul: + RecordBatchMatMul(profile, *layer); + break; + case LayerType::DepthwiseConvolution2d: + RecordDepthwiseConvolution2d(profile, *layer); + break; + default: + for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i) + { + if (layer->GetInputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer->GetInputSlot(i).GetTensorInfo()); + } + } + for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i) + { + if (layer->GetOutputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer->GetOutputSlot(i).GetTensorInfo()); + } + } + break; + } + } + + return profile; +} + +unsigned int CapWorkerCount(unsigned int workers, unsigned int cap) +{ + if (workers == 0 || cap == 0 || cap >= workers) + { + return workers; + } + return cap; +} + +unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions) +{ + unsigned int 
numberOfThreads = 0; + ParseOptions(modelOptions, "CpuAcc", [&](std::string name, const BackendOptions::Var& value) + { + if (name == "NumberOfThreads") + { + if (value.IsUnsignedInt()) + { + numberOfThreads = value.AsUnsignedInt(); + } + else if (value.IsInt() && value.AsInt() > 0) + { + numberOfThreads = static_cast(value.AsInt()); + } + } + }); + return numberOfThreads; +} + +bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile) +{ + const bool isFloatOnly = !profile.m_HasFp16 && !profile.m_HasQuantized; + if (!isFloatOnly) + { + return false; + } + + const bool hasHeavySpatialConvolution = + profile.m_GemmMacs > 0 && + profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs && + !profile.m_HasSegmentationShape; + + const bool hasSmallDenseGraph = + profile.m_DepthwiseConvolution2dOps == 0 && + profile.m_SmallDenseProjectionOps >= 4 && + !profile.m_HasSmallMLargeNProjection; + + return profile.m_HasPoseShape || + profile.m_HasStyleTransferShape || + hasHeavySpatialConvolution || + hasSmallDenseGraph; +} + +bool ShouldDisableSme(const Sme2ShapeProfile& profile) +{ + if (profile.m_GemmLikeOps == 0) + { + return false; + } + + if (profile.m_HasFp16) + { + return true; + } + + if (profile.m_HasQuantized) + { + return !profile.m_HasSmallMLargeNProjection; + } + + return HasFloatSmeRegressionRisk(profile); +} + +unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads) +{ + if (!profile.m_HasQuantized || ShouldDisableSme(profile)) + { + return requestedThreads; + } + + if (profile.m_GemmLikeOps == 0) + { + return CapWorkerCount(requestedThreads, 1); + } + + if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape) + { + return requestedThreads; + } + + if (profile.m_HasPoseShape) + { + return CapWorkerCount(requestedThreads, 4); + } + + return CapWorkerCount(requestedThreads, 1); +} + +} // namespace + +void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions) +{ + 
const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16); + const bool smeEnabled = !ShouldDisableSme(profile); + const bool sveEnabled = smeEnabled || profile.m_HasQuantized; + const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions); + const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads); + + modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}, {"SveEnabled", sveEnabled}})); + if (selectedThreads != requestedThreads) + { + modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}})); + } +} + +} // namespace armnn diff --git a/src/armnn/Sme2ShapePolicy.hpp b/src/armnn/Sme2ShapePolicy.hpp new file mode 100644 index 0000000000..3e2450f5a0 --- /dev/null +++ b/src/armnn/Sme2ShapePolicy.hpp @@ -0,0 +1,17 @@ +// +// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include + +namespace armnn +{ + +class Graph; + +void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions); + +} // namespace armnn diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp index 20b44795dd..f675d2c28e 100644 --- a/src/backends/neon/NeonBackendModelContext.cpp +++ b/src/backends/neon/NeonBackendModelContext.cpp @@ -59,8 +59,7 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption }); } - arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled); - arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled); + ApplyAclIsaPolicy(); } bool NeonBackendModelContext::IsFastMathEnabled() const @@ -73,14 +72,10 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const return m_NumberOfThreads; } -bool NeonBackendModelContext::IsSveEnabled() const +void NeonBackendModelContext::ApplyAclIsaPolicy() const { - return m_IsSveEnabled; -} - -bool 
NeonBackendModelContext::IsSmeEnabled() const -{ - return m_IsSmeEnabled; + arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled); + arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled); } } // namespace armnn diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp index 60d3157471..2959bd60ba 100644 --- a/src/backends/neon/NeonBackendModelContext.hpp +++ b/src/backends/neon/NeonBackendModelContext.hpp @@ -29,9 +29,7 @@ class NeonBackendModelContext : public IBackendModelContext unsigned int GetNumberOfThreads() const; - bool IsSveEnabled() const; - - bool IsSmeEnabled() const; + void ApplyAclIsaPolicy() const; private: bool m_IsFastMathEnabled; diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index e90fe60488..dd40073ee7 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -20,8 +20,6 @@ #include #include -#include - #include #include @@ -65,8 +63,7 @@ void NeonWorkloadFactory::SetNumberOfThreads() auto modelOptions = dynamic_cast(m_ModelContextPtr.get()); auto numberOfThreads = modelOptions->GetNumberOfThreads(); - arm_compute::CPUInfo::get().set_sve_allowed(modelOptions->IsSveEnabled()); - arm_compute::CPUInfo::get().set_sme_allowed(modelOptions->IsSmeEnabled()); + modelOptions->ApplyAclIsaPolicy(); if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS) { From a66749922ead11b45da60b8639610cfa53cd849a Mon Sep 17 00:00:00 2001 From: Damien Dooley Date: Fri, 12 Jun 2026 12:28:22 +0100 Subject: [PATCH 4/4] Widened M band for heuristic --- src/armnn/Network.cpp | 8 ++++++-- src/armnn/Sme2ShapePolicy.cpp | 8 +++++++- src/backends/neon/NeonBackendModelContext.cpp | 2 +- src/backends/neon/NeonBackendModelContext.hpp | 2 +- src/backends/neon/NeonWorkloadFactory.cpp | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/armnn/Network.cpp 
b/src/armnn/Network.cpp index f83be3b864..0f97d385df 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -2096,8 +2097,11 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph, optGraph.InferTensorInfos(); } - ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions); - optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions; + if (std::count(backendPreferences.begin(), backendPreferences.end(), armnn::Compute::CpuAcc) > 0) + { + ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions); + optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions; + } // Initialize backend settings BackendSettings backendSettings(backendPreferences, deviceSpec); diff --git a/src/armnn/Sme2ShapePolicy.cpp b/src/armnn/Sme2ShapePolicy.cpp index a94ac733e2..333d00653a 100644 --- a/src/armnn/Sme2ShapePolicy.cpp +++ b/src/armnn/Sme2ShapePolicy.cpp @@ -126,7 +126,8 @@ void RecordGemmShape(Sme2ShapeProfile& profile, ++profile.m_SmallDenseProjectionOps; } - if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900))) + const bool hasModerateSpatialM = m >= 2048 && m <= 2560; + if (is1x1 && hasModerateSpatialM && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900))) { profile.m_HasSegmentationShape = true; } @@ -438,6 +439,11 @@ unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions) { const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16); + if (profile.m_GemmLikeOps == 0) + { + return; + } + const bool smeEnabled = !ShouldDisableSme(profile); const bool sveEnabled = smeEnabled || profile.m_HasQuantized; const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions); diff --git a/src/backends/neon/NeonBackendModelContext.cpp 
b/src/backends/neon/NeonBackendModelContext.cpp index f675d2c28e..63724b5178 100644 --- a/src/backends/neon/NeonBackendModelContext.cpp +++ b/src/backends/neon/NeonBackendModelContext.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2020, 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp index 2959bd60ba..d2c1b5323e 100644 --- a/src/backends/neon/NeonBackendModelContext.hpp +++ b/src/backends/neon/NeonBackendModelContext.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2020, 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index dd40073ee7..afde9cef80 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2024, 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT //