diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ca3bb69d5..08ed3ea135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,6 +385,8 @@ list(APPEND armnn_sources src/armnn/Logging.cpp src/armnn/Network.cpp src/armnn/Network.hpp + src/armnn/Sme2ShapePolicy.cpp + src/armnn/Sme2ShapePolicy.hpp src/armnn/NetworkUtils.cpp src/armnn/NetworkUtils.hpp src/armnn/Observable.cpp diff --git a/delegate/cmake/Modules/FindTfLite.cmake b/delegate/cmake/Modules/FindTfLite.cmake index bdb4df7ce0..d5c0e53606 100644 --- a/delegate/cmake/Modules/FindTfLite.cmake +++ b/delegate/cmake/Modules/FindTfLite.cmake @@ -150,7 +150,7 @@ if (TfLite_LIB MATCHES .a$) TfLite_ruy_prepare_packed_matrices_LIB TfLite_ruy_system_aligned_alloc_LIB TfLite_ruy_threadpool_LIB TfLite_ruy_trmul_LIB TfLite_ruy_tune_LIB TfLite_ruy_wait_LIB TfLite_ruy_profiler_LIB TfLite_cpuinfo_LIB TfLite_abseil_synchronization_LIB TfLite_abseil_graphCycle_internal_LIB TfLite_abseil_raw_logging_internal_LIB - TfLite_abseil_kernel_timeout_LIB TfLite_abseil_internal_strings_LIB) + TfLite_abseil_internal_strings_LIB) # Set external variables for usage in CMakeLists.txt if (TFLITE_FOUND) # WARNING! The order of these libraries is critical. Moving them @@ -166,7 +166,10 @@ if (TfLite_LIB MATCHES .a$) ${TfLite_ruy_prepare_packed_matrices_LIB} ${TfLite_ruy_system_aligned_alloc_LIB} ${TfLite_ruy_tune_LIB} ${TfLite_ruy_wait_LIB} ${TfLite_ruy_profiler_LIB} ${TfLite_cpuinfo_LIB} ${TfLite_abseil_synchronization_LIB} ${TfLite_abseil_graphCycle_internal_LIB} - ${TfLite_abseil_raw_logging_internal_LIB} ${TfLite_abseil_kernel_timeout_LIB} ${TfLite_abseil_internal_strings_LIB}) + ${TfLite_abseil_raw_logging_internal_LIB} ${TfLite_abseil_internal_strings_LIB}) + if(TfLite_abseil_kernel_timeout_LIB) + list(APPEND TfLite_LIB ${TfLite_abseil_kernel_timeout_LIB}) + endif() endif () elseif (TfLite_LIB MATCHES .so$) message("-- Dynamic tensorflow lite library found, using for ArmNN build") diff --git a/include/armnnUtils/QuantizeHelper.hpp b/include/armnnUtils/QuantizeHelper.hpp index 231b8411cb..5868d13fc8 100644 --- a/include/armnnUtils/QuantizeHelper.hpp +++ b/include/armnnUtils/QuantizeHelper.hpp @@ -96,7 +96,7 @@ float SelectiveDequantize(T value, float scale, int32_t offset) template struct IsFloatingPointIterator { - static constexpr bool value=std::is_floating_point::value_type>::value; + static constexpr bool value=armnn::IsFloatingPoint::value_type>::value; }; template 0) + { + ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions); + optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions; + } + // Initialize backend settings BackendSettings backendSettings(backendPreferences, deviceSpec); auto availablePreferredBackends = backendSettings.GetAvailablePreferredBackends(); @@ -2207,7 +2214,7 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph, OptimizationResult backendOptimizationResult = ApplyBackendOptimizations(optNetObjPtr->pOptimizedNetworkImpl.get(), backendSettings, backends, - options.GetModelOptions(), + optimizedOptions, messages); if (backendOptimizationResult.m_Error) { diff --git a/src/armnn/Sme2ShapePolicy.cpp b/src/armnn/Sme2ShapePolicy.cpp new file mode 100644 index 0000000000..333d00653a --- /dev/null +++ b/src/armnn/Sme2ShapePolicy.cpp @@ -0,0 +1,459 @@ +// +// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "Sme2ShapePolicy.hpp" + +#include "Graph.hpp" +#include "Layer.hpp" +#include "armnnUtils/DataLayoutIndexed.hpp" + +#include +#include + +#include +#include + +namespace armnn +{ +namespace +{ + +struct Sme2ShapeProfile +{ + unsigned int m_GemmLikeOps = 0; + unsigned int m_DepthwiseConvolution2dOps = 0; + unsigned int m_SmallDenseProjectionOps = 0; + int64_t m_GemmMacs = 0; + int64_t m_NonPointwiseGemmMacs = 0; + bool m_HasFp16 = false; + bool m_HasQuantized = false; + bool m_HasSegmentationShape = false; + bool m_HasStyleTransferShape = false; + bool m_HasPoseShape = false; + bool m_HasSmallMLargeNProjection = false; +}; + +bool IsQuantizedDataType(DataType dataType) +{ + switch (dataType) + { + case DataType::QAsymmU8: + case DataType::QAsymmS8: + case DataType::QSymmS8: + case DataType::QSymmS16: + return true; + default: + return false; + } +} + +void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo) +{ + const DataType dataType = tensorInfo.GetDataType(); + profile.m_HasFp16 |= dataType == DataType::Float16; + profile.m_HasQuantized |= IsQuantizedDataType(dataType); +} + +bool HasSpecifiedShape(const TensorInfo& tensorInfo) +{ + const TensorShape& shape = tensorInfo.GetShape(); + return shape.GetDimensionality() == Dimensionality::Specified && + shape.AreAllDimensionsSpecified(); +} + +int64_t NumElements(const TensorShape& shape) +{ + if (shape.GetDimensionality() != Dimensionality::Specified || + !shape.AreAllDimensionsSpecified()) + { + return 0; + } + + int64_t elements = 1; + for (unsigned int i = 0; i < shape.GetNumDimensions(); ++i) + { + elements *= static_cast(std::max(shape[i], 1U)); + } + return elements; +} + +int64_t Dimension(const TensorShape& shape, unsigned int index) +{ + if (shape.GetDimensionality() != Dimensionality::Specified || + !shape.AreAllDimensionsSpecified() || + index >= shape.GetNumDimensions()) + { + return 0; + } + return static_cast(shape[index]); +} + +int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset) +{ + if (offset == 0 || shape.GetNumDimensions() < offset) + { + return 0; + } + return Dimension(shape, shape.GetNumDimensions() - offset); +} + +void RecordGemmShape(Sme2ShapeProfile& profile, + int64_t m, + int64_t n, + int64_t k, + int64_t kernelH, + int64_t kernelW, + bool isDenseProjection) +{ + if (m <= 0 || n <= 0 || k <= 0) + { + return; + } + + ++profile.m_GemmLikeOps; + + const bool is1x1 = kernelH == 1 && kernelW == 1; + const int64_t macs = m * n * k; + profile.m_GemmMacs += macs; + if (!is1x1) + { + profile.m_NonPointwiseGemmMacs += macs; + } + if (isDenseProjection && is1x1 && m <= 256 && n <= 1024 && k <= 1024) + { + ++profile.m_SmallDenseProjectionOps; + } + + const bool hasModerateSpatialM = m >= 2048 && m <= 2560; + if (is1x1 && hasModerateSpatialM && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900))) + { + profile.m_HasSegmentationShape = true; + } + + if (!is1x1 && ((m >= 25000 && m <= 30000 && n == 64 && k >= 2000) || + (m >= 60000 && m <= 75000 && n == 32 && k >= 500))) + { + profile.m_HasStyleTransferShape = true; + } + + if (!is1x1 && m >= 100000 && n >= 64 && k >= 500) + { + profile.m_HasPoseShape = true; + } + + if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024) + { + profile.m_HasSmallMLargeNProjection = true; + } +} + +void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& filterInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, inputInfo); + RecordTensorType(profile, filterInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(filterInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const Convolution2dDescriptor& descriptor = + static_cast(layer.GetParameters()); + const TensorShape& filterShape = filterInfo.GetShape(); + const TensorShape& outputShape = outputInfo.GetShape(); + const armnnUtils::DataLayoutIndexed dataLayoutIndex(descriptor.m_DataLayout); + + if (filterShape.GetNumDimensions() != 4 || outputShape.GetNumDimensions() != 4) + { + return; + } + + const int64_t n = Dimension(filterShape, 0); + const int64_t kernelH = Dimension(filterShape, dataLayoutIndex.GetHeightIndex()); + const int64_t kernelW = Dimension(filterShape, dataLayoutIndex.GetWidthIndex()); + const int64_t filterElements = NumElements(filterShape); + const int64_t inputChannels = n > 0 && kernelH > 0 && kernelW > 0 ? + filterElements / (n * kernelH * kernelW) : 0; + const int64_t k = kernelH * kernelW * inputChannels; + const int64_t outputElements = NumElements(outputShape); + const int64_t m = n > 0 ? outputElements / n : 0; + + RecordGemmShape(profile, m, n, k, kernelH, kernelW, false); +} + +void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& weightsInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, inputInfo); + RecordTensorType(profile, weightsInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(inputInfo) || !HasSpecifiedShape(weightsInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const TensorShape& weightsShape = weightsInfo.GetShape(); + if (weightsShape.GetNumDimensions() < 2) + { + return; + } + + const FullyConnectedDescriptor& descriptor = + static_cast(layer.GetParameters()); + const unsigned int nIndex = descriptor.m_TransposeWeightMatrix ? 0U : 1U; + const unsigned int kIndex = descriptor.m_TransposeWeightMatrix ? 1U : 0U; + const int64_t n = Dimension(weightsShape, nIndex); + const int64_t k = Dimension(weightsShape, kIndex); + const int64_t outputElements = NumElements(outputInfo.GetShape()); + const int64_t m = n > 0 ? outputElements / n : 0; + + RecordGemmShape(profile, m, n, k, 1, 1, true); +} + +void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer) +{ + if (layer.GetNumInputSlots() < 2 || + layer.GetNumOutputSlots() == 0 || + !layer.GetInputSlot(0).IsTensorInfoSet() || + !layer.GetInputSlot(1).IsTensorInfoSet() || + !layer.GetOutputSlot(0).IsTensorInfoSet()) + { + return; + } + + const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo(); + const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo(); + const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo(); + RecordTensorType(profile, lhsInfo); + RecordTensorType(profile, rhsInfo); + RecordTensorType(profile, outputInfo); + + if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo)) + { + return; + } + + const TensorShape& lhsShape = lhsInfo.GetShape(); + const TensorShape& outputShape = outputInfo.GetShape(); + const int64_t n = DimensionFromEnd(outputShape, 1); + const int64_t m = n > 0 ? NumElements(outputShape) / n : 0; + int64_t k = DimensionFromEnd(lhsShape, 1); + if (k == n) + { + k = DimensionFromEnd(lhsShape, 2); + } + + RecordGemmShape(profile, m, n, k, 1, 1, true); +} + +void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer) +{ + ++profile.m_DepthwiseConvolution2dOps; + + for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) + { + if (layer.GetInputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer.GetInputSlot(i).GetTensorInfo()); + } + } + for (unsigned int i = 0; i < layer.GetNumOutputSlots(); ++i) + { + if (layer.GetOutputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer.GetOutputSlot(i).GetTensorInfo()); + } + } +} + +Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16) +{ + Sme2ShapeProfile profile; + profile.m_HasFp16 = reduceFp32ToFp16; + + for (const Layer* layer : graph) + { + switch (layer->GetType()) + { + case LayerType::Convolution2d: + RecordConvolution2d(profile, *layer); + break; + case LayerType::FullyConnected: + RecordFullyConnected(profile, *layer); + break; + case LayerType::BatchMatMul: + RecordBatchMatMul(profile, *layer); + break; + case LayerType::DepthwiseConvolution2d: + RecordDepthwiseConvolution2d(profile, *layer); + break; + default: + for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i) + { + if (layer->GetInputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer->GetInputSlot(i).GetTensorInfo()); + } + } + for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i) + { + if (layer->GetOutputSlot(i).IsTensorInfoSet()) + { + RecordTensorType(profile, layer->GetOutputSlot(i).GetTensorInfo()); + } + } + break; + } + } + + return profile; +} + +unsigned int CapWorkerCount(unsigned int workers, unsigned int cap) +{ + if (workers == 0 || cap == 0 || cap >= workers) + { + return workers; + } + return cap; +} + +unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions) +{ + unsigned int numberOfThreads = 0; + ParseOptions(modelOptions, "CpuAcc", [&](std::string name, const BackendOptions::Var& value) + { + if (name == "NumberOfThreads") + { + if (value.IsUnsignedInt()) + { + numberOfThreads = value.AsUnsignedInt(); + } + else if (value.IsInt() && value.AsInt() > 0) + { + numberOfThreads = static_cast(value.AsInt()); + } + } + }); + return numberOfThreads; +} + +bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile) +{ + const bool isFloatOnly = !profile.m_HasFp16 && !profile.m_HasQuantized; + if (!isFloatOnly) + { + return false; + } + + const bool hasHeavySpatialConvolution = + profile.m_GemmMacs > 0 && + profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs && + !profile.m_HasSegmentationShape; + + const bool hasSmallDenseGraph = + profile.m_DepthwiseConvolution2dOps == 0 && + profile.m_SmallDenseProjectionOps >= 4 && + !profile.m_HasSmallMLargeNProjection; + + return profile.m_HasPoseShape || + profile.m_HasStyleTransferShape || + hasHeavySpatialConvolution || + hasSmallDenseGraph; +} + +bool ShouldDisableSme(const Sme2ShapeProfile& profile) +{ + if (profile.m_GemmLikeOps == 0) + { + return false; + } + + if (profile.m_HasFp16) + { + return true; + } + + if (profile.m_HasQuantized) + { + return !profile.m_HasSmallMLargeNProjection; + } + + return HasFloatSmeRegressionRisk(profile); +} + +unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads) +{ + if (!profile.m_HasQuantized || ShouldDisableSme(profile)) + { + return requestedThreads; + } + + if (profile.m_GemmLikeOps == 0) + { + return CapWorkerCount(requestedThreads, 1); + } + + if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape) + { + return requestedThreads; + } + + if (profile.m_HasPoseShape) + { + return CapWorkerCount(requestedThreads, 4); + } + + return CapWorkerCount(requestedThreads, 1); +} + +} // namespace + +void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions) +{ + const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16); + if (profile.m_GemmLikeOps == 0) + { + return; + } + + const bool smeEnabled = !ShouldDisableSme(profile); + const bool sveEnabled = smeEnabled || profile.m_HasQuantized; + const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions); + const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads); + + modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}, {"SveEnabled", sveEnabled}})); + if (selectedThreads != requestedThreads) + { + modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}})); + } +} + +} // namespace armnn diff --git a/src/armnn/Sme2ShapePolicy.hpp b/src/armnn/Sme2ShapePolicy.hpp new file mode 100644 index 0000000000..3e2450f5a0 --- /dev/null +++ b/src/armnn/Sme2ShapePolicy.hpp @@ -0,0 +1,17 @@ +// +// Copyright © 2026 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include + +namespace armnn +{ + +class Graph; + +void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions); + +} // namespace armnn diff --git a/src/armnnUtils/Half.hpp b/src/armnnUtils/Half.hpp index e39968f54d..ce867725dd 100644 --- a/src/armnnUtils/Half.hpp +++ b/src/armnnUtils/Half.hpp @@ -20,25 +20,15 @@ namespace armnn { using Half = half_float::half; //import half float implementation -} //namespace armnn - - -namespace std -{ -template<> -struct is_floating_point - : integral_constant< bool, true > +template +struct IsArmnnHalf + : std::is_same::type, Half> {}; -template<> -struct is_floating_point - : integral_constant< bool, true > +template +struct IsFloatingPoint + : std::integral_constant::value || IsArmnnHalf::value> {}; -template<> -struct is_floating_point - : integral_constant< bool, true > -{}; - -} //namespace std +} //namespace armnn diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp index 9cd9d18d47..feb19e79e7 100644 --- a/src/backends/neon/NeonBackend.cpp +++ b/src/backends/neon/NeonBackend.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017-2025 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -134,11 +134,7 @@ IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport() const IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport(const ModelOptions& modelOptions) const { - static ILayerSupportSharedPtr layerSupport - { - new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions)) - }; - return layerSupport; + return ILayerSupportSharedPtr{new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions))}; } OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph, diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp index 270592e94d..63724b5178 100644 --- a/src/backends/neon/NeonBackendModelContext.cpp +++ b/src/backends/neon/NeonBackendModelContext.cpp @@ -1,10 +1,12 @@ // -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2020, 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #include "NeonBackendModelContext.hpp" +#include + namespace { @@ -32,7 +34,7 @@ namespace armnn { NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOptions) - : m_IsFastMathEnabled(false), m_NumberOfThreads(0) + : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSveEnabled(true), m_IsSmeEnabled(true) { if (!modelOptions.empty()) { @@ -40,14 +42,24 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption { if (name == "FastMathEnabled") { - m_IsFastMathEnabled |= ParseBool(value, false); + m_IsFastMathEnabled = ParseBool(value, m_IsFastMathEnabled); } if (name == "NumberOfThreads") { - m_NumberOfThreads |= ParseUnsignedInt(value, 0); + m_NumberOfThreads = ParseUnsignedInt(value, m_NumberOfThreads); + } + if (name == "SmeEnabled") + { + m_IsSmeEnabled = ParseBool(value, m_IsSmeEnabled); + } + if (name == "SveEnabled") + { + m_IsSveEnabled = ParseBool(value, m_IsSveEnabled); } }); } + + ApplyAclIsaPolicy(); } bool NeonBackendModelContext::IsFastMathEnabled() const @@ -60,4 +72,10 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const return m_NumberOfThreads; } -} // namespace armnn \ No newline at end of file +void NeonBackendModelContext::ApplyAclIsaPolicy() const +{ + arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled); + arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled); +} + +} // namespace armnn diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp index e736efc1d8..d2c1b5323e 100644 --- a/src/backends/neon/NeonBackendModelContext.hpp +++ b/src/backends/neon/NeonBackendModelContext.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2020, 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -16,6 +16,10 @@ namespace armnn /// results with reduced or different precision. The fast_math flag will not have any effect on int8 performance. /// - "NumberOfThreads"\n /// Specify the number of threads used by the CpuAcc backend. +/// - "SveEnabled"\n +/// Specify whether SVE/SVE2 implementations may be selected by the CpuAcc backend. +/// - "SmeEnabled"\n +/// Specify whether SME/SME2 implementations may be selected by the CpuAcc backend. class NeonBackendModelContext : public IBackendModelContext { public: @@ -25,9 +29,13 @@ class NeonBackendModelContext : public IBackendModelContext unsigned int GetNumberOfThreads() const; + void ApplyAclIsaPolicy() const; + private: bool m_IsFastMathEnabled; unsigned int m_NumberOfThreads; + bool m_IsSveEnabled; + bool m_IsSmeEnabled; }; -} // namespace armnn \ No newline at end of file +} // namespace armnn diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp index f75f84d2d8..afde9cef80 100644 --- a/src/backends/neon/NeonWorkloadFactory.cpp +++ b/src/backends/neon/NeonWorkloadFactory.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017-2024 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2024, 2026 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -63,6 +63,8 @@ void NeonWorkloadFactory::SetNumberOfThreads() auto modelOptions = dynamic_cast(m_ModelContextPtr.get()); auto numberOfThreads = modelOptions->GetNumberOfThreads(); + modelOptions->ApplyAclIsaPolicy(); + if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS) { arm_compute::Scheduler::get().set_num_threads(numberOfThreads);