From 4f628a2a79052f37fab602f2ebe9676f27fda3af Mon Sep 17 00:00:00 2001
From: Damien Dooley <damien.dooley@arm.com>
Date: Thu, 4 Jun 2026 15:35:39 +0100
Subject: [PATCH 1/4] Add SME2 shape heuristic

Signed-off-by: Damien Dooley <damien.dooley@arm.com>
---
 src/armnn/Network.cpp                         | 369 +++++++++++++++++-
 src/backends/neon/NeonBackend.cpp             |   8 +-
 src/backends/neon/NeonBackendModelContext.cpp |  23 +-
 src/backends/neon/NeonBackendModelContext.hpp |   9 +-
 src/backends/neon/NeonWorkloadFactory.cpp     |   6 +-
 5 files changed, 399 insertions(+), 16 deletions(-)
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index e81674bf29..48276cd9e8 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017-2025 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -14,6 +14,8 @@
 #include "armnnUtils/Filesystem.hpp"
 #include "armnn/utility/Timer.hpp"
 
+#include <armnnUtils/DataLayoutIndexed.hpp>
+
 #include <armnn/backends/TensorHandle.hpp>
 #include <armnn/backends/WorkloadFactory.hpp>
 #include <armnn/backends/IBackendInternal.hpp>
@@ -35,12 +37,372 @@
 
 #include <fcntl.h>
 #include <algorithm>
+#include <cstdint>
 #include <memory>
 #include <vector>
 #include <armnn/ArmNN.hpp>
 
 namespace armnn
 {
+namespace
+{
+
+struct Sme2ShapeProfile
+{
+    unsigned int m_GemmLikeOps = 0;
+    bool m_HasFp16 = false;
+    bool m_HasQuantized = false;
+    bool m_HasSegmentationShape = false;
+    bool m_HasStyleTransferShape = false;
+    bool m_HasPoseShape = false;
+    bool m_HasSmallMLargeNProjection = false;
+};
+
+bool IsQuantizedDataType(DataType dataType)
+{
+    switch (dataType)
+    {
+        case DataType::QAsymmU8:
+        case DataType::QAsymmS8:
+        case DataType::QSymmS8:
+        case DataType::QSymmS16:
+            return true;
+        default:
+            return false;
+    }
+}
+
+void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo)
+{
+    const DataType dataType = tensorInfo.GetDataType();
+    profile.m_HasFp16 |= dataType == DataType::Float16;
+    profile.m_HasQuantized |= IsQuantizedDataType(dataType);
+}
+
+bool HasSpecifiedShape(const TensorInfo& tensorInfo)
+{
+    const TensorShape& shape = tensorInfo.GetShape();
+    return shape.GetDimensionality() == Dimensionality::Specified &&
+           shape.AreAllDimensionsSpecified();
+}
+
+int64_t NumElements(const TensorShape& shape)
+{
+    if (shape.GetDimensionality() != Dimensionality::Specified ||
+        !shape.AreAllDimensionsSpecified())
+    {
+        return 0;
+    }
+
+    int64_t elements = 1;
+    for (unsigned int i = 0; i < shape.GetNumDimensions(); ++i)
+    {
+        elements *= static_cast<int64_t>(std::max(shape[i], 1U));
+    }
+    return elements;
+}
+
+int64_t Dimension(const TensorShape& shape, unsigned int index)
+{
+    if (shape.GetDimensionality() != Dimensionality::Specified ||
+        !shape.AreAllDimensionsSpecified() ||
+        index >= shape.GetNumDimensions())
+    {
+        return 0;
+    }
+    return static_cast<int64_t>(shape[index]);
+}
+
+int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset)
+{
+    if (offset == 0 || shape.GetNumDimensions() < offset)
+    {
+        return 0;
+    }
+    return Dimension(shape, shape.GetNumDimensions() - offset);
+}
+
+void RecordGemmShape(Sme2ShapeProfile& profile,
+                     int64_t m,
+                     int64_t n,
+                     int64_t k,
+                     int64_t kernelH,
+                     int64_t kernelW)
+{
+    if (m <= 0 || n <= 0 || k <= 0)
+    {
+        return;
+    }
+
+    ++profile.m_GemmLikeOps;
+
+    const bool is1x1 = kernelH == 1 && kernelW == 1;
+
+    if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900)))
+    {
+        profile.m_HasSegmentationShape = true;
+    }
+
+    if (!is1x1 && ((m >= 25000 && m <= 30000 && n == 64 && k >= 2000) ||
+                   (m >= 60000 && m <= 75000 && n == 32 && k >= 500)))
+    {
+        profile.m_HasStyleTransferShape = true;
+    }
+
+    if (!is1x1 && m >= 100000 && n >= 64 && k >= 500)
+    {
+        profile.m_HasPoseShape = true;
+    }
+
+    if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024)
+    {
+        profile.m_HasSmallMLargeNProjection = true;
+    }
+}
+
+void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    if (layer.GetNumInputSlots() < 2 ||
+        layer.GetNumOutputSlots() == 0 ||
+        !layer.GetInputSlot(0).IsTensorInfoSet() ||
+        !layer.GetInputSlot(1).IsTensorInfoSet() ||
+        !layer.GetOutputSlot(0).IsTensorInfoSet())
+    {
+        return;
+    }
+
+    const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo();
+    const TensorInfo& filterInfo = layer.GetInputSlot(1).GetTensorInfo();
+    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
+    RecordTensorType(profile, inputInfo);
+    RecordTensorType(profile, filterInfo);
+    RecordTensorType(profile, outputInfo);
+
+    if (!HasSpecifiedShape(filterInfo) || !HasSpecifiedShape(outputInfo))
+    {
+        return;
+    }
+
+    const Convolution2dDescriptor& descriptor =
+        static_cast<const Convolution2dDescriptor&>(layer.GetParameters());
+    const TensorShape& filterShape = filterInfo.GetShape();
+    const TensorShape& outputShape = outputInfo.GetShape();
+    const armnnUtils::DataLayoutIndexed dataLayoutIndex(descriptor.m_DataLayout);
+
+    if (filterShape.GetNumDimensions() != 4 || outputShape.GetNumDimensions() != 4)
+    {
+        return;
+    }
+
+    const int64_t n = Dimension(filterShape, 0);
+    const int64_t kernelH = Dimension(filterShape, dataLayoutIndex.GetHeightIndex());
+    const int64_t kernelW = Dimension(filterShape, dataLayoutIndex.GetWidthIndex());
+    const int64_t filterElements = NumElements(filterShape);
+    const int64_t inputChannels = n > 0 && kernelH > 0 && kernelW > 0 ?
+        filterElements / (n * kernelH * kernelW) : 0;
+    const int64_t k = kernelH * kernelW * inputChannels;
+    const int64_t outputElements = NumElements(outputShape);
+    const int64_t m = n > 0 ? outputElements / n : 0;
+
+    RecordGemmShape(profile, m, n, k, kernelH, kernelW);
+}
+
+void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    if (layer.GetNumInputSlots() < 2 ||
+        layer.GetNumOutputSlots() == 0 ||
+        !layer.GetInputSlot(0).IsTensorInfoSet() ||
+        !layer.GetInputSlot(1).IsTensorInfoSet() ||
+        !layer.GetOutputSlot(0).IsTensorInfoSet())
+    {
+        return;
+    }
+
+    const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo();
+    const TensorInfo& weightsInfo = layer.GetInputSlot(1).GetTensorInfo();
+    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
+    RecordTensorType(profile, inputInfo);
+    RecordTensorType(profile, weightsInfo);
+    RecordTensorType(profile, outputInfo);
+
+    if (!HasSpecifiedShape(inputInfo) || !HasSpecifiedShape(weightsInfo) || !HasSpecifiedShape(outputInfo))
+    {
+        return;
+    }
+
+    const TensorShape& weightsShape = weightsInfo.GetShape();
+    if (weightsShape.GetNumDimensions() < 2)
+    {
+        return;
+    }
+
+    const FullyConnectedDescriptor& descriptor =
+        static_cast<const FullyConnectedDescriptor&>(layer.GetParameters());
+    const unsigned int nIndex = descriptor.m_TransposeWeightMatrix ? 0U : 1U;
+    const unsigned int kIndex = descriptor.m_TransposeWeightMatrix ? 1U : 0U;
+    const int64_t n = Dimension(weightsShape, nIndex);
+    const int64_t k = Dimension(weightsShape, kIndex);
+    const int64_t outputElements = NumElements(outputInfo.GetShape());
+    const int64_t m = n > 0 ? outputElements / n : 0;
+
+    RecordGemmShape(profile, m, n, k, 1, 1);
+}
+
+void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    if (layer.GetNumInputSlots() < 2 ||
+        layer.GetNumOutputSlots() == 0 ||
+        !layer.GetInputSlot(0).IsTensorInfoSet() ||
+        !layer.GetInputSlot(1).IsTensorInfoSet() ||
+        !layer.GetOutputSlot(0).IsTensorInfoSet())
+    {
+        return;
+    }
+
+    const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo();
+    const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo();
+    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
+    RecordTensorType(profile, lhsInfo);
+    RecordTensorType(profile, rhsInfo);
+    RecordTensorType(profile, outputInfo);
+
+    if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo))
+    {
+        return;
+    }
+
+    const TensorShape& lhsShape = lhsInfo.GetShape();
+    const TensorShape& outputShape = outputInfo.GetShape();
+    const int64_t n = DimensionFromEnd(outputShape, 1);
+    const int64_t m = n > 0 ? NumElements(outputShape) / n : 0;
+    int64_t k = DimensionFromEnd(lhsShape, 1);
+    if (k == n)
+    {
+        k = DimensionFromEnd(lhsShape, 2);
+    }
+
+    RecordGemmShape(profile, m, n, k, 1, 1);
+}
+
+Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16)
+{
+    Sme2ShapeProfile profile;
+    profile.m_HasFp16 = reduceFp32ToFp16;
+
+    for (const Layer* layer : graph)
+    {
+        switch (layer->GetType())
+        {
+            case LayerType::Convolution2d:
+                RecordConvolution2d(profile, *layer);
+                break;
+            case LayerType::FullyConnected:
+                RecordFullyConnected(profile, *layer);
+                break;
+            case LayerType::BatchMatMul:
+                RecordBatchMatMul(profile, *layer);
+                break;
+            default:
+                for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i)
+                {
+                    if (layer->GetInputSlot(i).IsTensorInfoSet())
+                    {
+                        RecordTensorType(profile, layer->GetInputSlot(i).GetTensorInfo());
+                    }
+                }
+                for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i)
+                {
+                    if (layer->GetOutputSlot(i).IsTensorInfoSet())
+                    {
+                        RecordTensorType(profile, layer->GetOutputSlot(i).GetTensorInfo());
+                    }
+                }
+                break;
+        }
+    }
+
+    return profile;
+}
+
+unsigned int CapWorkerCount(unsigned int workers, unsigned int cap)
+{
+    if (workers == 0 || cap == 0 || cap >= workers)
+    {
+        return workers;
+    }
+    return cap;
+}
+
+unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions)
+{
+    unsigned int numberOfThreads = 0;
+    ParseOptions(modelOptions, "CpuAcc", [&](std::string name, const BackendOptions::Var& value)
+    {
+        if (name == "NumberOfThreads")
+        {
+            if (value.IsUnsignedInt())
+            {
+                numberOfThreads = value.AsUnsignedInt();
+            }
+            else if (value.IsInt() && value.AsInt() > 0)
+            {
+                numberOfThreads = static_cast<unsigned int>(value.AsInt());
+            }
+        }
+    });
+    return numberOfThreads;
+}
+
+bool ShouldDisableSme(const Sme2ShapeProfile& profile)
+{
+    if (profile.m_GemmLikeOps == 0)
+    {
+        return false;
+    }
+
+    return (profile.m_HasFp16 || profile.m_HasQuantized) && !profile.m_HasSmallMLargeNProjection;
+}
+
+unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads)
+{
+    if (!profile.m_HasQuantized || ShouldDisableSme(profile))
+    {
+        return requestedThreads;
+    }
+
+    if (profile.m_GemmLikeOps == 0)
+    {
+        return CapWorkerCount(requestedThreads, 1);
+    }
+
+    if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape)
+    {
+        return requestedThreads;
+    }
+
+    if (profile.m_HasPoseShape)
+    {
+        return CapWorkerCount(requestedThreads, 4);
+    }
+
+    return CapWorkerCount(requestedThreads, 1);
+}
+
+void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions)
+{
+    const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16);
+    const bool smeEnabled = !ShouldDisableSme(profile);
+    const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions);
+    const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads);
+
+    modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}}));
+    if (selectedThreads != requestedThreads)
+    {
+        modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}}));
+    }
+}
+
+} // namespace
 
 INetwork::INetwork(NetworkOptions networkOptions) : pNetworkImpl(new NetworkImpl(networkOptions)) {}
 
@@ -2096,6 +2458,9 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph,
         optGraph.InferTensorInfos();
     }
 
+    ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions);
+    optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions;
+
     // Initialize backend settings
     BackendSettings backendSettings(backendPreferences, deviceSpec);
     auto availablePreferredBackends = backendSettings.GetAvailablePreferredBackends();
@@ -2207,7 +2572,7 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph,
     OptimizationResult backendOptimizationResult = ApplyBackendOptimizations(optNetObjPtr->pOptimizedNetworkImpl.get(),
                                                                              backendSettings,
                                                                              backends,
-                                                                             options.GetModelOptions(),
+                                                                             optimizedOptions,
                                                                              messages);
     if (backendOptimizationResult.m_Error)
     {
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index 9cd9d18d47..feb19e79e7 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017-2025 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -134,11 +134,7 @@ IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport() const
 
 IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport(const ModelOptions& modelOptions) const
 {
-    static ILayerSupportSharedPtr layerSupport
-        {
-            new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions))
-        };
-    return layerSupport;
+    return ILayerSupportSharedPtr{new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions))};
 }
 
 OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp
index 270592e94d..acc2eecc07 100644
--- a/src/backends/neon/NeonBackendModelContext.cpp
+++ b/src/backends/neon/NeonBackendModelContext.cpp
@@ -1,10 +1,12 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "NeonBackendModelContext.hpp"
 
+#include <arm_compute/core/CPP/CPPTypes.h>
+
 namespace
 {
 
@@ -32,7 +34,7 @@ namespace armnn
 {
 
 NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOptions)
-    : m_IsFastMathEnabled(false), m_NumberOfThreads(0)
+    : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSmeEnabled(true)
 {
    if (!modelOptions.empty())
    {
@@ -40,14 +42,20 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption
        {
            if (name == "FastMathEnabled")
            {
-               m_IsFastMathEnabled |= ParseBool(value, false);
+               m_IsFastMathEnabled = ParseBool(value, m_IsFastMathEnabled);
            }
            if (name == "NumberOfThreads")
            {
-               m_NumberOfThreads |= ParseUnsignedInt(value, 0);
+               m_NumberOfThreads = ParseUnsignedInt(value, m_NumberOfThreads);
+           }
+           if (name == "SmeEnabled")
+           {
+               m_IsSmeEnabled = ParseBool(value, m_IsSmeEnabled);
            }
        });
    }
+
+   arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled);
 }
 
 bool NeonBackendModelContext::IsFastMathEnabled() const
@@ -60,4 +68,9 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const
     return m_NumberOfThreads;
 }
 
-} // namespace armnn
\ No newline at end of file
+bool NeonBackendModelContext::IsSmeEnabled() const
+{
+    return m_IsSmeEnabled;
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp
index e736efc1d8..7ca9d0cf0d 100644
--- a/src/backends/neon/NeonBackendModelContext.hpp
+++ b/src/backends/neon/NeonBackendModelContext.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -16,6 +16,8 @@ namespace armnn
 ///    results with reduced or different precision. The fast_math flag will not have any effect on int8 performance.
 ///  - "NumberOfThreads"\n
 ///    Specify the number of threads used by the CpuAcc backend.
+///  - "SmeEnabled"\n
+///    Specify whether SME/SME2 implementations may be selected by the CpuAcc backend.
 class NeonBackendModelContext : public IBackendModelContext
 {
 public:
@@ -25,9 +27,12 @@ class NeonBackendModelContext : public IBackendModelContext
 
     unsigned int GetNumberOfThreads() const;
 
+    bool IsSmeEnabled() const;
+
 private:
     bool m_IsFastMathEnabled;
     unsigned int m_NumberOfThreads;
+    bool m_IsSmeEnabled;
 };
 
-} // namespace armnn
\ No newline at end of file
+} // namespace armnn
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index f75f84d2d8..2126bf954e 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017-2024 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -20,6 +20,8 @@
 #include <backendsCommon/MemImportWorkload.hpp>
 #include <armnn/backends/TensorHandle.hpp>
 
+#include <arm_compute/core/CPP/CPPTypes.h>
+
 #include <neon/workloads/NeonWorkloadUtils.hpp>
 #include <neon/workloads/NeonWorkloads.hpp>
 
@@ -63,6 +65,8 @@ void NeonWorkloadFactory::SetNumberOfThreads()
         auto modelOptions = dynamic_cast<NeonBackendModelContext*>(m_ModelContextPtr.get());
         auto numberOfThreads = modelOptions->GetNumberOfThreads();
 
+        arm_compute::CPUInfo::get().set_sme_allowed(modelOptions->IsSmeEnabled());
+
         if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS)
         {
             arm_compute::Scheduler::get().set_num_threads(numberOfThreads);

From 3515aac50f7d00f19f49ffc5fc5bb5a4278dd410 Mon Sep 17 00:00:00 2001
From: Damien Dooley <damien.dooley@arm.com>
Date: Mon, 8 Jun 2026 14:53:12 +0100
Subject: [PATCH 2/4] Updated heuristic for better coverage of FP16 and FP32
 use cases

---
 delegate/cmake/Modules/FindTfLite.cmake       |  7 +-
 include/armnnUtils/QuantizeHelper.hpp         |  2 +-
 src/armnn/Network.cpp                         | 85 +++++++++++++++++--
 src/armnnUtils/Half.hpp                       | 24 ++----
 src/backends/neon/NeonBackendModelContext.cpp | 12 ++-
 src/backends/neon/NeonBackendModelContext.hpp |  5 ++
 src/backends/neon/NeonWorkloadFactory.cpp     |  1 +
 7 files changed, 109 insertions(+), 27 deletions(-)

diff --git a/delegate/cmake/Modules/FindTfLite.cmake b/delegate/cmake/Modules/FindTfLite.cmake
index bdb4df7ce0..d5c0e53606 100644
--- a/delegate/cmake/Modules/FindTfLite.cmake
+++ b/delegate/cmake/Modules/FindTfLite.cmake
@@ -150,7 +150,7 @@ if (TfLite_LIB MATCHES .a$)
                                       TfLite_ruy_prepare_packed_matrices_LIB TfLite_ruy_system_aligned_alloc_LIB TfLite_ruy_threadpool_LIB
                                       TfLite_ruy_trmul_LIB TfLite_ruy_tune_LIB TfLite_ruy_wait_LIB TfLite_ruy_profiler_LIB TfLite_cpuinfo_LIB
                                       TfLite_abseil_synchronization_LIB TfLite_abseil_graphCycle_internal_LIB TfLite_abseil_raw_logging_internal_LIB
-                                      TfLite_abseil_kernel_timeout_LIB TfLite_abseil_internal_strings_LIB)
+                                      TfLite_abseil_internal_strings_LIB)
     # Set external variables for usage in CMakeLists.txt
     if (TFLITE_FOUND)
         # WARNING! The order of these libraries is critical. Moving them
@@ -166,7 +166,10 @@ if (TfLite_LIB MATCHES .a$)
                                      ${TfLite_ruy_prepare_packed_matrices_LIB} ${TfLite_ruy_system_aligned_alloc_LIB}
                                      ${TfLite_ruy_tune_LIB} ${TfLite_ruy_wait_LIB} ${TfLite_ruy_profiler_LIB}
                                      ${TfLite_cpuinfo_LIB} ${TfLite_abseil_synchronization_LIB} ${TfLite_abseil_graphCycle_internal_LIB}
-                                     ${TfLite_abseil_raw_logging_internal_LIB} ${TfLite_abseil_kernel_timeout_LIB} ${TfLite_abseil_internal_strings_LIB})
+                                     ${TfLite_abseil_raw_logging_internal_LIB} ${TfLite_abseil_internal_strings_LIB})
+        if(TfLite_abseil_kernel_timeout_LIB)
+            list(APPEND TfLite_LIB ${TfLite_abseil_kernel_timeout_LIB})
+        endif()
     endif ()
 elseif (TfLite_LIB MATCHES .so$)
     message("-- Dynamic tensorflow lite library found, using for ArmNN build")
diff --git a/include/armnnUtils/QuantizeHelper.hpp b/include/armnnUtils/QuantizeHelper.hpp
index 231b8411cb..5868d13fc8 100644
--- a/include/armnnUtils/QuantizeHelper.hpp
+++ b/include/armnnUtils/QuantizeHelper.hpp
@@ -96,7 +96,7 @@ float SelectiveDequantize(T value, float scale, int32_t offset)
 template<typename ItType>
 struct IsFloatingPointIterator
 {
-    static constexpr bool value=std::is_floating_point<typename std::iterator_traits<ItType>::value_type>::value;
+    static constexpr bool value=armnn::IsFloatingPoint<typename std::iterator_traits<ItType>::value_type>::value;
 };
 
 template <typename T, typename FloatIt,
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 48276cd9e8..abd9ab3154 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -50,6 +50,10 @@ namespace
 struct Sme2ShapeProfile
 {
     unsigned int m_GemmLikeOps = 0;
+    unsigned int m_DepthwiseConvolution2dOps = 0;
+    unsigned int m_SmallDenseProjectionOps = 0;
+    int64_t m_GemmMacs = 0;
+    int64_t m_NonPointwiseGemmMacs = 0;
     bool m_HasFp16 = false;
     bool m_HasQuantized = false;
     bool m_HasSegmentationShape = false;
@@ -127,7 +131,8 @@ void RecordGemmShape(Sme2ShapeProfile& profile,
                      int64_t n,
                      int64_t k,
                      int64_t kernelH,
-                     int64_t kernelW)
+                     int64_t kernelW,
+                     bool isDenseProjection)
 {
     if (m <= 0 || n <= 0 || k <= 0)
     {
@@ -137,6 +142,16 @@ void RecordGemmShape(Sme2ShapeProfile& profile,
     ++profile.m_GemmLikeOps;
 
     const bool is1x1 = kernelH == 1 && kernelW == 1;
+    const int64_t macs = m * n * k;
+    profile.m_GemmMacs += macs;
+    if (!is1x1)
+    {
+        profile.m_NonPointwiseGemmMacs += macs;
+    }
+    if (isDenseProjection && is1x1 && m <= 256 && n <= 1024 && k <= 1024)
+    {
+        ++profile.m_SmallDenseProjectionOps;
+    }
 
     if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900)))
     {
@@ -204,7 +219,7 @@ void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
     const int64_t outputElements = NumElements(outputShape);
     const int64_t m = n > 0 ? outputElements / n : 0;
 
-    RecordGemmShape(profile, m, n, k, kernelH, kernelW);
+    RecordGemmShape(profile, m, n, k, kernelH, kernelW, false);
 }
 
 void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer)
@@ -245,7 +260,7 @@ void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer)
     const int64_t outputElements = NumElements(outputInfo.GetShape());
     const int64_t m = n > 0 ? outputElements / n : 0;
 
-    RecordGemmShape(profile, m, n, k, 1, 1);
+    RecordGemmShape(profile, m, n, k, 1, 1, true);
 }
 
 void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer)
@@ -281,7 +296,27 @@ void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer)
         k = DimensionFromEnd(lhsShape, 2);
     }
 
-    RecordGemmShape(profile, m, n, k, 1, 1);
+    RecordGemmShape(profile, m, n, k, 1, 1, true);
+}
+
+void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    ++profile.m_DepthwiseConvolution2dOps;
+
+    for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i)
+    {
+        if (layer.GetInputSlot(i).IsTensorInfoSet())
+        {
+            RecordTensorType(profile, layer.GetInputSlot(i).GetTensorInfo());
+        }
+    }
+    for (unsigned int i = 0; i < layer.GetNumOutputSlots(); ++i)
+    {
+        if (layer.GetOutputSlot(i).IsTensorInfoSet())
+        {
+            RecordTensorType(profile, layer.GetOutputSlot(i).GetTensorInfo());
+        }
+    }
 }
 
 Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16)
@@ -302,6 +337,9 @@ Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16
             case LayerType::BatchMatMul:
                 RecordBatchMatMul(profile, *layer);
                 break;
+            case LayerType::DepthwiseConvolution2d:
+                RecordDepthwiseConvolution2d(profile, *layer);
+                break;
             default:
                 for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i)
                 {
@@ -353,6 +391,30 @@ unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions)
     return numberOfThreads;
 }
 
+bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile)
+{
+    const bool isFloatOnly = !profile.m_HasFp16 && !profile.m_HasQuantized;
+    if (!isFloatOnly)
+    {
+        return false;
+    }
+
+    const bool hasHeavySpatialConvolution =
+        profile.m_GemmMacs > 0 &&
+        profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs &&
+        !profile.m_HasSegmentationShape;
+
+    const bool hasSmallDenseGraph =
+        profile.m_DepthwiseConvolution2dOps == 0 &&
+        profile.m_SmallDenseProjectionOps >= 4 &&
+        !profile.m_HasSmallMLargeNProjection;
+
+    return profile.m_HasPoseShape ||
+           profile.m_HasStyleTransferShape ||
+           hasHeavySpatialConvolution ||
+           hasSmallDenseGraph;
+}
+
 bool ShouldDisableSme(const Sme2ShapeProfile& profile)
 {
     if (profile.m_GemmLikeOps == 0)
@@ -360,7 +422,17 @@ bool ShouldDisableSme(const Sme2ShapeProfile& profile)
         return false;
     }
 
-    return (profile.m_HasFp16 || profile.m_HasQuantized) && !profile.m_HasSmallMLargeNProjection;
+    if (profile.m_HasFp16)
+    {
+        return true;
+    }
+
+    if (profile.m_HasQuantized)
+    {
+        return !profile.m_HasSmallMLargeNProjection;
+    }
+
+    return HasFloatSmeRegressionRisk(profile);
 }
 
 unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads)
@@ -392,10 +464,11 @@ void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOption
 {
     const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16);
     const bool smeEnabled = !ShouldDisableSme(profile);
+    const bool sveEnabled = smeEnabled || profile.m_HasQuantized;
     const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions);
     const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads);
 
-    modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}}));
+    modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}, {"SveEnabled", sveEnabled}}));
     if (selectedThreads != requestedThreads)
     {
         modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}}));
diff --git a/src/armnnUtils/Half.hpp b/src/armnnUtils/Half.hpp
index e39968f54d..ce867725dd 100644
--- a/src/armnnUtils/Half.hpp
+++ b/src/armnnUtils/Half.hpp
@@ -20,25 +20,15 @@
 namespace armnn
 {
     using Half = half_float::half; //import half float implementation
-} //namespace armnn
-
-
-namespace std
-{
 
-template<>
-struct is_floating_point<armnn::Half>
-    : integral_constant< bool, true >
+template<typename T>
+struct IsArmnnHalf
+    : std::is_same<typename std::remove_cv<T>::type, Half>
 {};
 
-template<>
-struct is_floating_point<const armnn::Half>
-    : integral_constant< bool, true >
+template<typename T>
+struct IsFloatingPoint
+    : std::integral_constant<bool, std::is_floating_point<T>::value || IsArmnnHalf<T>::value>
 {};
 
-template<>
-struct is_floating_point<volatile armnn::Half>
-    : integral_constant< bool, true >
-{};
-
-} //namespace std
+} //namespace armnn
diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp
index acc2eecc07..20b44795dd 100644
--- a/src/backends/neon/NeonBackendModelContext.cpp
+++ b/src/backends/neon/NeonBackendModelContext.cpp
@@ -34,7 +34,7 @@ namespace armnn
 {
 
 NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOptions)
-    : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSmeEnabled(true)
+    : m_IsFastMathEnabled(false), m_NumberOfThreads(0), m_IsSveEnabled(true), m_IsSmeEnabled(true)
 {
    if (!modelOptions.empty())
    {
@@ -52,9 +52,14 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption
            {
                m_IsSmeEnabled = ParseBool(value, m_IsSmeEnabled);
            }
+           if (name == "SveEnabled")
+           {
+               m_IsSveEnabled = ParseBool(value, m_IsSveEnabled);
+           }
        });
    }
 
+   arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled);
    arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled);
 }
 
@@ -68,6 +73,11 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const
     return m_NumberOfThreads;
 }
 
+bool NeonBackendModelContext::IsSveEnabled() const
+{
+    return m_IsSveEnabled;
+}
+
 bool NeonBackendModelContext::IsSmeEnabled() const
 {
     return m_IsSmeEnabled;
diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp
index 7ca9d0cf0d..60d3157471 100644
--- a/src/backends/neon/NeonBackendModelContext.hpp
+++ b/src/backends/neon/NeonBackendModelContext.hpp
@@ -16,6 +16,8 @@ namespace armnn
 ///    results with reduced or different precision. The fast_math flag will not have any effect on int8 performance.
 ///  - "NumberOfThreads"\n
 ///    Specify the number of threads used by the CpuAcc backend.
+///  - "SveEnabled"\n
+///    Specify whether SVE/SVE2 implementations may be selected by the CpuAcc backend.
 ///  - "SmeEnabled"\n
 ///    Specify whether SME/SME2 implementations may be selected by the CpuAcc backend.
 class NeonBackendModelContext : public IBackendModelContext
@@ -27,11 +29,14 @@ class NeonBackendModelContext : public IBackendModelContext
 
     unsigned int GetNumberOfThreads() const;
 
+    bool IsSveEnabled() const;
+
     bool IsSmeEnabled() const;
 
 private:
     bool m_IsFastMathEnabled;
     unsigned int m_NumberOfThreads;
+    bool m_IsSveEnabled;
     bool m_IsSmeEnabled;
 };
 
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index 2126bf954e..e90fe60488 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -65,6 +65,7 @@ void NeonWorkloadFactory::SetNumberOfThreads()
         auto modelOptions = dynamic_cast<NeonBackendModelContext*>(m_ModelContextPtr.get());
         auto numberOfThreads = modelOptions->GetNumberOfThreads();
 
+        arm_compute::CPUInfo::get().set_sve_allowed(modelOptions->IsSveEnabled());
         arm_compute::CPUInfo::get().set_sme_allowed(modelOptions->IsSmeEnabled());
 
         if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS)

From 00fca53fb3defff227a45dea0770c8eec59f91d9 Mon Sep 17 00:00:00 2001
From: Damien Dooley <damien.dooley@arm.com>
Date: Fri, 12 Jun 2026 11:49:04 +0100
Subject: [PATCH 3/4] Separated heuristic definition and tightened other logic

---
 CMakeLists.txt                                |   2 +
 src/armnn/Network.cpp                         | 437 +----------------
 src/armnn/Sme2ShapePolicy.cpp                 | 453 ++++++++++++++++++
 src/armnn/Sme2ShapePolicy.hpp                 |  17 +
 src/backends/neon/NeonBackendModelContext.cpp |  13 +-
 src/backends/neon/NeonBackendModelContext.hpp |   4 +-
 src/backends/neon/NeonWorkloadFactory.cpp     |   5 +-
 7 files changed, 479 insertions(+), 452 deletions(-)
 create mode 100644 src/armnn/Sme2ShapePolicy.cpp
 create mode 100644 src/armnn/Sme2ShapePolicy.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ca3bb69d5..08ed3ea135 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -385,6 +385,8 @@ list(APPEND armnn_sources
     src/armnn/Logging.cpp
     src/armnn/Network.cpp
     src/armnn/Network.hpp
+    src/armnn/Sme2ShapePolicy.cpp
+    src/armnn/Sme2ShapePolicy.hpp
     src/armnn/NetworkUtils.cpp
     src/armnn/NetworkUtils.hpp
     src/armnn/Observable.cpp
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index abd9ab3154..f83be3b864 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -4,6 +4,7 @@
 //
 
 #include "Network.hpp"
+#include "Sme2ShapePolicy.hpp"
 #include "Graph.hpp"
 #include "Layer.hpp"
 #include "DeviceSpec.hpp"
@@ -14,8 +15,6 @@
 #include "armnnUtils/Filesystem.hpp"
 #include "armnn/utility/Timer.hpp"
 
-#include <armnnUtils/DataLayoutIndexed.hpp>
-
 #include <armnn/backends/TensorHandle.hpp>
 #include <armnn/backends/WorkloadFactory.hpp>
 #include <armnn/backends/IBackendInternal.hpp>
@@ -36,446 +35,12 @@
 #include <fmt/format.h>
 
 #include <fcntl.h>
-#include <algorithm>
-#include <cstdint>
 #include <memory>
 #include <vector>
 #include <armnn/ArmNN.hpp>
 
 namespace armnn
 {
-namespace
-{
-
-struct Sme2ShapeProfile
-{
-    unsigned int m_GemmLikeOps = 0;
-    unsigned int m_DepthwiseConvolution2dOps = 0;
-    unsigned int m_SmallDenseProjectionOps = 0;
-    int64_t m_GemmMacs = 0;
-    int64_t m_NonPointwiseGemmMacs = 0;
-    bool m_HasFp16 = false;
-    bool m_HasQuantized = false;
-    bool m_HasSegmentationShape = false;
-    bool m_HasStyleTransferShape = false;
-    bool m_HasPoseShape = false;
-    bool m_HasSmallMLargeNProjection = false;
-};
-
-bool IsQuantizedDataType(DataType dataType)
-{
-    switch (dataType)
-    {
-        case DataType::QAsymmU8:
-        case DataType::QAsymmS8:
-        case DataType::QSymmS8:
-        case DataType::QSymmS16:
-            return true;
-        default:
-            return false;
-    }
-}
-
-void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo)
-{
-    const DataType dataType = tensorInfo.GetDataType();
-    profile.m_HasFp16 |= dataType == DataType::Float16;
-    profile.m_HasQuantized |= IsQuantizedDataType(dataType);
-}
-
-bool HasSpecifiedShape(const TensorInfo& tensorInfo)
-{
-    const TensorShape& shape = tensorInfo.GetShape();
-    return shape.GetDimensionality() == Dimensionality::Specified &&
-           shape.AreAllDimensionsSpecified();
-}
-
-int64_t NumElements(const TensorShape& shape)
-{
-    if (shape.GetDimensionality() != Dimensionality::Specified ||
-        !shape.AreAllDimensionsSpecified())
-    {
-        return 0;
-    }
-
-    int64_t elements = 1;
-    for (unsigned int i = 0; i < shape.GetNumDimensions(); ++i)
-    {
-        elements *= static_cast<int64_t>(std::max(shape[i], 1U));
-    }
-    return elements;
-}
-
-int64_t Dimension(const TensorShape& shape, unsigned int index)
-{
-    if (shape.GetDimensionality() != Dimensionality::Specified ||
-        !shape.AreAllDimensionsSpecified() ||
-        index >= shape.GetNumDimensions())
-    {
-        return 0;
-    }
-    return static_cast<int64_t>(shape[index]);
-}
-
-int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset)
-{
-    if (offset == 0 || shape.GetNumDimensions() < offset)
-    {
-        return 0;
-    }
-    return Dimension(shape, shape.GetNumDimensions() - offset);
-}
-
-void RecordGemmShape(Sme2ShapeProfile& profile,
-                     int64_t m,
-                     int64_t n,
-                     int64_t k,
-                     int64_t kernelH,
-                     int64_t kernelW,
-                     bool isDenseProjection)
-{
-    if (m <= 0 || n <= 0 || k <= 0)
-    {
-        return;
-    }
-
-    ++profile.m_GemmLikeOps;
-
-    const bool is1x1 = kernelH == 1 && kernelW == 1;
-    const int64_t macs = m * n * k;
-    profile.m_GemmMacs += macs;
-    if (!is1x1)
-    {
-        profile.m_NonPointwiseGemmMacs += macs;
-    }
-    if (isDenseProjection && is1x1 && m <= 256 && n <= 1024 && k <= 1024)
-    {
-        ++profile.m_SmallDenseProjectionOps;
-    }
-
-    if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900)))
-    {
-        profile.m_HasSegmentationShape = true;
-    }
-
-    if (!is1x1 && ((m >= 25000 && m <= 30000 && n == 64 && k >= 2000) ||
-                   (m >= 60000 && m <= 75000 && n == 32 && k >= 500)))
-    {
-        profile.m_HasStyleTransferShape = true;
-    }
-
-    if (!is1x1 && m >= 100000 && n >= 64 && k >= 500)
-    {
-        profile.m_HasPoseShape = true;
-    }
-
-    if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024)
-    {
-        profile.m_HasSmallMLargeNProjection = true;
-    }
-}
-
-void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
-{
-    if (layer.GetNumInputSlots() < 2 ||
-        layer.GetNumOutputSlots() == 0 ||
-        !layer.GetInputSlot(0).IsTensorInfoSet() ||
-        !layer.GetInputSlot(1).IsTensorInfoSet() ||
-        !layer.GetOutputSlot(0).IsTensorInfoSet())
-    {
-        return;
-    }
-
-    const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo();
-    const TensorInfo& filterInfo = layer.GetInputSlot(1).GetTensorInfo();
-    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
-    RecordTensorType(profile, inputInfo);
-    RecordTensorType(profile, filterInfo);
-    RecordTensorType(profile, outputInfo);
-
-    if (!HasSpecifiedShape(filterInfo) || !HasSpecifiedShape(outputInfo))
-    {
-        return;
-    }
-
-    const Convolution2dDescriptor& descriptor =
-        static_cast<const Convolution2dDescriptor&>(layer.GetParameters());
-    const TensorShape& filterShape = filterInfo.GetShape();
-    const TensorShape& outputShape = outputInfo.GetShape();
-    const armnnUtils::DataLayoutIndexed dataLayoutIndex(descriptor.m_DataLayout);
-
-    if (filterShape.GetNumDimensions() != 4 || outputShape.GetNumDimensions() != 4)
-    {
-        return;
-    }
-
-    const int64_t n = Dimension(filterShape, 0);
-    const int64_t kernelH = Dimension(filterShape, dataLayoutIndex.GetHeightIndex());
-    const int64_t kernelW = Dimension(filterShape, dataLayoutIndex.GetWidthIndex());
-    const int64_t filterElements = NumElements(filterShape);
-    const int64_t inputChannels = n > 0 && kernelH > 0 && kernelW > 0 ?
-        filterElements / (n * kernelH * kernelW) : 0;
-    const int64_t k = kernelH * kernelW * inputChannels;
-    const int64_t outputElements = NumElements(outputShape);
-    const int64_t m = n > 0 ? outputElements / n : 0;
-
-    RecordGemmShape(profile, m, n, k, kernelH, kernelW, false);
-}
-
-void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer)
-{
-    if (layer.GetNumInputSlots() < 2 ||
-        layer.GetNumOutputSlots() == 0 ||
-        !layer.GetInputSlot(0).IsTensorInfoSet() ||
-        !layer.GetInputSlot(1).IsTensorInfoSet() ||
-        !layer.GetOutputSlot(0).IsTensorInfoSet())
-    {
-        return;
-    }
-
-    const TensorInfo& inputInfo = layer.GetInputSlot(0).GetTensorInfo();
-    const TensorInfo& weightsInfo = layer.GetInputSlot(1).GetTensorInfo();
-    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
-    RecordTensorType(profile, inputInfo);
-    RecordTensorType(profile, weightsInfo);
-    RecordTensorType(profile, outputInfo);
-
-    if (!HasSpecifiedShape(inputInfo) || !HasSpecifiedShape(weightsInfo) || !HasSpecifiedShape(outputInfo))
-    {
-        return;
-    }
-
-    const TensorShape& weightsShape = weightsInfo.GetShape();
-    if (weightsShape.GetNumDimensions() < 2)
-    {
-        return;
-    }
-
-    const FullyConnectedDescriptor& descriptor =
-        static_cast<const FullyConnectedDescriptor&>(layer.GetParameters());
-    const unsigned int nIndex = descriptor.m_TransposeWeightMatrix ? 0U : 1U;
-    const unsigned int kIndex = descriptor.m_TransposeWeightMatrix ? 1U : 0U;
-    const int64_t n = Dimension(weightsShape, nIndex);
-    const int64_t k = Dimension(weightsShape, kIndex);
-    const int64_t outputElements = NumElements(outputInfo.GetShape());
-    const int64_t m = n > 0 ? outputElements / n : 0;
-
-    RecordGemmShape(profile, m, n, k, 1, 1, true);
-}
-
-void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer)
-{
-    if (layer.GetNumInputSlots() < 2 ||
-        layer.GetNumOutputSlots() == 0 ||
-        !layer.GetInputSlot(0).IsTensorInfoSet() ||
-        !layer.GetInputSlot(1).IsTensorInfoSet() ||
-        !layer.GetOutputSlot(0).IsTensorInfoSet())
-    {
-        return;
-    }
-
-    const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo();
-    const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo();
-    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
-    RecordTensorType(profile, lhsInfo);
-    RecordTensorType(profile, rhsInfo);
-    RecordTensorType(profile, outputInfo);
-
-    if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo))
-    {
-        return;
-    }
-
-    const TensorShape& lhsShape = lhsInfo.GetShape();
-    const TensorShape& outputShape = outputInfo.GetShape();
-    const int64_t n = DimensionFromEnd(outputShape, 1);
-    const int64_t m = n > 0 ? NumElements(outputShape) / n : 0;
-    int64_t k = DimensionFromEnd(lhsShape, 1);
-    if (k == n)
-    {
-        k = DimensionFromEnd(lhsShape, 2);
-    }
-
-    RecordGemmShape(profile, m, n, k, 1, 1, true);
-}
-
-void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
-{
-    ++profile.m_DepthwiseConvolution2dOps;
-
-    for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i)
-    {
-        if (layer.GetInputSlot(i).IsTensorInfoSet())
-        {
-            RecordTensorType(profile, layer.GetInputSlot(i).GetTensorInfo());
-        }
-    }
-    for (unsigned int i = 0; i < layer.GetNumOutputSlots(); ++i)
-    {
-        if (layer.GetOutputSlot(i).IsTensorInfoSet())
-        {
-            RecordTensorType(profile, layer.GetOutputSlot(i).GetTensorInfo());
-        }
-    }
-}
-
-Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16)
-{
-    Sme2ShapeProfile profile;
-    profile.m_HasFp16 = reduceFp32ToFp16;
-
-    for (const Layer* layer : graph)
-    {
-        switch (layer->GetType())
-        {
-            case LayerType::Convolution2d:
-                RecordConvolution2d(profile, *layer);
-                break;
-            case LayerType::FullyConnected:
-                RecordFullyConnected(profile, *layer);
-                break;
-            case LayerType::BatchMatMul:
-                RecordBatchMatMul(profile, *layer);
-                break;
-            case LayerType::DepthwiseConvolution2d:
-                RecordDepthwiseConvolution2d(profile, *layer);
-                break;
-            default:
-                for (unsigned int i = 0; i < layer->GetNumInputSlots(); ++i)
-                {
-                    if (layer->GetInputSlot(i).IsTensorInfoSet())
-                    {
-                        RecordTensorType(profile, layer->GetInputSlot(i).GetTensorInfo());
-                    }
-                }
-                for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i)
-                {
-                    if (layer->GetOutputSlot(i).IsTensorInfoSet())
-                    {
-                        RecordTensorType(profile, layer->GetOutputSlot(i).GetTensorInfo());
-                    }
-                }
-                break;
-        }
-    }
-
-    return profile;
-}
-
-unsigned int CapWorkerCount(unsigned int workers, unsigned int cap)
-{
-    if (workers == 0 || cap == 0 || cap >= workers)
-    {
-        return workers;
-    }
-    return cap;
-}
-
-unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions)
-{
-    unsigned int numberOfThreads = 0;
-    ParseOptions(modelOptions, "CpuAcc", [&](std::string name, const BackendOptions::Var& value)
-    {
-        if (name == "NumberOfThreads")
-        {
-            if (value.IsUnsignedInt())
-            {
-                numberOfThreads = value.AsUnsignedInt();
-            }
-            else if (value.IsInt() && value.AsInt() > 0)
-            {
-                numberOfThreads = static_cast<unsigned int>(value.AsInt());
-            }
-        }
-    });
-    return numberOfThreads;
-}
-
-bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile)
-{
-    const bool isFloatOnly = !profile.m_HasFp16 && !profile.m_HasQuantized;
-    if (!isFloatOnly)
-    {
-        return false;
-    }
-
-    const bool hasHeavySpatialConvolution =
-        profile.m_GemmMacs > 0 &&
-        profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs &&
-        !profile.m_HasSegmentationShape;
-
-    const bool hasSmallDenseGraph =
-        profile.m_DepthwiseConvolution2dOps == 0 &&
-        profile.m_SmallDenseProjectionOps >= 4 &&
-        !profile.m_HasSmallMLargeNProjection;
-
-    return profile.m_HasPoseShape ||
-           profile.m_HasStyleTransferShape ||
-           hasHeavySpatialConvolution ||
-           hasSmallDenseGraph;
-}
-
-bool ShouldDisableSme(const Sme2ShapeProfile& profile)
-{
-    if (profile.m_GemmLikeOps == 0)
-    {
-        return false;
-    }
-
-    if (profile.m_HasFp16)
-    {
-        return true;
-    }
-
-    if (profile.m_HasQuantized)
-    {
-        return !profile.m_HasSmallMLargeNProjection;
-    }
-
-    return HasFloatSmeRegressionRisk(profile);
-}
-
-unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads)
-{
-    if (!profile.m_HasQuantized || ShouldDisableSme(profile))
-    {
-        return requestedThreads;
-    }
-
-    if (profile.m_GemmLikeOps == 0)
-    {
-        return CapWorkerCount(requestedThreads, 1);
-    }
-
-    if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape)
-    {
-        return requestedThreads;
-    }
-
-    if (profile.m_HasPoseShape)
-    {
-        return CapWorkerCount(requestedThreads, 4);
-    }
-
-    return CapWorkerCount(requestedThreads, 1);
-}
-
-void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions)
-{
-    const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16);
-    const bool smeEnabled = !ShouldDisableSme(profile);
-    const bool sveEnabled = smeEnabled || profile.m_HasQuantized;
-    const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions);
-    const unsigned int selectedThreads = SelectNumberOfThreads(profile, requestedThreads);
-
-    modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", smeEnabled}, {"SveEnabled", sveEnabled}}));
-    if (selectedThreads != requestedThreads)
-    {
-        modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", selectedThreads}}));
-    }
-}
-
-} // namespace
 
 INetwork::INetwork(NetworkOptions networkOptions) : pNetworkImpl(new NetworkImpl(networkOptions)) {}
 
diff --git a/src/armnn/Sme2ShapePolicy.cpp b/src/armnn/Sme2ShapePolicy.cpp
new file mode 100644
index 0000000000..a94ac733e2
--- /dev/null
+++ b/src/armnn/Sme2ShapePolicy.cpp
@@ -0,0 +1,453 @@
+//
+// Copyright © 2026 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Sme2ShapePolicy.hpp"
+
+#include "Graph.hpp"
+#include "Layer.hpp"
+#include "armnnUtils/DataLayoutIndexed.hpp"
+
+#include <armnn/Descriptors.hpp>
+#include <armnn/Tensor.hpp>
+
+#include <algorithm>
+#include <cstdint>
+
+namespace armnn
+{
+namespace
+{
+
+struct Sme2ShapeProfile // Graph-wide summary filled in by the Record* helpers and read by the policy functions.
+{
+    unsigned int m_GemmLikeOps = 0;                // shapes accepted by RecordGemmShape (m, n and k all > 0)
+    unsigned int m_DepthwiseConvolution2dOps = 0;  // DepthwiseConvolution2d layers seen
+    unsigned int m_SmallDenseProjectionOps = 0;    // dense 1x1 shapes with m <= 256, n <= 1024, k <= 1024
+    int64_t m_GemmMacs = 0;                        // sum of m * n * k over all recorded shapes
+    int64_t m_NonPointwiseGemmMacs = 0;            // part of m_GemmMacs coming from kernels other than 1x1
+    bool m_HasFp16 = false;                        // seeded from reduceFp32ToFp16, or a Float16 tensor was seen
+    bool m_HasQuantized = false;                   // a QAsymmU8/QAsymmS8/QSymmS8/QSymmS16 tensor was seen
+    bool m_HasSegmentationShape = false;           // set by the 1x1, m == 2304 rule in RecordGemmShape
+    bool m_HasStyleTransferShape = false;          // set by the non-1x1, n == 64 / n == 32 rule
+    bool m_HasPoseShape = false;                   // set by the non-1x1, m >= 100000 rule
+    bool m_HasSmallMLargeNProjection = false;      // set by the m <= 64, n >= 4096 rule
+};
+
+/// Reports whether @p dataType is one of the quantized tensor element types.
+bool IsQuantizedDataType(DataType dataType)
+{
+    // Exactly four enumerators are treated as quantized; every other
+    // value of DataType falls through to false.
+    if (dataType == DataType::QAsymmU8 || dataType == DataType::QAsymmS8)
+    {
+        return true;
+    }
+    const bool isSymmetric = dataType == DataType::QSymmS8 ||
+                             dataType == DataType::QSymmS16;
+    return isSymmetric;
+}
+
+void RecordTensorType(Sme2ShapeProfile& profile, const TensorInfo& tensorInfo)
+{
+    // Sticky flags: once a Float16 or quantized tensor is seen the flag is never cleared here.
+    profile.m_HasFp16 = profile.m_HasFp16 || tensorInfo.GetDataType() == DataType::Float16;
+    profile.m_HasQuantized = profile.m_HasQuantized || IsQuantizedDataType(tensorInfo.GetDataType());
+}
+
+bool HasSpecifiedShape(const TensorInfo& tensorInfo)
+{
+    // True only for a Specified dimensionality in which every dimension is specified.
+    const TensorShape& dims = tensorInfo.GetShape();
+    return !(dims.GetDimensionality() != Dimensionality::Specified || !dims.AreAllDimensionsSpecified());
+}
+
+/// Product of every extent of @p shape, with a zero extent counted as 1.
+/// Evaluates to 0 when the shape is not fully specified.
+int64_t NumElements(const TensorShape& shape)
+{
+    const bool fullySpecified = shape.GetDimensionality() == Dimensionality::Specified &&
+                                shape.AreAllDimensionsSpecified();
+    int64_t product = fullySpecified ? 1 : 0;
+    const unsigned int rank = fullySpecified ? shape.GetNumDimensions() : 0U;
+    for (unsigned int axis = 0; axis < rank; ++axis)
+    {
+        const unsigned int extent = shape[axis];
+        product *= static_cast<int64_t>(extent == 0U ? 1U : extent);
+    }
+    return product;
+}
+
+/// Extent of @p shape at position @p index.
+/// Evaluates to 0 when the shape is not fully specified or @p index is out of range.
+int64_t Dimension(const TensorShape& shape, unsigned int index)
+{
+    const bool readable = shape.GetDimensionality() == Dimensionality::Specified &&
+                          shape.AreAllDimensionsSpecified() &&
+                          index < shape.GetNumDimensions();
+    // The bounds test comes last so GetNumDimensions() is only reached for specified shapes.
+    return readable ? static_cast<int64_t>(shape[index]) : 0;
+}
+
+/// Extent of @p shape counted from the back; @p offset 1 selects the last dimension.
+/// Evaluates to 0 when @p offset is 0 or exceeds the number of dimensions.
+int64_t DimensionFromEnd(const TensorShape& shape, unsigned int offset)
+{
+    const bool reachable = offset != 0 && offset <= shape.GetNumDimensions();
+    // The subtraction cannot wrap: reachable guarantees offset <= number of dimensions.
+    return reachable ? Dimension(shape, shape.GetNumDimensions() - offset) : 0;
+}
+
+/// Folds one GEMM-like operation, described by @p m, @p n and @p k, into @p profile.
+/// Operations with a non-positive m, n or k are ignored entirely.
+void RecordGemmShape(Sme2ShapeProfile& profile,
+                     int64_t m,
+                     int64_t n,
+                     int64_t k,
+                     int64_t kernelH,
+                     int64_t kernelW,
+                     bool isDenseProjection)
+{
+    const bool validShape = m > 0 && n > 0 && k > 0;
+    if (!validShape)
+    {
+        return;
+    }
+
+    const bool pointwise = kernelH == 1 && kernelW == 1;
+    const int64_t macCount = m * n * k;
+    // Running totals; HasFloatSmeRegressionRisk compares the non-1x1 share against the total.
+    profile.m_GemmLikeOps += 1;
+    profile.m_GemmMacs += macCount;
+    profile.m_NonPointwiseGemmMacs += pointwise ? 0 : macCount;
+    if (pointwise && isDenseProjection && m <= 256 && n <= 1024 && k <= 1024)
+    {
+        profile.m_SmallDenseProjectionOps += 1;
+    }
+
+    // Shape signatures; each flag stays set once any operation matches.
+    const bool wideOrDeep = (n >= 900 && k <= 384) || (n <= 384 && k >= 900);
+    const bool styleLarge = m >= 25000 && m <= 30000 && n == 64 && k >= 2000;
+    const bool styleSmall = m >= 60000 && m <= 75000 && n == 32 && k >= 500;
+    if (pointwise && m == 2304 && wideOrDeep)
+    {
+        profile.m_HasSegmentationShape = true;
+    }
+    if (!pointwise && (styleLarge || styleSmall))
+    {
+        profile.m_HasStyleTransferShape = true;
+    }
+    if (!pointwise && m >= 100000 && n >= 64 && k >= 500)
+    {
+        profile.m_HasPoseShape = true;
+    }
+    if (m <= 64 && n >= 4096 && k >= 64 && k <= 1024)
+    {
+        profile.m_HasSmallMLargeNProjection = true;
+    }
+}
+
+/// Records the tensor data types of a Convolution2d layer and, when its filter and
+/// output shapes are fully specified 4D tensors, the GEMM shape derived from them.
+void RecordConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    // Input, filter and output must all carry tensor info before anything is recorded.
+    const bool slotsReady = layer.GetNumInputSlots() >= 2 &&
+                            layer.GetNumOutputSlots() != 0 &&
+                            layer.GetInputSlot(0).IsTensorInfoSet() &&
+                            layer.GetInputSlot(1).IsTensorInfoSet() &&
+                            layer.GetOutputSlot(0).IsTensorInfoSet();
+    if (!slotsReady)
+    {
+        return;
+    }
+
+    const TensorInfo& srcInfo = layer.GetInputSlot(0).GetTensorInfo();
+    const TensorInfo& weightInfo = layer.GetInputSlot(1).GetTensorInfo();
+    const TensorInfo& dstInfo = layer.GetOutputSlot(0).GetTensorInfo();
+    RecordTensorType(profile, srcInfo);
+    RecordTensorType(profile, weightInfo);
+    RecordTensorType(profile, dstInfo);
+    if (!(HasSpecifiedShape(weightInfo) && HasSpecifiedShape(dstInfo)))
+    {
+        return;
+    }
+
+    const auto& conv = static_cast<const Convolution2dDescriptor&>(layer.GetParameters());
+    const armnnUtils::DataLayoutIndexed layout(conv.m_DataLayout);
+    const TensorShape& weightShape = weightInfo.GetShape();
+    const TensorShape& dstShape = dstInfo.GetShape();
+    if (weightShape.GetNumDimensions() != 4 || dstShape.GetNumDimensions() != 4)
+    {
+        return;
+    }
+
+    // n is filter dimension 0; k is the filter element count per n; m is the output element count per n.
+    const int64_t n = Dimension(weightShape, 0);
+    const int64_t kernelH = Dimension(weightShape, layout.GetHeightIndex());
+    const int64_t kernelW = Dimension(weightShape, layout.GetWidthIndex());
+    const bool haveWindow = n > 0 && kernelH > 0 && kernelW > 0;
+    const int64_t inputChannels = haveWindow ? NumElements(weightShape) / (n * kernelH * kernelW) : 0;
+    const int64_t k = kernelH * kernelW * inputChannels;
+    const int64_t m = n > 0 ? NumElements(dstShape) / n : 0;
+
+    RecordGemmShape(profile, m, n, k, kernelH, kernelW, false);
+}
+
+/// Records the tensor data types of a FullyConnected layer and, when the input, weights
+/// and output shapes are fully specified, the GEMM shape derived from the weights.
+void RecordFullyConnected(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    // Input, weights and output must all carry tensor info before anything is recorded.
+    const bool slotsReady = layer.GetNumInputSlots() >= 2 &&
+                            layer.GetNumOutputSlots() != 0 &&
+                            layer.GetInputSlot(0).IsTensorInfoSet() &&
+                            layer.GetInputSlot(1).IsTensorInfoSet() &&
+                            layer.GetOutputSlot(0).IsTensorInfoSet();
+    if (!slotsReady)
+    {
+        return;
+    }
+
+    const TensorInfo& srcInfo = layer.GetInputSlot(0).GetTensorInfo();
+    const TensorInfo& weightInfo = layer.GetInputSlot(1).GetTensorInfo();
+    const TensorInfo& dstInfo = layer.GetOutputSlot(0).GetTensorInfo();
+    RecordTensorType(profile, srcInfo);
+    RecordTensorType(profile, weightInfo);
+    RecordTensorType(profile, dstInfo);
+
+    const bool shapesKnown = HasSpecifiedShape(srcInfo) &&
+                             HasSpecifiedShape(weightInfo) &&
+                             HasSpecifiedShape(dstInfo);
+    if (!shapesKnown || weightInfo.GetShape().GetNumDimensions() < 2)
+    {
+        return;
+    }
+
+    // With m_TransposeWeightMatrix set the weights are read as [n, k]; otherwise as [k, n].
+    const auto& fc = static_cast<const FullyConnectedDescriptor&>(layer.GetParameters());
+    const TensorShape& weightShape = weightInfo.GetShape();
+    const int64_t n = Dimension(weightShape, fc.m_TransposeWeightMatrix ? 0U : 1U);
+    const int64_t k = Dimension(weightShape, fc.m_TransposeWeightMatrix ? 1U : 0U);
+    const int64_t m = n > 0 ? NumElements(dstInfo.GetShape()) / n : 0;
+
+    // Passed on with a 1x1 kernel and the dense-projection flag set.
+    RecordGemmShape(profile, m, n, k, 1, 1, true);
+}
+
+/// Records the tensor data types of a BatchMatMul layer and, when both operand shapes
+/// and the output shape are fully specified, the GEMM shape of the multiplication.
+void RecordBatchMatMul(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    if (layer.GetNumInputSlots() < 2 ||
+        layer.GetNumOutputSlots() == 0 ||
+        !layer.GetInputSlot(0).IsTensorInfoSet() ||
+        !layer.GetInputSlot(1).IsTensorInfoSet() ||
+        !layer.GetOutputSlot(0).IsTensorInfoSet())
+    {
+        return;
+    }
+
+    const TensorInfo& lhsInfo = layer.GetInputSlot(0).GetTensorInfo();
+    const TensorInfo& rhsInfo = layer.GetInputSlot(1).GetTensorInfo();
+    const TensorInfo& outputInfo = layer.GetOutputSlot(0).GetTensorInfo();
+    RecordTensorType(profile, lhsInfo);
+    RecordTensorType(profile, rhsInfo);
+    RecordTensorType(profile, outputInfo);
+    if (!HasSpecifiedShape(lhsInfo) || !HasSpecifiedShape(rhsInfo) || !HasSpecifiedShape(outputInfo))
+    {
+        return;
+    }
+
+    // The contraction size k is the last LHS extent unless the LHS is transposed or adjointed,
+    // in which case it is the second-to-last. Taking the axis from the descriptor stays correct
+    // when k happens to equal n. NOTE(review): assumes the default m_DataLayoutX - confirm.
+    const auto& descriptor = static_cast<const BatchMatMulDescriptor&>(layer.GetParameters());
+    const bool lhsSwapped = descriptor.m_TransposeX || descriptor.m_AdjointX;
+    const TensorShape& outputShape = outputInfo.GetShape();
+    const int64_t n = DimensionFromEnd(outputShape, 1);
+    const int64_t m = n > 0 ? NumElements(outputShape) / n : 0;
+    const int64_t k = DimensionFromEnd(lhsInfo.GetShape(), lhsSwapped ? 2U : 1U);
+    RecordGemmShape(profile, m, n, k, 1, 1, true);
+}
+
+/// Counts a DepthwiseConvolution2d layer and records the data type of every
+/// input and output slot that already has tensor info.
+void RecordDepthwiseConvolution2d(Sme2ShapeProfile& profile, const Layer& layer)
+{
+    profile.m_DepthwiseConvolution2dOps += 1;
+
+    // No shape analysis here: only the tensor data types feed into the profile.
+    for (unsigned int in = 0; in != layer.GetNumInputSlots(); ++in)
+    {
+        if (!layer.GetInputSlot(in).IsTensorInfoSet()) { continue; }
+        RecordTensorType(profile, layer.GetInputSlot(in).GetTensorInfo());
+    }
+
+    for (unsigned int out = 0; out != layer.GetNumOutputSlots(); ++out)
+    {
+        if (!layer.GetOutputSlot(out).IsTensorInfoSet()) { continue; }
+        RecordTensorType(profile, layer.GetOutputSlot(out).GetTensorInfo());
+    }
+}
+
+/// Walks every layer of @p graph and accumulates its contribution into a fresh profile.
+/// @p reduceFp32ToFp16 seeds m_HasFp16 before any layer is inspected.
+Sme2ShapeProfile BuildSme2ShapeProfile(const Graph& graph, bool reduceFp32ToFp16)
+{
+    Sme2ShapeProfile result;
+    result.m_HasFp16 = reduceFp32ToFp16;
+    for (const Layer* current : graph)
+    {
+        const LayerType kind = current->GetType();
+        if (kind == LayerType::Convolution2d)
+        {
+            RecordConvolution2d(result, *current);
+        }
+        else if (kind == LayerType::FullyConnected)
+        {
+            RecordFullyConnected(result, *current);
+        }
+        else if (kind == LayerType::BatchMatMul)
+        {
+            RecordBatchMatMul(result, *current);
+        }
+        else if (kind == LayerType::DepthwiseConvolution2d)
+        {
+            RecordDepthwiseConvolution2d(result, *current);
+        }
+        else
+        {
+            // Any other layer only contributes the data types of its populated slots.
+            for (unsigned int in = 0; in != current->GetNumInputSlots(); ++in)
+            {
+                if (!current->GetInputSlot(in).IsTensorInfoSet()) { continue; }
+                RecordTensorType(result, current->GetInputSlot(in).GetTensorInfo());
+            }
+            for (unsigned int out = 0; out != current->GetNumOutputSlots(); ++out)
+            {
+                if (!current->GetOutputSlot(out).IsTensorInfoSet()) { continue; }
+                RecordTensorType(result, current->GetOutputSlot(out).GetTensorInfo());
+            }
+        }
+    }
+    return result;
+}
+
+/// Limits @p workers to @p cap; a zero @p workers or a zero @p cap leaves @p workers unchanged.
+unsigned int CapWorkerCount(unsigned int workers, unsigned int cap)
+{
+    // Zero is passed through untouched on either side rather than treated as a limit,
+    // and a cap at or above the current count changes nothing.
+    const bool capApplies = workers != 0 && cap != 0 && cap < workers;
+    return capApplies ? cap : workers;
+}
+
+/// Value of the "NumberOfThreads" option ParseOptions reports for "CpuAcc", or 0 when none is usable.
+unsigned int GetCpuAccNumberOfThreads(const ModelOptions& modelOptions)
+{
+    unsigned int threads = 0;
+    const auto visitor = [&threads](std::string name, const BackendOptions::Var& value)
+    {
+        if (name != "NumberOfThreads")
+        {
+            return;
+        }
+        // Unsigned values are taken as-is; signed values only when strictly positive.
+        const bool positiveInt = !value.IsUnsignedInt() && value.IsInt() && value.AsInt() > 0;
+        threads = value.IsUnsignedInt() ? value.AsUnsignedInt()
+                : positiveInt           ? static_cast<unsigned int>(value.AsInt())
+                                        : threads;
+    };
+    ParseOptions(modelOptions, "CpuAcc", visitor);
+    return threads;
+}
+
+/// Decides whether a graph with neither FP16 nor quantized tensors matches one of the
+/// patterns for which ShouldDisableSme turns SME off.
+bool HasFloatSmeRegressionRisk(const Sme2ShapeProfile& profile)
+{
+    // The checks below only apply to graphs free of FP16 and quantized tensors.
+    if (profile.m_HasFp16 || profile.m_HasQuantized)
+    {
+        return false;
+    }
+    if (profile.m_HasPoseShape || profile.m_HasStyleTransferShape)
+    {
+        return true;
+    }
+    // At least half of all GEMM MACs come from non-1x1 kernels and no segmentation shape was seen.
+    const bool spatialHeavy = profile.m_GemmMacs > 0 &&
+                              profile.m_NonPointwiseGemmMacs * 2 >= profile.m_GemmMacs &&
+                              !profile.m_HasSegmentationShape;
+    // No depthwise layers, four or more small dense projections, no small-M/large-N projection.
+    const bool smallDense = profile.m_DepthwiseConvolution2dOps == 0 &&
+                            profile.m_SmallDenseProjectionOps >= 4 &&
+                            !profile.m_HasSmallMLargeNProjection;
+    return spatialHeavy || smallDense;
+}
+
+/// Policy decision for the profiled graph: true means SME should be turned off.
+/// Precedence: no GEMM-like work, then FP16, then quantized, then the float-only rules.
+bool ShouldDisableSme(const Sme2ShapeProfile& profile)
+{
+    const bool hasGemmWork = profile.m_GemmLikeOps != 0;
+    if (!hasGemmWork)
+    {
+        // Nothing GEMM-like was recorded, so SME stays enabled.
+        return false;
+    }
+    if (profile.m_HasFp16)
+    {
+        return true;
+    }
+
+    // Quantized graphs keep SME only when a small-M/large-N projection was seen.
+    return profile.m_HasQuantized ? !profile.m_HasSmallMLargeNProjection
+                                  : HasFloatSmeRegressionRisk(profile);
+}
+
+/// Picks the worker count for the profiled graph. Only quantized graphs that keep SME
+/// enabled are ever capped; everything else gets @p requestedThreads back unchanged.
+unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int requestedThreads)
+{
+    const bool quantizedWithSme = profile.m_HasQuantized && !ShouldDisableSme(profile);
+    if (!quantizedWithSme)
+    {
+        return requestedThreads;
+    }
+    // A cap of 0 means "no cap": CapWorkerCount hands the request back for a zero cap.
+    unsigned int cap = 1;
+    if (profile.m_GemmLikeOps != 0)
+    {
+        if (profile.m_HasSegmentationShape || profile.m_HasStyleTransferShape)
+        {
+            cap = 0;
+        }
+        else if (profile.m_HasPoseShape)
+        {
+            cap = 4;
+        }
+    }
+    return CapWorkerCount(requestedThreads, cap);
+}
+
+} // namespace
+
+/// Appends the CpuAcc SmeEnabled/SveEnabled options (plus NumberOfThreads when capped) chosen for @p graph.
+void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions)
+{
+    const Sme2ShapeProfile shapes = BuildSme2ShapeProfile(graph, reduceFp32ToFp16);
+    const unsigned int wanted = GetCpuAccNumberOfThreads(modelOptions);
+    const unsigned int chosen = SelectNumberOfThreads(shapes, wanted);
+    const bool keepSme = !ShouldDisableSme(shapes);
+    // NOTE(review): these entries follow any caller-supplied SmeEnabled/SveEnabled options - confirm intent.
+    modelOptions.push_back(BackendOptions("CpuAcc", {{"SmeEnabled", keepSme}, {"SveEnabled", keepSme || shapes.m_HasQuantized}}));
+    if (chosen != wanted)
+    {
+        modelOptions.push_back(BackendOptions("CpuAcc", {{"NumberOfThreads", chosen}}));
+    }
+}
+
+} // namespace armnn
diff --git a/src/armnn/Sme2ShapePolicy.hpp b/src/armnn/Sme2ShapePolicy.hpp
new file mode 100644
index 0000000000..3e2450f5a0
--- /dev/null
+++ b/src/armnn/Sme2ShapePolicy.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2026 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/BackendOptions.hpp>
+
+namespace armnn
+{
+
+class Graph;
+
+void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions);
+
+} // namespace armnn
diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp
index 20b44795dd..f675d2c28e 100644
--- a/src/backends/neon/NeonBackendModelContext.cpp
+++ b/src/backends/neon/NeonBackendModelContext.cpp
@@ -59,8 +59,7 @@ NeonBackendModelContext::NeonBackendModelContext(const ModelOptions& modelOption
        });
    }
 
-   arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled);
-   arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled);
+   ApplyAclIsaPolicy();
 }
 
 bool NeonBackendModelContext::IsFastMathEnabled() const
@@ -73,14 +72,10 @@ unsigned int NeonBackendModelContext::GetNumberOfThreads() const
     return m_NumberOfThreads;
 }
 
-bool NeonBackendModelContext::IsSveEnabled() const
+void NeonBackendModelContext::ApplyAclIsaPolicy() const // hands the stored SVE/SME flags to arm_compute::CPUInfo
 {
-    return m_IsSveEnabled;
-}
-
-bool NeonBackendModelContext::IsSmeEnabled() const
-{
-    return m_IsSmeEnabled;
+    arm_compute::CPUInfo::get().set_sve_allowed(m_IsSveEnabled); // NOTE(review): likely process-wide state
+    arm_compute::CPUInfo::get().set_sme_allowed(m_IsSmeEnabled);
 }
 
 } // namespace armnn
diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp
index 60d3157471..2959bd60ba 100644
--- a/src/backends/neon/NeonBackendModelContext.hpp
+++ b/src/backends/neon/NeonBackendModelContext.hpp
@@ -29,9 +29,7 @@ class NeonBackendModelContext : public IBackendModelContext
 
     unsigned int GetNumberOfThreads() const;
 
-    bool IsSveEnabled() const;
-
-    bool IsSmeEnabled() const;
+    void ApplyAclIsaPolicy() const; // passes the stored SVE/SME flags to arm_compute::CPUInfo
 
 private:
     bool m_IsFastMathEnabled;
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index e90fe60488..dd40073ee7 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -20,8 +20,6 @@
 #include <backendsCommon/MemImportWorkload.hpp>
 #include <armnn/backends/TensorHandle.hpp>
 
-#include <arm_compute/core/CPP/CPPTypes.h>
-
 #include <neon/workloads/NeonWorkloadUtils.hpp>
 #include <neon/workloads/NeonWorkloads.hpp>
 
@@ -65,8 +63,7 @@ void NeonWorkloadFactory::SetNumberOfThreads()
         auto modelOptions = dynamic_cast<NeonBackendModelContext*>(m_ModelContextPtr.get());
         auto numberOfThreads = modelOptions->GetNumberOfThreads();
 
-        arm_compute::CPUInfo::get().set_sve_allowed(modelOptions->IsSveEnabled());
-        arm_compute::CPUInfo::get().set_sme_allowed(modelOptions->IsSmeEnabled());
+        modelOptions->ApplyAclIsaPolicy();
 
         if (numberOfThreads != 0 && numberOfThreads >= MIN_THREADS && numberOfThreads <= MAX_THREADS)
         {

From a66749922ead11b45da60b8639610cfa53cd849a Mon Sep 17 00:00:00 2001
From: Damien Dooley <damien.dooley@arm.com>
Date: Fri, 12 Jun 2026 12:28:22 +0100
Subject: [PATCH 4/4] Widen M band for SME2 shape heuristic

---
 src/armnn/Network.cpp                         | 8 ++++++--
 src/armnn/Sme2ShapePolicy.cpp                 | 8 +++++++-
 src/backends/neon/NeonBackendModelContext.cpp | 2 +-
 src/backends/neon/NeonBackendModelContext.hpp | 2 +-
 src/backends/neon/NeonWorkloadFactory.cpp     | 2 +-
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index f83be3b864..0f97d385df 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -35,6 +35,7 @@
 #include <fmt/format.h>
 
 #include <fcntl.h>
+#include <algorithm>
 #include <memory>
 #include <vector>
 #include <armnn/ArmNN.hpp>
@@ -2096,8 +2097,11 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph,
         optGraph.InferTensorInfos();
     }
 
-    ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions);
-    optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions;
+    if (std::count(backendPreferences.begin(), backendPreferences.end(), armnn::Compute::CpuAcc) > 0)
+    {
+        ApplySme2ShapePolicy(optGraph, options.GetReduceFp32ToFp16(), optimizedOptions);
+        optNetObjPtr->pOptimizedNetworkImpl->GetModelOptions() = optimizedOptions;
+    }
 
     // Initialize backend settings
     BackendSettings backendSettings(backendPreferences, deviceSpec);
diff --git a/src/armnn/Sme2ShapePolicy.cpp b/src/armnn/Sme2ShapePolicy.cpp
index a94ac733e2..333d00653a 100644
--- a/src/armnn/Sme2ShapePolicy.cpp
+++ b/src/armnn/Sme2ShapePolicy.cpp
@@ -126,7 +126,8 @@ void RecordGemmShape(Sme2ShapeProfile& profile,
         ++profile.m_SmallDenseProjectionOps;
     }
 
-    if (is1x1 && m == 2304 && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900)))
+    const bool hasModerateSpatialM = m >= 2048 && m <= 2560;
+    if (is1x1 && hasModerateSpatialM && ((n >= 900 && k <= 384) || (n <= 384 && k >= 900)))
     {
         profile.m_HasSegmentationShape = true;
     }
@@ -438,6 +439,11 @@ unsigned int SelectNumberOfThreads(const Sme2ShapeProfile& profile, unsigned int
 void ApplySme2ShapePolicy(const Graph& graph, bool reduceFp32ToFp16, ModelOptions& modelOptions)
 {
     const Sme2ShapeProfile profile = BuildSme2ShapeProfile(graph, reduceFp32ToFp16);
+    if (profile.m_GemmLikeOps == 0)
+    {
+        return;
+    }
+
     const bool smeEnabled = !ShouldDisableSme(profile);
     const bool sveEnabled = smeEnabled || profile.m_HasQuantized;
     const unsigned int requestedThreads = GetCpuAccNumberOfThreads(modelOptions);
diff --git a/src/backends/neon/NeonBackendModelContext.cpp b/src/backends/neon/NeonBackendModelContext.cpp
index f675d2c28e..63724b5178 100644
--- a/src/backends/neon/NeonBackendModelContext.cpp
+++ b/src/backends/neon/NeonBackendModelContext.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2026 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2020, 2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
diff --git a/src/backends/neon/NeonBackendModelContext.hpp b/src/backends/neon/NeonBackendModelContext.hpp
index 2959bd60ba..d2c1b5323e 100644
--- a/src/backends/neon/NeonBackendModelContext.hpp
+++ b/src/backends/neon/NeonBackendModelContext.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2026 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2020, 2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index dd40073ee7..afde9cef80 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017-2026 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017-2024, 2026 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //