Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ COPY *.h ./
COPY *.cpp ./

# Build the application with optimizations
# SSE2 intrinsics are used in the code for x86-64 platforms
RUN g++ -O2 -o benchmark \
main.cpp \
matrix_operations.cpp \
hash_operations.cpp \
string_search.cpp \
memory_operations.cpp \
polynomial_eval.cpp \
-std=c++11
# Detects architecture at build time: NEON baseline on AArch64, SSE2 baseline on x86-64.
# NOTE(review): avoid -march=native in container builds — it specializes the binary
# for the *build host's* CPU, so the image can crash with SIGILL when run on an
# older x86-64 machine. -march=x86-64 is the portable baseline and already
# guarantees SSE2, which is all the x86 SIMD paths in this code require.
RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "aarch64" ]; then \
        MARCH_FLAG="-march=armv8-a"; \
    else \
        MARCH_FLAG="-march=x86-64 -mtune=generic"; \
    fi && \
    g++ -O3 $MARCH_FLAG -o benchmark \
        main.cpp \
        matrix_operations.cpp \
        hash_operations.cpp \
        string_search.cpp \
        memory_operations.cpp \
        polynomial_eval.cpp \
        -std=c++14

# Create a startup script
COPY start.sh .
Expand Down
21 changes: 19 additions & 2 deletions hash_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

unsigned long long compute_hash(const char* data, size_t len) {
unsigned long long hash = 5381;
size_t i = 0;
Expand All @@ -31,11 +38,21 @@ unsigned long long compute_hash(const char* data, size_t len) {
hash = ((hash << 5) + hash) + byte;
}
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON: load 16 bytes at a time
for (; i + 16 <= len; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));
uint8_t bytes[16];
vst1q_u8(bytes, chunk);
for (int j = 0; j < 16; j++) {
hash = ((hash << 5) + hash) + bytes[j];
}
}
#endif

// Process remaining bytes (or all bytes on non-x86)
// Process remaining bytes (or all bytes on non-SIMD)
for (; i < len; i++) {
hash = ((hash << 5) + hash) + data[i];
hash = ((hash << 5) + hash) + static_cast<unsigned char>(data[i]);
}

return hash;
Expand Down
39 changes: 39 additions & 0 deletions invocation_reasons.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
---
id: f0ef4f16-bb37-4361-9a72-6454cff5ec4e
timestamp: '2026-03-10T20:52:56.735332+00:00'
tool: check_image
args:
image: ubuntu:22.04
reason: Checking if ubuntu:22.04 base image supports ARM64 architecture
---
id: 76249d17-ab54-4117-9116-e9b9f5ac6c1b
timestamp: '2026-03-10T20:52:56.964652+00:00'
tool: migrate_ease_scan
args:
scanner: cpp
arch: armv8-a
git_repo: null
output_format: json
extra_args: null
reason: Scanning C++ codebase for x86-specific code that needs ARM migration
---
id: 6489af70-f9ce-41f2-9f07-eb4c33a0cf4b
timestamp: '2026-03-10T20:53:35.583334+00:00'
tool: knowledge_base_search
args:
query: Is g++ compatible with ARM architecture?
reason: Checking if g++ compiler package in Dockerfile is ARM compatible
---
id: 992020cb-0b7e-42f7-a50d-e6f7f4b3ec11
timestamp: '2026-03-10T20:53:35.605637+00:00'
tool: knowledge_base_search
args:
query: Is make compatible with ARM architecture?
reason: Checking if make build tool in Dockerfile is ARM compatible
---
id: 34af97ec-cfdd-4e25-b5a2-279e3e730aec
timestamp: '2026-03-10T20:53:35.618797+00:00'
tool: knowledge_base_search
args:
query: ARM NEON SIMD intrinsics equivalent to SSE2 x86 for C++ migration
reason: Finding NEON equivalents for SSE2 intrinsics used in the codebase
11 changes: 9 additions & 2 deletions main.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* High-Performance Compute Benchmark Suite
* Optimized for x86-64 architecture with SSE/AVX SIMD instructions
* Optimized for x86-64 (SSE2) and AArch64 (NEON) architectures
*/

#include <iostream>
Expand All @@ -16,14 +16,21 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

int main() {
std::cout << "========================================" << std::endl;
std::cout << " Compute Benchmark Suite" << std::endl;
#if USE_X86_SIMD
std::cout << " x86-64 with SSE2 Optimizations" << std::endl;
#elif USE_ARM_NEON
std::cout << " AArch64 with NEON Optimizations" << std::endl;
#else
std::cout << " Generic Build (No SIMD)" << std::endl;
std::cout << " NOTE: This code is optimized for x86-64" << std::endl;
#endif
std::cout << "========================================" << std::endl;

Expand Down
33 changes: 33 additions & 0 deletions matrix_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

// Construct an r-by-c matrix with every element zero-initialized.
Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
    data.assign(rows, std::vector<double>(cols, 0.0));
}
Expand Down Expand Up @@ -58,6 +65,32 @@ Matrix Matrix::multiply(const Matrix& other) const {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < other.cols; j++) {
float64x2_t sum_vec = vdupq_n_f64(0.0);
size_t k = 0;

// Process 2 elements at a time with NEON
for (; k + 1 < cols; k += 2) {
float64x2_t a_vec = vld1q_f64(&data[i][k]);
double b_vals[2] = {other.data[k][j], other.data[k+1][j]};
float64x2_t b_vec = vld1q_f64(b_vals);
sum_vec = vmlaq_f64(sum_vec, a_vec, b_vec);
}

// Horizontal add
double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1);

// Handle remaining element
if (k < cols) {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
Expand Down
2 changes: 1 addition & 1 deletion matrix_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <vector>
#include <cstddef>

// Matrix class with x86 SSE2 optimizations
// Matrix class with x86 SSE2 and AArch64 NEON optimizations
class Matrix {
private:
std::vector<std::vector<double>> data;
Expand Down
15 changes: 14 additions & 1 deletion memory_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

void fast_memcpy(void* dest, const void* src, size_t n) {
char* d = static_cast<char*>(dest);
const char* s = static_cast<const char*>(src);
Expand All @@ -21,9 +28,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON: copy 16 bytes at a time
for (; i + 16 <= n; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
}
#endif

// Copy remaining bytes (or all on non-x86)
// Copy remaining bytes (or all on non-SIMD)
for (; i < n; i++) {
d[i] = s[i];
}
Expand Down
44 changes: 40 additions & 4 deletions polynomial_eval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,19 @@
#define USE_X86_SIMD 0
#endif

double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

double polynomial_eval_simd(double x, const std::vector<double>& coeffs) {
#if USE_X86_SIMD
// x86-64 optimized path using SSE2
__m128d result_vec = _mm_setzero_pd();
__m128d x_vec = _mm_set1_pd(x);
__m128d power_vec = _mm_set_pd(x, 1.0); // [x, 1.0]
__m128d power_mult = _mm_set1_pd(x * x);
__m128d power_vec = _mm_set_pd(x, 1.0); // _mm_set_pd is high-to-low: element[0]=1.0, element[1]=x

size_t i = 0;

Expand All @@ -39,6 +45,36 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
result += coeffs[i] * power_arr[0];
}

return result;
#elif USE_ARM_NEON
// AArch64 optimized path using NEON
float64x2_t result_vec = vdupq_n_f64(0.0);
float64x2_t x_sq_vec = vdupq_n_f64(x * x);
// element[0]=1.0 (power for even-indexed coeffs), element[1]=x (power for odd-indexed coeffs)
// vld1q_f64 is low-to-high: same ordering as array indices (consistent with SSE2 element[0]/[1] above)
double init_powers[2] = {1.0, x};
float64x2_t power_vec = vld1q_f64(init_powers);

size_t i = 0;

// Process 2 coefficients at a time
for (; i + 1 < coeffs.size(); i += 2) {
double coeff_vals[2] = {coeffs[i], coeffs[i + 1]};
float64x2_t coeff_vec = vld1q_f64(coeff_vals);
result_vec = vmlaq_f64(result_vec, coeff_vec, power_vec);
power_vec = vmulq_f64(power_vec, x_sq_vec);
}

// Horizontal add
double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1);

// Handle remaining coefficient
if (i < coeffs.size()) {
double power_arr[2];
vst1q_f64(power_arr, power_vec);
result += coeffs[i] * power_arr[0];
}

return result;
#else
// Fallback scalar implementation
Expand All @@ -61,7 +97,7 @@ void benchmark_polynomial() {
auto start = std::chrono::high_resolution_clock::now();
double sum = 0.0;
for (int i = 0; i < iterations; i++) {
sum += polynomial_eval_sse(1.5 + i * 0.0001, coeffs);
sum += polynomial_eval_simd(1.5 + i * 0.0001, coeffs);
}
auto end = std::chrono::high_resolution_clock::now();

Expand Down
2 changes: 1 addition & 1 deletion polynomial_eval.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <vector>

// Vectorized polynomial evaluation using SIMD (x86-64 SSE2 or AArch64 NEON; scalar fallback otherwise)
double polynomial_eval_sse(double x, const std::vector<double>& coeffs);
double polynomial_eval_simd(double x, const std::vector<double>& coeffs);

// Benchmark function
void benchmark_polynomial();
Expand Down
35 changes: 34 additions & 1 deletion string_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

int simd_string_search(const std::string& text, const std::string& pattern) {
int count = 0;
size_t text_len = text.length();
Expand Down Expand Up @@ -44,9 +51,35 @@ int simd_string_search(const std::string& text, const std::string& pattern) {
}
}
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON
uint8x16_t first_char_vec = vdupq_n_u8(static_cast<uint8_t>(first_char));

for (; i + 16 <= text_len - pattern_len + 1; i += 16) {
uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i));
uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec);

// Store comparison result as a byte array (0xFF = match, 0x00 = no match)
uint8_t cmp_bytes[16];
vst1q_u8(cmp_bytes, cmp);

// Check each potential match position
for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) {
if (cmp_bytes[bit]) {
bool match = true;
for (size_t j = 1; j < pattern_len; j++) {
if (text[i + bit + j] != pattern[j]) {
match = false;
break;
}
}
if (match) count++;
}
}
}
#endif

// Handle remaining characters (or all on non-x86)
// Handle remaining characters (or all on non-SIMD)
for (; i <= text_len - pattern_len; i++) {
bool match = true;
for (size_t j = 0; j < pattern_len; j++) {
Expand Down