arm · Copilot · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+benchmark
+benchmark-arm64
+*.o
diff --git a/Dockerfile b/Dockerfile
@@ -16,7 +16,7 @@ COPY *.h ./
 COPY *.cpp ./
 
 # Build the application with optimizations
-# SSE2 intrinsics are used in the code for x86-64 platforms
+# Supports both x86-64 SSE2 intrinsics and ARM64 NEON intrinsics
 RUN g++ -O2 -o benchmark \
     main.cpp \
     matrix_operations.cpp \

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Compute Benchmark Suite
 
-A high-performance compute benchmark application optimized for x86-64 architecture with SSE2 SIMD instructions.
+A high-performance compute benchmark application optimized for both x86-64 and ARM64 architectures with SIMD instructions.
 
 ## Overview
 
@@ -11,7 +11,9 @@ This benchmark suite tests various compute-intensive operations including:
 - Memory operations (50MB copy operations)
 - Polynomial evaluation (10M iterations)
 
-The code is optimized using x86 SSE2 SIMD intrinsics for maximum performance on Intel and AMD processors.
+The code is optimized using:
+- **x86-64**: SSE2 SIMD intrinsics for Intel and AMD processors
+- **ARM64**: NEON SIMD intrinsics for ARM processors
 
 ## Building with Docker
 
@@ -33,16 +35,18 @@ This will execute all benchmark tests and display timing results for each operat
 
 ## Architecture Notes
 
-- **Optimized for**: x86-64 architecture with SSE2 support
-- **SIMD Instructions**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
-- **Fallback**: Includes scalar fallback implementation for non-x86 platforms
+- **x86-64**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
+- **ARM64**: Uses NEON intrinsics (`float64x2_t`, `uint8x16_t`) for vectorized operations
+- **Fallback**: Includes a scalar fallback implementation for other platforms
+
+The code automatically detects the architecture at compile time and uses the appropriate SIMD instructions.
 
 ## Output Example
 
 ```
 ========================================
   Compute Benchmark Suite
-  x86-64 with SSE2 Optimizations
+  ARM64 with NEON Optimizations
 ========================================
 
 === Matrix Multiplication Benchmark ===
@@ -69,4 +73,4 @@ The benchmark suite is organized into separate modules:
 - `memory_operations.{h,cpp}` - Fast memory copy operations
 - `polynomial_eval.{h,cpp}` - Vectorized polynomial evaluation
 
-Each module uses C++11 standard library and x86 SSE2 intrinsics where applicable.
+Each module uses the C++11 standard library, with architecture-specific SIMD intrinsics (SSE2 for x86-64, NEON for ARM64) where applicable.
diff --git a/_codeql_detected_source_root b/_codeql_detected_source_root
@@ -0,0 +1 @@
+.
diff --git a/hash_operations.cpp b/hash_operations.cpp
@@ -4,11 +4,17 @@
 #include <chrono>
 #include <iomanip>
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__amd64__)
 #include <immintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 unsigned long long compute_hash(const char* data, size_t len) {
@@ -20,20 +26,30 @@ unsigned long long compute_hash(const char* data, size_t len) {
     for (; i + 16 <= len; i += 16) {
         __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));
 
-        // Extract bytes and update hash
+        // Store to array and extract bytes
+        alignas(16) unsigned char bytes[16];
+        _mm_store_si128(reinterpret_cast<__m128i*>(bytes), chunk);
+
+        for (int j = 0; j < 16; j++) {
+            hash = ((hash << 5) + hash) + bytes[j];
+        }
+    }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    for (; i + 16 <= len; i += 16) {
+        uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));
+
+        // Store to array and extract bytes
+        alignas(16) unsigned char bytes[16];
+        vst1q_u8(bytes, chunk);
+
         for (int j = 0; j < 16; j++) {
-            unsigned char byte = _mm_extract_epi16(chunk, j / 2);
-            if (j % 2 == 0) {
-                byte = byte & 0xFF;
-            } else {
-                byte = (byte >> 8) & 0xFF;
-            }
-            hash = ((hash << 5) + hash) + byte;
+            hash = ((hash << 5) + hash) + bytes[j];
         }
     }
 #endif
 
-    // Process remaining bytes (or all bytes on non-x86)
+    // Process remaining bytes (or all bytes on non-SIMD platforms)
     for (; i < len; i++) {
         hash = ((hash << 5) + hash) + data[i];
     }

diff --git a/invocation_reasons.yaml b/invocation_reasons.yaml
@@ -0,0 +1,34 @@
+---
+id: ae8a3c57-43f0-420b-8a30-4fdbb638925c
+timestamp: '2026-02-04T21:48:05.998962+00:00'
+tool: check_image
+args:
+  image: ubuntu:22.04
+reason: Checking if the ubuntu:22.04 base image in the Dockerfile supports ARM architecture
+---
+id: 2cc8eb7d-74e8-48b7-953b-eb1c0aaa8ae6
+timestamp: '2026-02-04T21:48:10.913142+00:00'
+tool: knowledge_base_search
+args:
+  query: Is g++ compiler compatible with ARM architecture?
+reason: Verifying that the g++ compiler package in the Dockerfile is compatible with
+  ARM
+---
+id: 86964ccd-504a-42c0-a95e-72fc5d7c017b
+timestamp: '2026-02-04T21:48:10.932771+00:00'
+tool: knowledge_base_search
+args:
+  query: Is make build tool compatible with ARM architecture?
+reason: Verifying that the make package in the Dockerfile is compatible with ARM
+---
+id: 91d9de77-dd63-4f3c-9ab4-bff77b0385f2
+timestamp: '2026-02-04T21:48:16.681056+00:00'
+tool: migrate_ease_scan
+args:
+  scanner: cpp
+  arch: armv8-a
+  git_repo: null
+  output_format: json
+  extra_args: null
+reason: Scanning the C++ codebase to identify x86-specific code that needs to be migrated
+  to ARM, including SSE2 intrinsics and architecture-specific dependencies
diff --git a/main.cpp b/main.cpp
@@ -10,20 +10,26 @@
 #include "memory_operations.h"
 #include "polynomial_eval.h"
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__amd64__)
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 int main() {
     std::cout << "========================================" << std::endl;
     std::cout << "  Compute Benchmark Suite" << std::endl;
 #if USE_X86_SIMD
     std::cout << "  x86-64 with SSE2 Optimizations" << std::endl;
+#elif USE_ARM_NEON
+    std::cout << "  ARM64 with NEON Optimizations" << std::endl;
 #else
     std::cout << "  Generic Build (No SIMD)" << std::endl;
-    std::cout << "  NOTE: This code is optimized for x86-64" << std::endl;
 #endif
     std::cout << "========================================" << std::endl;
 

diff --git a/matrix_operations.cpp b/matrix_operations.cpp
@@ -4,11 +4,17 @@
 #include <chrono>
 #include <stdexcept>
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__amd64__)
 #include <immintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__)  // AArch64 only: the NEON path uses float64x2_t/vfmaq_f64/vaddvq_f64, which 32-bit ARM NEON does not provide
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
@@ -58,6 +64,31 @@ Matrix Matrix::multiply(const Matrix& other) const {
                 sum += data[i][k] * other.data[k][j];
             }
 
+            result.data[i][j] = sum;
+        }
+    }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < other.cols; j++) {
+            float64x2_t sum_vec = vdupq_n_f64(0.0);
+            size_t k = 0;
+
+            // Process 2 elements at a time with NEON
+            for (; k + 1 < cols; k += 2) {
+                float64x2_t a_vec = vld1q_f64(&data[i][k]);
+                float64x2_t b_vec = {other.data[k][j], other.data[k+1][j]};
+                sum_vec = vfmaq_f64(sum_vec, a_vec, b_vec);
+            }
+
+            // Horizontal add
+            double sum = vaddvq_f64(sum_vec);
+
+            // Handle remaining element
+            if (k < cols) {
+                sum += data[i][k] * other.data[k][j];
+            }
+
             result.data[i][j] = sum;
         }
     }

diff --git a/memory_operations.cpp b/memory_operations.cpp
@@ -3,11 +3,17 @@
 #include <vector>
 #include <chrono>
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__amd64__)
 #include <immintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 void fast_memcpy(void* dest, const void* src, size_t n) {
@@ -21,9 +27,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
         __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
         _mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
     }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    for (; i + 16 <= n; i += 16) {
+        uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
+        vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
+    }
 #endif
 
-    // Copy remaining bytes (or all on non-x86)
+    // Copy remaining bytes (or all on non-SIMD platforms)
     for (; i < n; i++) {
         d[i] = s[i];
     }

diff --git a/polynomial_eval.cpp b/polynomial_eval.cpp
@@ -2,11 +2,17 @@
 #include <iostream>
 #include <chrono>
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__amd64__)
 #include <immintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__)  // AArch64 only: the NEON path uses float64x2_t/vfmaq_f64/vaddvq_f64, which 32-bit ARM NEON does not provide
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
@@ -39,6 +45,31 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
         result += coeffs[i] * power_arr[0];
     }
 
+    return result;
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    float64x2_t result_vec = vdupq_n_f64(0.0);
+    float64x2_t power_vec = {1.0, x};
+    float64x2_t power_mult = vdupq_n_f64(x * x);
+
+    size_t i = 0;
+
+    // Process 2 coefficients at a time
+    for (; i + 1 < coeffs.size(); i += 2) {
+        float64x2_t coeff_vec = {coeffs[i], coeffs[i + 1]};
+        result_vec = vfmaq_f64(result_vec, coeff_vec, power_vec);
+        power_vec = vmulq_f64(power_vec, power_mult);
+    }
+
+    // Horizontal add
+    double result = vaddvq_f64(result_vec);
+
+    // Handle remaining coefficient
+    if (i < coeffs.size()) {
+        double power = vgetq_lane_f64(power_vec, 0);
+        result += coeffs[i] * power;
+    }
+
     return result;
 #else
     // Fallback scalar implementation

diff --git a/string_search.cpp b/string_search.cpp
@@ -2,11 +2,17 @@
 #include <iostream>
 #include <chrono>
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(__amd64__)
 #include <immintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 int simd_string_search(const std::string& text, const std::string& pattern) {
@@ -44,9 +50,35 @@ int simd_string_search(const std::string& text, const std::string& pattern) {
             }
         }
     }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    uint8x16_t first_char_vec = vdupq_n_u8(static_cast<uint8_t>(first_char));
+
+    for (; i + 16 <= text_len - pattern_len + 1; i += 16) {
+        uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i));
+        uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec);
+
+        // Store comparison result to array
+        alignas(16) uint8_t cmp_result[16];
+        vst1q_u8(cmp_result, cmp);
+
+        // Check each potential match
+        for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) {
+            if (cmp_result[bit] != 0) {
+                bool match = true;
+                for (size_t j = 1; j < pattern_len; j++) {
+                    if (text[i + bit + j] != pattern[j]) {
+                        match = false;
+                        break;
+                    }
+                }
+                if (match) count++;
+            }
+        }
+    }
 #endif
 
-    // Handle remaining characters (or all on non-x86)
+    // Handle remaining characters (or all on non-SIMD platforms)
     for (; i <= text_len - pattern_len; i++) {
         bool match = true;
         for (size_t j = 0; j < pattern_len; j++) {