diff --git a/Dockerfile b/Dockerfile index 24fec07..47beb5d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,15 +16,21 @@ COPY *.h ./ COPY *.cpp ./ # Build the application with optimizations -# SSE2 intrinsics are used in the code for x86-64 platforms -RUN g++ -O2 -o benchmark \ - main.cpp \ - matrix_operations.cpp \ - hash_operations.cpp \ - string_search.cpp \ - memory_operations.cpp \ - polynomial_eval.cpp \ - -std=c++11 +# Detects architecture at build time: uses NEON on AArch64, native opts on x86-64 +RUN ARCH=$(uname -m) && \ + if [ "$ARCH" = "aarch64" ]; then \ + MARCH_FLAG="-march=armv8-a"; \ + else \ + MARCH_FLAG="-march=native"; \ + fi && \ + g++ -O3 $MARCH_FLAG -o benchmark \ + main.cpp \ + matrix_operations.cpp \ + hash_operations.cpp \ + string_search.cpp \ + memory_operations.cpp \ + polynomial_eval.cpp \ + -std=c++14 # Create a startup script COPY start.sh . diff --git a/hash_operations.cpp b/hash_operations.cpp index 0d1d1ca..20f978a 100644 --- a/hash_operations.cpp +++ b/hash_operations.cpp @@ -11,6 +11,13 @@ #define USE_X86_SIMD 0 #endif +#ifdef __aarch64__ +#include <arm_neon.h> +#define USE_ARM_NEON 1 +#else +#define USE_ARM_NEON 0 +#endif + unsigned long long compute_hash(const char* data, size_t len) { unsigned long long hash = 5381; size_t i = 0; @@ -31,11 +38,21 @@ unsigned long long compute_hash(const char* data, size_t len) { hash = ((hash << 5) + hash) + byte; } } +#elif USE_ARM_NEON + // AArch64 optimized path using NEON: load 16 bytes at a time + for (; i + 16 <= len; i += 16) { + uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i)); + uint8_t bytes[16]; + vst1q_u8(bytes, chunk); + for (int j = 0; j < 16; j++) { + hash = ((hash << 5) + hash) + bytes[j]; + } + } #endif - // Process remaining bytes (or all bytes on non-x86) + // Process remaining bytes (or all bytes on non-SIMD) for (; i < len; i++) { - hash = ((hash << 5) + hash) + data[i]; + hash = ((hash << 5) + hash) + static_cast<unsigned char>(data[i]); } return hash; diff --git a/invocation_reasons.yaml 
b/invocation_reasons.yaml new file mode 100644 index 0000000..7af95ba --- /dev/null +++ b/invocation_reasons.yaml @@ -0,0 +1,39 @@ +--- +id: f0ef4f16-bb37-4361-9a72-6454cff5ec4e +timestamp: '2026-03-10T20:52:56.735332+00:00' +tool: check_image +args: + image: ubuntu:22.04 +reason: Checking if ubuntu:22.04 base image supports ARM64 architecture +--- +id: 76249d17-ab54-4117-9116-e9b9f5ac6c1b +timestamp: '2026-03-10T20:52:56.964652+00:00' +tool: migrate_ease_scan +args: + scanner: cpp + arch: armv8-a + git_repo: null + output_format: json + extra_args: null +reason: Scanning C++ codebase for x86-specific code that needs ARM migration +--- +id: 6489af70-f9ce-41f2-9f07-eb4c33a0cf4b +timestamp: '2026-03-10T20:53:35.583334+00:00' +tool: knowledge_base_search +args: + query: Is g++ compatible with ARM architecture? +reason: Checking if g++ compiler package in Dockerfile is ARM compatible +--- +id: 992020cb-0b7e-42f7-a50d-e6f7f4b3ec11 +timestamp: '2026-03-10T20:53:35.605637+00:00' +tool: knowledge_base_search +args: + query: Is make compatible with ARM architecture? 
+reason: Checking if make build tool in Dockerfile is ARM compatible --- id: 34af97ec-cfdd-4e25-b5a2-279e3e730aec timestamp: '2026-03-10T20:53:35.618797+00:00' tool: knowledge_base_search args: + query: ARM NEON SIMD intrinsics equivalent to SSE2 x86 for C++ migration reason: Finding NEON equivalents for SSE2 intrinsics used in the codebase diff --git a/main.cpp b/main.cpp index 1c6e1a7..36f4291 100644 --- a/main.cpp +++ b/main.cpp @@ -1,6 +1,6 @@ /* * High-Performance Compute Benchmark Suite - * Optimized for x86-64 architecture with SSE/AVX SIMD instructions + * Optimized for x86-64 (SSE2) and AArch64 (NEON) architectures */ #include <iostream> @@ -16,14 +16,21 @@ #define USE_X86_SIMD 0 #endif +#ifdef __aarch64__ +#define USE_ARM_NEON 1 +#else +#define USE_ARM_NEON 0 +#endif + int main() { std::cout << "========================================" << std::endl; std::cout << " Compute Benchmark Suite" << std::endl; #if USE_X86_SIMD std::cout << " x86-64 with SSE2 Optimizations" << std::endl; +#elif USE_ARM_NEON + std::cout << " AArch64 with NEON Optimizations" << std::endl; #else std::cout << " Generic Build (No SIMD)" << std::endl; - std::cout << " NOTE: This code is optimized for x86-64" << std::endl; #endif std::cout << "========================================" << std::endl; diff --git a/matrix_operations.cpp b/matrix_operations.cpp index f85a899..3af99d9 100644 --- a/matrix_operations.cpp +++ b/matrix_operations.cpp @@ -11,6 +11,13 @@ #define USE_X86_SIMD 0 #endif +#ifdef __aarch64__ +#include <arm_neon.h> +#define USE_ARM_NEON 1 +#else +#define USE_ARM_NEON 0 +#endif + Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) { data.resize(rows, std::vector<double>(cols, 0.0)); } @@ -58,6 +65,32 @@ Matrix Matrix::multiply(const Matrix& other) const { sum += data[i][k] * other.data[k][j]; } + result.data[i][j] = sum; + } + } +#elif USE_ARM_NEON + // AArch64 optimized path using NEON + for (size_t i = 0; i < rows; i++) { + for (size_t j = 0; j < other.cols; j++) { + float64x2_t sum_vec = 
vdupq_n_f64(0.0); + size_t k = 0; + + // Process 2 elements at a time with NEON + for (; k + 1 < cols; k += 2) { + float64x2_t a_vec = vld1q_f64(&data[i][k]); + double b_vals[2] = {other.data[k][j], other.data[k+1][j]}; + float64x2_t b_vec = vld1q_f64(b_vals); + sum_vec = vmlaq_f64(sum_vec, a_vec, b_vec); + } + + // Horizontal add + double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1); + + // Handle remaining element + if (k < cols) { + sum += data[i][k] * other.data[k][j]; + } + result.data[i][j] = sum; } } diff --git a/matrix_operations.h b/matrix_operations.h index aa98741..9045549 100644 --- a/matrix_operations.h +++ b/matrix_operations.h @@ -4,7 +4,7 @@ #include <vector> #include <cstddef> -// Matrix class with x86 SSE2 optimizations +// Matrix class with x86 SSE2 and AArch64 NEON optimizations class Matrix { private: std::vector<std::vector<double>> data; diff --git a/memory_operations.cpp b/memory_operations.cpp index 0e5b970..5d32195 100644 --- a/memory_operations.cpp +++ b/memory_operations.cpp @@ -10,6 +10,13 @@ #define USE_X86_SIMD 0 #endif +#ifdef __aarch64__ +#include <arm_neon.h> +#define USE_ARM_NEON 1 +#else +#define USE_ARM_NEON 0 +#endif + void fast_memcpy(void* dest, const void* src, size_t n) { char* d = static_cast<char*>(dest); const char* s = static_cast<const char*>(src); @@ -21,9 +28,15 @@ __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i)); _mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk); } +#elif USE_ARM_NEON + // AArch64 optimized path using NEON: copy 16 bytes at a time + for (; i + 16 <= n; i += 16) { + uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i)); + vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk); + } #endif - // Copy remaining bytes (or all on non-x86) + // Copy remaining bytes (or all on non-SIMD) for (; i < n; i++) { d[i] = s[i]; } diff --git a/polynomial_eval.cpp b/polynomial_eval.cpp index db2247a..86f0d2b 100644 --- a/polynomial_eval.cpp +++ b/polynomial_eval.cpp @@ -9,13 +9,19 @@ #define USE_X86_SIMD 0 #endif 
-double polynomial_eval_sse(double x, const std::vector<double>& coeffs) { +#ifdef __aarch64__ +#include <arm_neon.h> +#define USE_ARM_NEON 1 +#else +#define USE_ARM_NEON 0 +#endif + +double polynomial_eval_simd(double x, const std::vector<double>& coeffs) { #if USE_X86_SIMD // x86-64 optimized path using SSE2 __m128d result_vec = _mm_setzero_pd(); - __m128d x_vec = _mm_set1_pd(x); - __m128d power_vec = _mm_set_pd(x, 1.0); // [x, 1.0] __m128d power_mult = _mm_set1_pd(x * x); + __m128d power_vec = _mm_set_pd(x, 1.0); // _mm_set_pd is high-to-low: element[0]=1.0, element[1]=x size_t i = 0; @@ -39,6 +45,36 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) { result += coeffs[i] * power_arr[0]; } + return result; +#elif USE_ARM_NEON + // AArch64 optimized path using NEON + float64x2_t result_vec = vdupq_n_f64(0.0); + float64x2_t x_sq_vec = vdupq_n_f64(x * x); + // element[0]=1.0 (power for even-indexed coeffs), element[1]=x (power for odd-indexed coeffs) + // vld1q_f64 is low-to-high: same ordering as array indices (consistent with SSE2 element[0]/[1] above) + double init_powers[2] = {1.0, x}; + float64x2_t power_vec = vld1q_f64(init_powers); + + size_t i = 0; + + // Process 2 coefficients at a time + for (; i + 1 < coeffs.size(); i += 2) { + double coeff_vals[2] = {coeffs[i], coeffs[i + 1]}; + float64x2_t coeff_vec = vld1q_f64(coeff_vals); + result_vec = vmlaq_f64(result_vec, coeff_vec, power_vec); + power_vec = vmulq_f64(power_vec, x_sq_vec); + } + + // Horizontal add + double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1); + + // Handle remaining coefficient + if (i < coeffs.size()) { + double power_arr[2]; + vst1q_f64(power_arr, power_vec); + result += coeffs[i] * power_arr[0]; + } + return result; #else // Fallback scalar implementation @@ -61,7 +97,7 @@ void benchmark_polynomial() { auto start = std::chrono::high_resolution_clock::now(); double sum = 0.0; for (int i = 0; i < iterations; i++) { - sum += polynomial_eval_sse(1.5 + i * 0.0001, coeffs); + 
sum += polynomial_eval_simd(1.5 + i * 0.0001, coeffs); } auto end = std::chrono::high_resolution_clock::now(); diff --git a/polynomial_eval.h b/polynomial_eval.h index 97c595c..d9e21e1 100644 --- a/polynomial_eval.h +++ b/polynomial_eval.h @@ -4,7 +4,7 @@ #include <vector> // Vectorized polynomial evaluation using x86 SSE2 -double polynomial_eval_sse(double x, const std::vector<double>& coeffs); +double polynomial_eval_simd(double x, const std::vector<double>& coeffs); // Benchmark function void benchmark_polynomial(); diff --git a/string_search.cpp b/string_search.cpp index 7c5c340..909df3c 100644 --- a/string_search.cpp +++ b/string_search.cpp @@ -9,6 +9,13 @@ #define USE_X86_SIMD 0 #endif +#ifdef __aarch64__ +#include <arm_neon.h> +#define USE_ARM_NEON 1 +#else +#define USE_ARM_NEON 0 +#endif + int simd_string_search(const std::string& text, const std::string& pattern) { int count = 0; size_t text_len = text.length(); @@ -44,9 +51,35 @@ int simd_string_search(const std::string& text, const std::string& pattern) { } } } +#elif USE_ARM_NEON + // AArch64 optimized path using NEON + uint8x16_t first_char_vec = vdupq_n_u8(static_cast<uint8_t>(first_char)); + + for (; i + 16 <= text_len - pattern_len + 1; i += 16) { + uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i)); + uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec); + + // Store comparison result as a byte array (0xFF = match, 0x00 = no match) + uint8_t cmp_bytes[16]; + vst1q_u8(cmp_bytes, cmp); + + // Check each potential match position + for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) { + if (cmp_bytes[bit]) { + bool match = true; + for (size_t j = 1; j < pattern_len; j++) { + if (text[i + bit + j] != pattern[j]) { + match = false; + break; + } + } + if (match) count++; + } + } + } #endif - // Handle remaining characters (or all on non-x86) + // Handle remaining characters (or all on non-SIMD) for (; i <= text_len - pattern_len; i++) { bool match = true; for (size_t j = 0; j < pattern_len; j++) {