Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ COPY *.h ./
COPY *.cpp ./

# Build the application with optimizations
# SSE2 intrinsics are used in the code for x86-64 platforms
RUN g++ -O2 -o benchmark \
main.cpp \
matrix_operations.cpp \
hash_operations.cpp \
string_search.cpp \
memory_operations.cpp \
polynomial_eval.cpp \
-std=c++11
# Detects architecture at build time: NEON baseline on AArch64, SSE2 baseline on x86-64.
# NOTE(review): avoid -march=native in container builds — it specializes the binary
# for the *build host's* CPU, so the image can crash with SIGILL when run on an
# older x86-64 machine. -march=x86-64 is the portable baseline and already
# guarantees SSE2, which is all the x86 SIMD paths in this code require.
RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "aarch64" ]; then \
        MARCH_FLAG="-march=armv8-a"; \
    else \
        MARCH_FLAG="-march=x86-64 -mtune=generic"; \
    fi && \
    g++ -O3 $MARCH_FLAG -o benchmark \
        main.cpp \
        matrix_operations.cpp \
        hash_operations.cpp \
        string_search.cpp \
        memory_operations.cpp \
        polynomial_eval.cpp \
        -std=c++14

# Create a startup script
COPY start.sh .
Expand Down
21 changes: 19 additions & 2 deletions hash_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

unsigned long long compute_hash(const char* data, size_t len) {
unsigned long long hash = 5381;
size_t i = 0;
Expand All @@ -31,11 +38,21 @@ unsigned long long compute_hash(const char* data, size_t len) {
hash = ((hash << 5) + hash) + byte;
}
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON: load 16 bytes at a time
for (; i + 16 <= len; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));
uint8_t bytes[16];
vst1q_u8(bytes, chunk);
for (int j = 0; j < 16; j++) {
hash = ((hash << 5) + hash) + bytes[j];
}
}
#endif

// Process remaining bytes (or all bytes on non-x86)
// Process remaining bytes (or all bytes on non-SIMD)
for (; i < len; i++) {
hash = ((hash << 5) + hash) + data[i];
hash = ((hash << 5) + hash) + static_cast<unsigned char>(data[i]);
}

return hash;
Expand Down
39 changes: 39 additions & 0 deletions invocation_reasons.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
---
id: f0ef4f16-bb37-4361-9a72-6454cff5ec4e
timestamp: '2026-03-10T20:52:56.735332+00:00'
tool: check_image
args:
image: ubuntu:22.04
reason: Checking if ubuntu:22.04 base image supports ARM64 architecture
---
id: 76249d17-ab54-4117-9116-e9b9f5ac6c1b
timestamp: '2026-03-10T20:52:56.964652+00:00'
tool: migrate_ease_scan
args:
scanner: cpp
arch: armv8-a
git_repo: null
output_format: json
extra_args: null
reason: Scanning C++ codebase for x86-specific code that needs ARM migration
---
id: 6489af70-f9ce-41f2-9f07-eb4c33a0cf4b
timestamp: '2026-03-10T20:53:35.583334+00:00'
tool: knowledge_base_search
args:
query: Is g++ compatible with ARM architecture?
reason: Checking if g++ compiler package in Dockerfile is ARM compatible
---
id: 992020cb-0b7e-42f7-a50d-e6f7f4b3ec11
timestamp: '2026-03-10T20:53:35.605637+00:00'
tool: knowledge_base_search
args:
query: Is make compatible with ARM architecture?
reason: Checking if make build tool in Dockerfile is ARM compatible
---
id: 34af97ec-cfdd-4e25-b5a2-279e3e730aec
timestamp: '2026-03-10T20:53:35.618797+00:00'
tool: knowledge_base_search
args:
query: ARM NEON SIMD intrinsics equivalent to SSE2 x86 for C++ migration
reason: Finding NEON equivalents for SSE2 intrinsics used in the codebase
11 changes: 9 additions & 2 deletions main.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* High-Performance Compute Benchmark Suite
* Optimized for x86-64 architecture with SSE/AVX SIMD instructions
* Optimized for x86-64 (SSE2) and AArch64 (NEON) architectures
*/

#include <iostream>
Expand All @@ -16,14 +16,21 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

int main() {
std::cout << "========================================" << std::endl;
std::cout << " Compute Benchmark Suite" << std::endl;
#if USE_X86_SIMD
std::cout << " x86-64 with SSE2 Optimizations" << std::endl;
#elif USE_ARM_NEON
std::cout << " AArch64 with NEON Optimizations" << std::endl;
#else
std::cout << " Generic Build (No SIMD)" << std::endl;
std::cout << " NOTE: This code is optimized for x86-64" << std::endl;
#endif
std::cout << "========================================" << std::endl;

Expand Down
33 changes: 33 additions & 0 deletions matrix_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

// Construct an r-by-c matrix with every element zero-initialized.
Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
    data.assign(rows, std::vector<double>(cols, 0.0));
}
Expand Down Expand Up @@ -58,6 +65,32 @@ Matrix Matrix::multiply(const Matrix& other) const {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < other.cols; j++) {
float64x2_t sum_vec = vdupq_n_f64(0.0);
size_t k = 0;

// Process 2 elements at a time with NEON
for (; k + 1 < cols; k += 2) {
float64x2_t a_vec = vld1q_f64(&data[i][k]);
double b_vals[2] = {other.data[k][j], other.data[k+1][j]};
float64x2_t b_vec = vld1q_f64(b_vals);
sum_vec = vmlaq_f64(sum_vec, a_vec, b_vec);
}

// Horizontal add
double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1);

// Handle remaining element
if (k < cols) {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
Expand Down
2 changes: 1 addition & 1 deletion matrix_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <vector>
#include <cstddef>

// Matrix class with x86 SSE2 optimizations
// Matrix class with x86 SSE2 and AArch64 NEON optimizations
class Matrix {
private:
std::vector<std::vector<double>> data;
Expand Down
15 changes: 14 additions & 1 deletion memory_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

void fast_memcpy(void* dest, const void* src, size_t n) {
char* d = static_cast<char*>(dest);
const char* s = static_cast<const char*>(src);
Expand All @@ -21,9 +28,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON: copy 16 bytes at a time
for (; i + 16 <= n; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
}
#endif

// Copy remaining bytes (or all on non-x86)
// Copy remaining bytes (or all on non-SIMD)
for (; i < n; i++) {
d[i] = s[i];
}
Expand Down
44 changes: 40 additions & 4 deletions polynomial_eval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,19 @@
#define USE_X86_SIMD 0
#endif

double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

double polynomial_eval_simd(double x, const std::vector<double>& coeffs) {
#if USE_X86_SIMD
// x86-64 optimized path using SSE2
__m128d result_vec = _mm_setzero_pd();
__m128d x_vec = _mm_set1_pd(x);
__m128d power_vec = _mm_set_pd(x, 1.0); // [x, 1.0]
__m128d power_mult = _mm_set1_pd(x * x);
__m128d power_vec = _mm_set_pd(x, 1.0); // _mm_set_pd is high-to-low: element[0]=1.0, element[1]=x

size_t i = 0;

Expand All @@ -39,6 +45,36 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
result += coeffs[i] * power_arr[0];
}

return result;
#elif USE_ARM_NEON
// AArch64 optimized path using NEON
float64x2_t result_vec = vdupq_n_f64(0.0);
float64x2_t x_sq_vec = vdupq_n_f64(x * x);
// element[0]=1.0 (power for even-indexed coeffs), element[1]=x (power for odd-indexed coeffs)
// vld1q_f64 is low-to-high: same ordering as array indices (consistent with SSE2 element[0]/[1] above)
double init_powers[2] = {1.0, x};
float64x2_t power_vec = vld1q_f64(init_powers);

size_t i = 0;

// Process 2 coefficients at a time
for (; i + 1 < coeffs.size(); i += 2) {
double coeff_vals[2] = {coeffs[i], coeffs[i + 1]};
float64x2_t coeff_vec = vld1q_f64(coeff_vals);
result_vec = vmlaq_f64(result_vec, coeff_vec, power_vec);
power_vec = vmulq_f64(power_vec, x_sq_vec);
}

// Horizontal add
double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1);

// Handle remaining coefficient
if (i < coeffs.size()) {
double power_arr[2];
vst1q_f64(power_arr, power_vec);
result += coeffs[i] * power_arr[0];
}

return result;
#else
// Fallback scalar implementation
Expand All @@ -61,7 +97,7 @@ void benchmark_polynomial() {
auto start = std::chrono::high_resolution_clock::now();
double sum = 0.0;
for (int i = 0; i < iterations; i++) {
sum += polynomial_eval_sse(1.5 + i * 0.0001, coeffs);
sum += polynomial_eval_simd(1.5 + i * 0.0001, coeffs);
}
auto end = std::chrono::high_resolution_clock::now();

Expand Down
2 changes: 1 addition & 1 deletion polynomial_eval.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <vector>

// Vectorized polynomial evaluation using SIMD (x86-64 SSE2 or AArch64 NEON; scalar fallback otherwise)
double polynomial_eval_sse(double x, const std::vector<double>& coeffs);
double polynomial_eval_simd(double x, const std::vector<double>& coeffs);

// Benchmark function
void benchmark_polynomial();
Expand Down
35 changes: 34 additions & 1 deletion string_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
#define USE_X86_SIMD 0
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_ARM_NEON 0
#endif

int simd_string_search(const std::string& text, const std::string& pattern) {
int count = 0;
size_t text_len = text.length();
Expand Down Expand Up @@ -44,9 +51,35 @@ int simd_string_search(const std::string& text, const std::string& pattern) {
}
}
}
#elif USE_ARM_NEON
// AArch64 optimized path using NEON
uint8x16_t first_char_vec = vdupq_n_u8(static_cast<uint8_t>(first_char));

for (; i + 16 <= text_len - pattern_len + 1; i += 16) {
uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i));
uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec);

// Store comparison result as a byte array (0xFF = match, 0x00 = no match)
uint8_t cmp_bytes[16];
vst1q_u8(cmp_bytes, cmp);

// Check each potential match position
for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) {
if (cmp_bytes[bit]) {
bool match = true;
for (size_t j = 1; j < pattern_len; j++) {
if (text[i + bit + j] != pattern[j]) {
match = false;
break;
}
}
if (match) count++;
}
}
}
#endif

// Handle remaining characters (or all on non-x86)
// Handle remaining characters (or all on non-SIMD)
for (; i <= text_len - pattern_len; i++) {
bool match = true;
for (size_t j = 0; j < pattern_len; j++) {
Expand Down