Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
benchmark
benchmark-arm64
*.o
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ COPY *.h ./
COPY *.cpp ./

# Build the application with optimizations
# SSE2 intrinsics are used in the code for x86-64 platforms
# Supports both x86-64 SSE2 intrinsics and ARM64 NEON intrinsics
RUN g++ -O2 -o benchmark \
main.cpp \
matrix_operations.cpp \
Expand Down
18 changes: 11 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Compute Benchmark Suite

A high-performance compute benchmark application optimized for x86-64 architecture with SSE2 SIMD instructions.
A high-performance compute benchmark application optimized for both x86-64 and ARM64 architectures with SIMD instructions.

## Overview

Expand All @@ -11,7 +11,9 @@ This benchmark suite tests various compute-intensive operations including:
- Memory operations (50MB copy operations)
- Polynomial evaluation (10M iterations)

The code is optimized using x86 SSE2 SIMD intrinsics for maximum performance on Intel and AMD processors.
The code is optimized using:
- **x86-64**: SSE2 SIMD intrinsics for Intel and AMD processors
- **ARM64**: NEON SIMD intrinsics for ARM processors

## Building with Docker

Expand All @@ -33,16 +35,18 @@ This will execute all benchmark tests and display timing results for each operat

## Architecture Notes

- **Optimized for**: x86-64 architecture with SSE2 support
- **SIMD Instructions**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
- **Fallback**: Includes scalar fallback implementation for non-x86 platforms
- **x86-64**: Uses SSE2 intrinsics (operating on the `__m128d` and `__m128i` vector types) for vectorized operations
- **ARM64**: Uses NEON intrinsics (operating on the `float64x2_t` and `uint8x16_t` vector types) for vectorized operations
- **Fallback**: Includes scalar fallback implementation for other platforms

The code automatically detects the architecture at compile time and uses the appropriate SIMD instructions.

## Output Example

```
========================================
Compute Benchmark Suite
x86-64 with SSE2 Optimizations
ARM64 with NEON Optimizations
========================================

=== Matrix Multiplication Benchmark ===
Expand All @@ -69,4 +73,4 @@ The benchmark suite is organized into separate modules:
- `memory_operations.{h,cpp}` - Fast memory copy operations
- `polynomial_eval.{h,cpp}` - Vectorized polynomial evaluation

Each module uses C++11 standard library and x86 SSE2 intrinsics where applicable.
Each module uses the C++11 standard library, plus architecture-specific SIMD intrinsics (SSE2 on x86-64, NEON on ARM64) where applicable.
1 change: 1 addition & 0 deletions _codeql_detected_source_root
36 changes: 26 additions & 10 deletions hash_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@
#include <chrono>
#include <iomanip>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(__amd64__)
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

unsigned long long compute_hash(const char* data, size_t len) {
Expand All @@ -20,20 +26,30 @@ unsigned long long compute_hash(const char* data, size_t len) {
for (; i + 16 <= len; i += 16) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));

// Extract bytes and update hash
// Store to array and extract bytes
alignas(16) unsigned char bytes[16];
_mm_store_si128(reinterpret_cast<__m128i*>(bytes), chunk);

for (int j = 0; j < 16; j++) {
hash = ((hash << 5) + hash) + bytes[j];
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (; i + 16 <= len; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));

// Store to array and extract bytes
alignas(16) unsigned char bytes[16];
vst1q_u8(bytes, chunk);

for (int j = 0; j < 16; j++) {
unsigned char byte = _mm_extract_epi16(chunk, j / 2);
if (j % 2 == 0) {
byte = byte & 0xFF;
} else {
byte = (byte >> 8) & 0xFF;
}
hash = ((hash << 5) + hash) + byte;
hash = ((hash << 5) + hash) + bytes[j];
}
}
#endif

// Process remaining bytes (or all bytes on non-x86)
// Process remaining bytes (or all bytes on non-SIMD platforms)
for (; i < len; i++) {
hash = ((hash << 5) + hash) + data[i];
}
Expand Down
34 changes: 34 additions & 0 deletions invocation_reasons.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
id: ae8a3c57-43f0-420b-8a30-4fdbb638925c
timestamp: '2026-02-04T21:48:05.998962+00:00'
tool: check_image
args:
image: ubuntu:22.04
reason: Checking if the ubuntu:22.04 base image in the Dockerfile supports ARM architecture
---
id: 2cc8eb7d-74e8-48b7-953b-eb1c0aaa8ae6
timestamp: '2026-02-04T21:48:10.913142+00:00'
tool: knowledge_base_search
args:
query: Is g++ compiler compatible with ARM architecture?
reason: Verifying that the g++ compiler package in the Dockerfile is compatible with
ARM
---
id: 86964ccd-504a-42c0-a95e-72fc5d7c017b
timestamp: '2026-02-04T21:48:10.932771+00:00'
tool: knowledge_base_search
args:
query: Is make build tool compatible with ARM architecture?
reason: Verifying that the make package in the Dockerfile is compatible with ARM
---
id: 91d9de77-dd63-4f3c-9ab4-bff77b0385f2
timestamp: '2026-02-04T21:48:16.681056+00:00'
tool: migrate_ease_scan
args:
scanner: cpp
arch: armv8-a
git_repo: null
output_format: json
extra_args: null
reason: Scanning the C++ codebase to identify x86-specific code that needs to be migrated
to ARM, including SSE2 intrinsics and architecture-specific dependencies
10 changes: 8 additions & 2 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,26 @@
#include "memory_operations.h"
#include "polynomial_eval.h"

#ifdef __x86_64__
#if defined(__x86_64__) || defined(__amd64__)
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

int main() {
std::cout << "========================================" << std::endl;
std::cout << " Compute Benchmark Suite" << std::endl;
#if USE_X86_SIMD
std::cout << " x86-64 with SSE2 Optimizations" << std::endl;
#elif USE_ARM_NEON
std::cout << " ARM64 with NEON Optimizations" << std::endl;
#else
std::cout << " Generic Build (No SIMD)" << std::endl;
std::cout << " NOTE: This code is optimized for x86-64" << std::endl;
#endif
std::cout << "========================================" << std::endl;

Expand Down
33 changes: 32 additions & 1 deletion matrix_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@
#include <chrono>
#include <stdexcept>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(__amd64__)
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
Expand Down Expand Up @@ -58,6 +64,31 @@ Matrix Matrix::multiply(const Matrix& other) const {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < other.cols; j++) {
float64x2_t sum_vec = vdupq_n_f64(0.0);
size_t k = 0;

// Process 2 elements at a time with NEON
for (; k + 1 < cols; k += 2) {
float64x2_t a_vec = vld1q_f64(&data[i][k]);
float64x2_t b_vec = {other.data[k][j], other.data[k+1][j]};
sum_vec = vfmaq_f64(sum_vec, a_vec, b_vec);
}

// Horizontal add
double sum = vaddvq_f64(sum_vec);

// Handle remaining element
if (k < cols) {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
Expand Down
16 changes: 14 additions & 2 deletions memory_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@
#include <vector>
#include <chrono>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(__amd64__)
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

void fast_memcpy(void* dest, const void* src, size_t n) {
Expand All @@ -21,9 +27,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (; i + 16 <= n; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
}
#endif

// Copy remaining bytes (or all on non-x86)
// Copy remaining bytes (or all on non-SIMD platforms)
for (; i < n; i++) {
d[i] = s[i];
}
Expand Down
33 changes: 32 additions & 1 deletion polynomial_eval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@
#include <iostream>
#include <chrono>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(__amd64__)
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
Expand Down Expand Up @@ -39,6 +45,31 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
result += coeffs[i] * power_arr[0];
}

return result;
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
float64x2_t result_vec = vdupq_n_f64(0.0);
float64x2_t power_vec = {1.0, x};
float64x2_t power_mult = vdupq_n_f64(x * x);

size_t i = 0;

// Process 2 coefficients at a time
for (; i + 1 < coeffs.size(); i += 2) {
float64x2_t coeff_vec = {coeffs[i], coeffs[i + 1]};
result_vec = vfmaq_f64(result_vec, coeff_vec, power_vec);
power_vec = vmulq_f64(power_vec, power_mult);
}

// Horizontal add
double result = vaddvq_f64(result_vec);

// Handle remaining coefficient
if (i < coeffs.size()) {
double power = vgetq_lane_f64(power_vec, 0);
result += coeffs[i] * power;
}

return result;
#else
// Fallback scalar implementation
Expand Down
36 changes: 34 additions & 2 deletions string_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@
#include <iostream>
#include <chrono>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(__amd64__)
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

int simd_string_search(const std::string& text, const std::string& pattern) {
Expand Down Expand Up @@ -44,9 +50,35 @@ int simd_string_search(const std::string& text, const std::string& pattern) {
}
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
uint8x16_t first_char_vec = vdupq_n_u8(static_cast<uint8_t>(first_char));

for (; i + 16 <= text_len - pattern_len + 1; i += 16) {
uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i));
uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec);

// Store comparison result to array
alignas(16) uint8_t cmp_result[16];
vst1q_u8(cmp_result, cmp);

// Check each potential match
for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) {
if (cmp_result[bit] != 0) {
bool match = true;
for (size_t j = 1; j < pattern_len; j++) {
if (text[i + bit + j] != pattern[j]) {
match = false;
break;
}
}
if (match) count++;
}
}
}
#endif

// Handle remaining characters (or all on non-x86)
// Handle remaining characters (or all on non-SIMD platforms)
for (; i <= text_len - pattern_len; i++) {
bool match = true;
for (size_t j = 0; j < pattern_len; j++) {
Expand Down