Indexing in Parallel Computing and HPC

Overview

Indexing is a fundamental concept in parallel computing, particularly critical for GPU programming and high-performance computing. Proper indexing ensures efficient memory access, load balancing, and optimal utilization of parallel hardware resources.

Core Indexing Concepts

1. Thread Indexing in CUDA/OpenMP

CUDA Thread Hierarchy

__global__ void kernel(float* data, int N) {
    // Block and thread indices
    int blockId = blockIdx.x;
    int threadId = threadIdx.x;
    int blockSize = blockDim.x;

    // Global thread index
    int globalIdx = blockId * blockSize + threadId;

    if (globalIdx < N) {
        data[globalIdx] = compute(data[globalIdx]);
    }
}

Multi-Dimensional Indexing

__global__ void matrix_kernel(float* matrix, int width, int height) {
    // 2D thread indexing
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = y * width + x;  // Row-major ordering
        matrix[idx] = process(matrix[idx]);
    }
}

2. Memory Layout and Indexing

Row-Major vs Column-Major

// Row-major (C/C++ style)
#define IDX2D_ROW_MAJOR(i, j, width) ((i) * (width) + (j))

// Column-major (Fortran/MATLAB style)
#define IDX2D_COL_MAJOR(i, j, height) ((j) * (height) + (i))

// 3D indexing
#define IDX3D(i, j, k, width, height) \
    ((i) * (height) * (width) + (j) * (width) + (k))

Strided Access Patterns

// Coalesced access (good for GPU)
for (int i = threadIdx.x; i < N; i += blockDim.x) {
    data[i] = data[i] * 2.0f;  // Contiguous memory access
}

// Strided access (bad for GPU)
for (int i = threadIdx.x; i < N; i += blockDim.x) {
    data[i * stride] = data[i * stride] * 2.0f;  // Non-contiguous
}

3. Advanced Indexing Techniques

Morton/Z-Order Indexing

// Morton code (Z-order curve) for spatial locality
uint32_t morton_2d(uint32_t x, uint32_t y) {
    x = (x | (x << 8)) & 0x00FF00FF;
    x = (x | (x << 4)) & 0x0F0F0F0F;
    x = (x | (x << 2)) & 0x33333333;
    x = (x | (x << 1)) & 0x55555555;

    y = (y | (y << 8)) & 0x00FF00FF;
    y = (y | (y << 4)) & 0x0F0F0F0F;
    y = (y | (y << 2)) & 0x33333333;
    y = (y | (y << 1)) & 0x55555555;

    return x | (y << 1);
}

// Hilbert curve indexing for better locality
uint32_t hilbert_xy_to_index(uint32_t x, uint32_t y, int order) {
    uint32_t index = 0;
    uint32_t s = 1 << (order - 1);

    while (s > 0) {
        uint32_t rx = (x & s) > 0;
        uint32_t ry = (y & s) > 0;

        index += s * s * ((3 * rx) ^ ry);

        // Rotate/flip quadrant
        if (ry == 0) {
            if (rx == 1) {
                x = (1 << order) - 1 - x;
                y = (1 << order) - 1 - y;
            }
            // Swap x and y
            uint32_t temp = x;
            x = y;
            y = temp;
        }
        s >>= 1;
    }
    return index;
}

Blocked/Indexing for Cache Efficiency

// Blocked matrix multiplication indexing
#define BLOCK_SIZE 16

__global__ void blocked_matmul(float* A, float* B, float* C, int N) {
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int row = by * BLOCK_SIZE + ty;
    int col = bx * BLOCK_SIZE + tx;

    float sum = 0.0f;

    for (int m = 0; m < N/BLOCK_SIZE; ++m) {
        // Load blocks into shared memory
        As[ty][tx] = A[row*N + m*BLOCK_SIZE + tx];
        Bs[ty][tx] = B[(m*BLOCK_SIZE + ty)*N + col];

        __syncthreads();

        // Compute partial sum
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            sum += As[ty][k] * Bs[k][tx];
        }

        __syncthreads();
    }

    C[row*N + col] = sum;
}

4. Parallel Algorithm Indexing

Parallel Prefix Sum (Scan) Indexing

// Hillis-Steele parallel scan
__global__ void parallel_scan(float* input, float* output, int n) {
    extern __shared__ float temp[];

    int tid = threadIdx.x;
    int offset = 1;

    // Load input into shared memory
    temp[2*tid] = input[2*tid];
    temp[2*tid+1] = input[2*tid+1];

    // Build sum in place up the tree
    for (int d = n>>1; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int ai = offset*(2*tid+1)-1;
            int bi = offset*(2*tid+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }

    // Clear the last element
    if (tid == 0) temp[n-1] = 0;

    // Traverse down tree & build scan
    for (int d = 1; d < n; d *= 2) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset*(2*tid+1)-1;
            int bi = offset*(2*tid+2)-1;
            float t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }

    __syncthreads();

    // Write results to output
    output[2*tid] = temp[2*tid];
    output[2*tid+1] = temp[2*tid+1];
}

Radix Sort Indexing

__device__ uint32_t extract_bits(uint32_t value, int bit_start, int num_bits) {
    return (value >> bit_start) & ((1 << num_bits) - 1);
}

__global__ void radix_sort_local(uint32_t* data, uint32_t* temp, int n, int bit_offset) {
    extern __shared__ uint32_t shared_data[];

    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + tid;

    // Load data into shared memory
    if (gid < n) {
        shared_data[tid] = data[gid];
    }
    __syncthreads();

    // Local histogram
    uint32_t local_histogram[256] = {0};
    for (int i = 0; i < blockDim.x && gid + i * blockDim.x < n; ++i) {
        uint32_t value = shared_data[i];
        uint32_t bits = extract_bits(value, bit_offset, 8);
        local_histogram[bits]++;
    }

    // Prefix sum for local histogram
    uint32_t prefix_sum[256];
    prefix_sum[0] = 0;
    for (int i = 1; i < 256; ++i) {
        prefix_sum[i] = prefix_sum[i-1] + local_histogram[i-1];
    }

    // Scatter to temporary array
    for (int i = 0; i < blockDim.x && gid + i * blockDim.x < n; ++i) {
        uint32_t value = shared_data[i];
        uint32_t bits = extract_bits(value, bit_offset, 8);
        uint32_t pos = prefix_sum[bits]++;
        temp[gid + i * blockDim.x] = value;
    }
}

5. OpenMP Indexing Patterns

Loop Indexing with OpenMP

void openmp_indexing(float* data, int N) {
    // Static scheduling
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N; ++i) {
        data[i] = process(data[i]);
    }

    // Dynamic scheduling for load balancing
    #pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < N; ++i) {
        data[i] = complex_process(data[i]);
    }

    // Guided scheduling
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < N; ++i) {
        data[i] = variable_workload(data[i]);
    }
}

Multi-dimensional OpenMP Indexing

void matrix_openmp(float** matrix, int rows, int cols) {
    #pragma omp parallel for collapse(2) schedule(static)
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            matrix[i][j] = compute(matrix[i][j]);
        }
    }
}

6. C++ Parallel Algorithm Indexing

Parallel Transform with Custom Indexing

#include <execution>
#include <algorithm>
#include <vector>

void parallel_transform_indexing(std::vector<float>& data) {
    // Parallel transform with index awareness
    std::transform(std::execution::par, data.begin(), data.end(), data.begin(),
                   [idx = 0](float val) mutable {
                       // Access neighboring elements
                       float result = val;
                       if (idx > 0) result += data[idx - 1] * 0.1f;
                       if (idx < data.size() - 1) result += data[idx + 1] * 0.1f;
                       ++idx;
                       return result;
                   });
}

Indexed Parallel Reduction

#include <numeric>

float indexed_parallel_reduce(const std::vector<float>& data) {
    // Parallel reduction with index-based weighting
    return std::transform_reduce(
        std::execution::par,
        data.begin(), data.end(),
        0.0f,
        std::plus<float>{},
        [idx = 0](float val) mutable {
            float weight = 1.0f / (idx + 1);  // Example weighting
            ++idx;
            return val * weight;
        }
    );
}

7. Advanced Indexing Strategies

Space-Filling Curves

// Peano curve for 2D indexing
std::vector<std::pair<int, int>> peano_curve(int order) {
    std::vector<std::pair<int, int>> curve;
    // Implementation of Peano space-filling curve
    // Used for cache-efficient traversal of 2D data
    return curve;
}

// Gray code indexing for error detection
uint32_t gray_encode(uint32_t n) {
    return n ^ (n >> 1);
}

uint32_t gray_decode(uint32_t n) {
    uint32_t mask = n;
    while (mask) {
        mask >>= 1;
        n ^= mask;
    }
    return n;
}

Hash-Based Indexing

// Perfect hash functions for GPU
__device__ uint32_t perfect_hash(uint32_t key, uint32_t table_size) {
    // Minimal perfect hash implementation
    // Ensures no collisions for known key set
    return key % table_size;  // Simplified example
}

// Cuckoo hashing for GPU
__device__ uint32_t cuckoo_hash(uint32_t key, int function_id) {
    const uint32_t hash_functions[4] = {
        0x9E3779B1, 0xB5297A4D, 0x68E31DA4, 0x1B56C4E9
    };
    return (key ^ hash_functions[function_id]) % TABLE_SIZE;
}

8. Performance Optimization

Memory Coalescing

// Good: Coalesced memory access
__global__ void coalesced_access(float* data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        data[idx] = data[idx] * 2.0f;
    }
}

// Bad: Non-coalesced memory access
__global__ void non_coalesced_access(float* data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        data[idx * stride] = data[idx * stride] * 2.0f;
    }
}

Bank Conflict Avoidance

// Shared memory bank conflict free access
__shared__ float shared_data[1024];

__global__ void bank_conflict_free(float* data) {
    int tid = threadIdx.x;
    int bid = blockIdx.x;

    // Good: No bank conflicts
    shared_data[tid] = data[bid * blockDim.x + tid];
    shared_data[tid + 32] = data[bid * blockDim.x + tid + 32];

    __syncthreads();

    // Process data
    data[bid * blockDim.x + tid] = shared_data[tid] + shared_data[tid + 32];
}

9. Debugging Indexing Issues

Bounds Checking

#define DEBUG_INDEXING 1

__device__ void check_bounds(int idx, int max_idx, const char* location) {
    #ifdef DEBUG_INDEXING
    if (idx < 0 || idx >= max_idx) {
        printf("Index out of bounds at %s: %d (max: %d)\n", location, idx, max_idx);
        __trap();
    }
    #endif
}

__global__ void safe_kernel(float* data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    check_bounds(idx, N, "kernel entry");

    if (idx < N) {
        data[idx] = compute(data[idx]);
    }
}

Race Condition Detection

// Atomic operations for safe indexing
__global__ void atomic_indexing(int* counters, int* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Atomic increment for unique index assignment
    int unique_idx = atomicAdd(&counters[0], 1);

    if (unique_idx < MAX_DATA) {
        data[unique_idx] = process(idx);
    }
}

Conclusion

Proper indexing is crucial for achieving optimal performance in parallel computing and HPC applications. Key principles include:

Memory Coalescing: Ensure contiguous memory access patterns
Load Balancing: Distribute work evenly across processing units
Cache Efficiency: Use space-filling curves and blocking techniques
Bank Conflict Avoidance: Optimize shared memory access patterns
Bounds Checking: Implement safety checks in debug builds
Algorithm Selection: Choose appropriate parallel algorithms for data access patterns

Effective indexing can improve performance by orders of magnitude in parallel computing environments, making it one of the most important considerations in HPC programming. /home/rahul/AiStätt/cpp/Indexing_in_Parallel_Computing.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Indexing in Parallel Computing and HPC

Overview

Core Indexing Concepts

1. Thread Indexing in CUDA/OpenMP

CUDA Thread Hierarchy

Multi-Dimensional Indexing

2. Memory Layout and Indexing

Row-Major vs Column-Major

Strided Access Patterns

3. Advanced Indexing Techniques

Morton/Z-Order Indexing

Blocked/Indexing for Cache Efficiency

4. Parallel Algorithm Indexing

Parallel Prefix Sum (Scan) Indexing

Radix Sort Indexing

5. OpenMP Indexing Patterns

Loop Indexing with OpenMP

Multi-dimensional OpenMP Indexing

6. C++ Parallel Algorithm Indexing

Parallel Transform with Custom Indexing

Indexed Parallel Reduction

7. Advanced Indexing Strategies

Space-Filling Curves

Hash-Based Indexing

8. Performance Optimization

Memory Coalescing

Bank Conflict Avoidance

9. Debugging Indexing Issues

Bounds Checking

Race Condition Detection

Conclusion

FilesExpand file tree

Indexing_in_Parallel_Computing.md

Latest commit

History

Indexing_in_Parallel_Computing.md

File metadata and controls

Indexing in Parallel Computing and HPC

Overview

Core Indexing Concepts

1. Thread Indexing in CUDA/OpenMP

CUDA Thread Hierarchy

Multi-Dimensional Indexing

2. Memory Layout and Indexing

Row-Major vs Column-Major

Strided Access Patterns

3. Advanced Indexing Techniques

Morton/Z-Order Indexing

Blocked/Indexing for Cache Efficiency

4. Parallel Algorithm Indexing

Parallel Prefix Sum (Scan) Indexing

Radix Sort Indexing

5. OpenMP Indexing Patterns

Loop Indexing with OpenMP

Multi-dimensional OpenMP Indexing

6. C++ Parallel Algorithm Indexing

Parallel Transform with Custom Indexing

Indexed Parallel Reduction

7. Advanced Indexing Strategies

Space-Filling Curves

Hash-Based Indexing

8. Performance Optimization

Memory Coalescing

Bank Conflict Avoidance

9. Debugging Indexing Issues

Bounds Checking

Race Condition Detection

Conclusion