// example_3_cuda_code.cu
/*
* CUDA Math Hello World - Beginner Version with Extensive Comments
*
* This program demonstrates the basics of GPU programming with CUDA.
* It launches 8 threads on the GPU, each computing the square of its thread ID.
*/
#include <stdio.h> // For printf() function
/*
 * compute -- GPU kernel (the __global__ qualifier makes it run on the device).
 *
 * Each thread takes its index within the block, squares it, stores the result
 * in the matching slot of `data`, and prints a one-line report.
 *
 * Launch layout: only threadIdx.x is used for indexing, so the kernel is
 * written for a single one-dimensional block. There is no bounds check, so
 * `data` must hold at least blockDim.x ints. No shared memory is used.
 *
 * data: device-accessible array that receives the squares (data[i] = i * i).
 */
__global__ void compute(int *data) {
    // Index of this thread inside its block; doubles as the output slot.
    const int tid = threadIdx.x;

    // Keep the square in a local so the store and the report use one value.
    const int squared = tid * tid;
    data[tid] = squared;

    // Device-side printf: lines from different threads may appear in any order.
    printf("GPU thread %d computed %d^2 = %d\n", tid, tid, squared);
}
/*
 * Checks the status of a CUDA runtime call.
 *
 * status: value returned by the runtime call (or by cudaGetLastError()).
 * what:   short label for the step, used only in the error message.
 *
 * Returns true when status is cudaSuccess; otherwise prints the runtime's
 * description of the error to stderr and returns false.
 */
static bool cudaStepOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * MAIN FUNCTION - runs on the CPU
 *
 * Allocates a small device buffer, launches compute() with one block of
 * 8 threads, waits for the GPU to finish, and frees the buffer.
 *
 * Returns 0 when every CUDA step succeeded, 1 otherwise. The completion
 * message is printed only on success.
 */
int main() {
    printf("Starting GPU math computation...\n");

    // One output slot per GPU thread; the same constant sizes the allocation
    // and the launch so the two cannot drift apart.
    const int kNumThreads = 8;

    // STEP 1: Allocate memory ON THE GPU
    // 'd_' prefix means "device" (GPU) memory. Start from NULL so the pointer
    // never holds an indeterminate value if the allocation fails.
    int *d_data = NULL;
    if (!cudaStepOk(cudaMalloc(&d_data, kNumThreads * sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    // STEP 2: Launch the GPU kernel: <<<1 block, kNumThreads threads per block>>>
    compute<<<1, kNumThreads>>>(d_data);

    // A kernel launch returns nothing, so a rejected launch configuration is
    // only visible through cudaGetLastError().
    bool ok = cudaStepOk(cudaGetLastError(), "compute kernel launch");

    // STEP 3: Wait for all GPU threads to finish
    // Errors raised while the kernel was running are reported by this call.
    // The call is written first so it runs even if an earlier step failed.
    ok = cudaStepOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

    // STEP 4: Clean up GPU memory (attempted even after an earlier failure)
    ok = cudaStepOk(cudaFree(d_data), "cudaFree") && ok;

    if (!ok) {
        return 1;
    }

    printf("GPU computation completed!\n");
    return 0;
}