#include <stdio.h>

// 1. ATOMIC REDUCTION (Baseline)
// Every in-range thread issues its own atomicAdd on the single address `out`,
// so all updates contend for the same location. Kept as the slow reference
// implementation.
//
//   out : accumulator; the sum is added to whatever value it already holds.
//   in  : input array, read at indices [0, n).
//   n   : number of input elements.
//
// Expects a 1-D launch with at least n threads in total (one element per thread).
__global__ void reduce_atomic(float *out, float *in, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the input have nothing to contribute.
    if (gid >= n) {
        return;
    }

    atomicAdd(out, in[gid]);
}

// 2. SHARED MEMORY REDUCTION
// Each block sums its share of `in` inside a shared-memory staging buffer and
// then issues ONE atomicAdd on `out`, instead of one atomic per element.
//
//   out : accumulator; the sum is added to whatever value it already holds,
//         so the caller must initialize it (e.g. to 0) before the launch.
//   in  : input array, read at indices [0, n).
//   n   : number of input elements (n <= 0 adds 0.0f per block).
//
// Launch requirements:
//   - 1-D grid and 1-D blocks (only the .x components are used).
//   - Any blockDim.x is accepted; it does not have to be a power of two.
//   - Any grid size is accepted; a grid-stride loop covers all n elements.
//   - Uses 4 KB of static shared memory per block.
__global__ void reduce_shared(float *out, float *in, int n) {
    // Sized for 1024 threads, the largest block current CUDA devices allow, so
    // sdata[threadIdx.x] is in bounds for every legal launch configuration.
    __shared__ float sdata[1024];

    const unsigned int tid = threadIdx.x;

    // 64-bit indexing: blockIdx.x * blockDim.x can exceed the range of int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first = (long long)blockIdx.x * blockDim.x + tid;

    // Grid-stride load: each thread accumulates a private partial sum, so the
    // result is correct even when the grid has fewer threads than elements.
    float partial = 0.0f;
    for (long long i = first; i < n; i += stride) {
        partial += in[i];
    }
    sdata[tid] = partial;
    __syncthreads(); // Every partial must be visible before the tree starts

    // Tree reduction that tolerates odd sizes: at each level the upper part
    // [half, active) is folded onto [0, active - half). When `active` is odd
    // the middle element has no partner and is simply carried to the next
    // level. For power-of-two block sizes this is the classic halving tree.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        // Outside the branch: the barrier must be reached by every thread.
        __syncthreads();
        active = half;
    }

    // Only thread 0 of each block writes to global memory
    if (tid == 0) atomicAdd(out, sdata[0]);
}

// 3. WARP SHUFFLE REDUCTION
// Threads of a warp exchange values directly with __shfl_down_sync, so no
// shared memory and no __syncthreads() are needed. Lane 0 of every warp then
// issues one atomicAdd on `out`.
//
//   out : accumulator; the sum is added to whatever value it already holds,
//         so the caller must initialize it (e.g. to 0) before the launch.
//   in  : input array, read at indices [0, n).
//   n   : number of input elements (n <= 0 adds 0.0f per warp).
//
// Launch requirements:
//   - 1-D grid and 1-D blocks: the lane index is derived from threadIdx.x.
//   - Any blockDim.x is accepted; a trailing partial warp is handled.
//   - Any grid size is accepted; a grid-stride loop covers all n elements.
__global__ void reduce_shuffle(float *out, float *in, int n) {
    // 64-bit indexing: blockIdx.x * blockDim.x can exceed the range of int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Grid-stride load into a per-thread register. No thread returns early:
    // every live lane has to take part in the shuffles below.
    float val = 0.0f;
    for (long long i = first; i < n; i += stride) {
        val += in[i];
    }

    // Work out which lanes of this warp actually exist. When blockDim.x is not
    // a multiple of 32 the last warp of the block is only partially populated.
    const unsigned int lane = threadIdx.x & 31;
    const unsigned int warpBase = threadIdx.x - lane;
    const unsigned int remaining = blockDim.x - warpBase;
    const unsigned int lanes = remaining < 32u ? remaining : 32u;
    // (1u << 32) is undefined, so the full-warp mask is spelled out.
    const unsigned int mask = (lanes == 32u) ? 0xFFFFFFFFu : ((1u << lanes) - 1u);

    // Warp-level tree reduction.
    for (unsigned int offset = 16; offset > 0; offset >>= 1) {
        const float other = __shfl_down_sync(mask, val, offset);
        // A value shuffled from a lane that does not exist is undefined;
        // only accumulate when the source lane is a live one.
        if (lane + offset < lanes) {
            val += other;
        }
    }

    // Only lane 0 of each warp performs the final atomic
    if (lane == 0) atomicAdd(out, val);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Demo driver: fills a managed buffer with 2^20 ones, sums it on the GPU with
// reduce_shared and prints the result.
// Returns 0 on success and 1 if any CUDA call or the kernel fails.
int main() {
    const int n = 1 << 20;
    const int block = 256;
    const int grid = (n + block - 1) / block; // ceil(n / block)

    float *d_in = NULL;
    float *d_out = NULL;
    int exit_code = 1;

    // Managed (unified) memory is used so the host can initialize the input
    // and read the result through the same pointers the kernel uses.
    if (cuda_ok(cudaMallocManaged(&d_in, n * sizeof(float)), "cudaMallocManaged(d_in)") &&
        cuda_ok(cudaMallocManaged(&d_out, sizeof(float)), "cudaMallocManaged(d_out)")) {
        // Initialize
        for (int i = 0; i < n; i++) d_in[i] = 1.0f;
        *d_out = 0.0f; // The kernel accumulates into this value

        // Run Shared Memory version
        reduce_shared<<<grid, block>>>(d_out, d_in, n);

        // A launch reports configuration errors through cudaGetLastError();
        // faults during execution surface at the synchronizing call. The host
        // only reads *d_out after the synchronize has succeeded.
        if (cuda_ok(cudaGetLastError(), "reduce_shared launch") &&
            cuda_ok(cudaDeviceSynchronize(), "reduce_shared execution")) {
            printf("Result: %f\n", *d_out);
            exit_code = 0;
        }
    }

    // Release both buffers on every path; cudaFree accepts a null pointer.
    cudaFree(d_out);
    cudaFree(d_in);
    return exit_code;
}