#include <cuda_runtime.h>
#include <iostream>

// Launch configuration used by main(): number of blocks in the grid.
#define NUM_BLOCKS 256
// Launch configuration used by main(): number of threads in each block.
// NUM_BLOCKS * THREADS_PER_BLOCK is also the number of partial sums main() allocates.
#define THREADS_PER_BLOCK 256

/*
 * Computes per-thread partial sums of the midpoint-rule approximation of
 *     integral of sqrt(1 - x^2) dx
 * over the intervals [i * step, (i + 1) * step) for i in [0, num_intervals).
 *
 * Each thread starts at its global thread index and advances by the total
 * number of launched threads, so any 1D launch configuration covers every
 * interval exactly once. Threads whose index is >= num_intervals write 0.
 *
 * Parameters:
 *   d_results     - device buffer receiving one partial sum per launched
 *                   thread; must hold at least gridDim.x * blockDim.x doubles.
 *   num_intervals - number of intervals to integrate over.
 *   step          - width of one interval. Every midpoint (i + 0.5) * step
 *                   must be <= 1, otherwise sqrt() receives a negative
 *                   argument and the partial sum becomes NaN.
 *
 * Only the x components of the launch configuration are used (1D grid, 1D block).
 */
__global__ void integrate_pi(double *d_results, int num_intervals, double step) {
    // 64-bit index, stride and loop counter: the built-in index variables are
    // unsigned, and an int counter advanced by the stride can wrap past INT_MAX
    // when num_intervals is close to INT_MAX, which would re-enter the loop
    // with a negative index and count intervals more than once.
    const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;

    double local_sum = 0.0;
    for (long long i = idx; i < num_intervals; i += stride) {
        const double x = (static_cast<double>(i) + 0.5) * step; // midpoint of interval i
        local_sum += sqrt(1.0 - x * x);
    }

    // The interval width is loop-invariant, so it is applied once to the
    // accumulated heights instead of once per term.
    d_results[idx] = local_sum * step;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Estimates Pi by integrating the quarter circle sqrt(1 - x^2) over [0, 1)
 * on the GPU: the kernel produces one partial sum per thread, the host adds
 * them up and multiplies by 4.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
 * fails (a diagnostic is written to stderr in that case).
 */
int main() {
    const int num_intervals = 1 << 30; // Total number of intervals
    const double step = 1.0 / num_intervals; // Width of each interval
    const int total_threads = NUM_BLOCKS * THREADS_PER_BLOCK;
    const size_t results_bytes = static_cast<size_t>(total_threads) * sizeof(double);

    // Device buffer: one partial sum per launched thread.
    double *d_results = NULL;
    if (!cuda_ok(cudaMalloc(&d_results, results_bytes), "cudaMalloc")) {
        return 1;
    }

    // Launch the kernel. A launch does not return a status itself, so an
    // invalid configuration is only visible through cudaGetLastError().
    integrate_pi<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(d_results, num_intervals, step);
    if (!cuda_ok(cudaGetLastError(), "integrate_pi launch")) {
        cudaFree(d_results);
        return 1;
    }

    // Copy results back to host. This blocking copy runs after the kernel has
    // finished, so its status also reports errors raised while the kernel ran.
    double *h_results = new double[total_threads];
    if (!cuda_ok(cudaMemcpy(h_results, d_results, results_bytes, cudaMemcpyDeviceToHost),
                 "cudaMemcpy (device to host)")) {
        delete[] h_results;
        cudaFree(d_results);
        return 1;
    }

    // Accumulate the per-thread partial sums.
    double pi = 0.0;
    for (int i = 0; i < total_threads; ++i) {
        pi += h_results[i];
    }

    pi *= 4.0; // Scale to compute full Pi from quarter-circle integration

    std::cout << "Estimated value of Pi: " << pi << std::endl;

    // Free memory
    delete[] h_results;
    cudaFree(d_results);

    return 0;
}
