#include <math.h>   // For fabsf()
#include <stdio.h>
#include <stdlib.h> // For rand()

// --- CUDA Best Practice: Error Checking Macro ---
// Wraps an expression that yields a cudaError_t (normally a CUDA runtime
// call). The expression is evaluated exactly once. If the result is not
// cudaSuccess, the file, line and the runtime's description of the error are
// written to stderr and the process exits with EXIT_FAILURE.
//
// The temporary has a deliberately long, macro-specific name: with a short
// name such as 'err', a call site like checkCudaErrors(err) would expand to
// 'cudaError_t err = err;' and test an uninitialized value. The argument is
// parenthesized so any expression can be passed safely.
#define checkCudaErrors(call)                                                  \
  do {                                                                         \
    const cudaError_t checkCudaErrors_status_ = (call);                        \
    if (checkCudaErrors_status_ != cudaSuccess) {                              \
      fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(checkCudaErrors_status_));                    \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

// --- CUDA Kernel: axpb ---
// Computes y[i] = a * x[i] + b for every i in [0, n).
//
// Parameters:
//   y - output array, written at indices [0, n)
//   x - input array, read at indices [0, n)
//   a - scale factor
//   b - offset
//   n - number of elements; n <= 0 results in no work
//
// Launch layout: only the x dimension of the grid and block is used. Any
// 1-D launch configuration is valid because each thread walks the array in
// steps of the total thread count (grid-stride loop). No shared memory is
// used.
__global__ void axpbKernel(float *y, const float *x, float a, float b, int n) {
  // Index arithmetic is done in signed 64-bit. With 32-bit math, adding the
  // stride to an index close to n can exceed INT_MAX for large n, wrap to a
  // negative int, still satisfy 'i < n', and access memory out of bounds.
  const long long stride = (long long)gridDim.x * blockDim.x;
  const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

  // --- CUDA Best Practice: Grid-Stride Loop ---
  // If fewer than 'n' threads were launched, each thread processes several
  // elements; if more were launched, the surplus threads do nothing.
  for (long long i = first; i < n; i += stride) {
    y[i] = a * x[i] + b;
  }
}

// Spot-checks the device result against a host-side reference.
//
// Compares h_y[i] with a * h_x[i] + b at the first, middle and last index of
// the arrays. On the first mismatch (or NaN result) it prints a diagnostic
// and returns false; otherwise it returns true. An empty array (n <= 0)
// trivially passes.
static bool axpbSamplesMatch(const float *h_x, const float *h_y, float a,
                             float b, int n) {
  if (n <= 0) {
    return true;
  }

  // For n == 1 or n == 2 some of these coincide, which is harmless.
  const int sampleIdx[3] = {0, n / 2, n - 1};
  const float tolerance = 1e-5f;

  for (int s = 0; s < 3; ++s) {
    const int i = sampleIdx[s];
    const float expected = a * h_x[i] + b;
    // Written as !(diff <= tol) so that a NaN result counts as a failure.
    if (!(fabsf(h_y[i] - expected) <= tolerance)) {
      printf("Verification FAILED at index %d!\n", i);
      printf("  Expected: %f, Got: %f\n", expected, h_y[i]);
      return false;
    }
  }
  return true;
}

// --- Host Code (Main Function) ---
// Times axpbKernel for block sizes 64..1024 (powers of two) on a vector of
// 2^24 floats, then copies the result back and spot-checks it on the host.
// Returns EXIT_SUCCESS when verification passes and EXIT_FAILURE otherwise.
// Any failing CUDA call terminates the process via checkCudaErrors.
int main() {
  // --- 1. Define Problem Size ---
  // We'll process 2^24 elements (about 16.7 million)
  const int n = 1 << 24;
  const size_t bytes = (size_t)n * sizeof(float);

  // Constants for y = ax + b
  const float a = 2.0f;
  const float b = 1.0f;

  // --- 2. Allocate Host (CPU) Memory ---
  float *h_x = (float *)malloc(bytes);
  float *h_y = (float *)malloc(bytes);
  if (h_x == NULL || h_y == NULL) {
    printf("Failed to allocate host memory\n");
    // free(NULL) is a no-op, so this releases whichever buffer succeeded.
    free(h_x);
    free(h_y);
    return EXIT_FAILURE;
  }

  // Initialize host data
  for (int i = 0; i < n; i++) {
    h_x[i] = (float)rand() / (float)RAND_MAX; // Simple random value [0, 1]
    h_y[i] = 0.0f;                            // Initialize output to 0
  }

  // --- 3. Allocate Device (GPU) Memory ---
  float *d_x = NULL;
  float *d_y = NULL;
  checkCudaErrors(cudaMalloc(&d_x, bytes));
  checkCudaErrors(cudaMalloc(&d_y, bytes));

  // --- 4. Copy Data from Host to Device ---
  // %zu is the portable conversion for size_t (%lu is wrong on LLP64).
  printf("Copying %d elements (%zu MB) from Host to Device...\n", n,
         bytes / (1024 * 1024));
  checkCudaErrors(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice)); // Copy initial 0s

  // --- 5. Create CUDA Events for Timing ---
  // --- CUDA Best Practice: Use cudaEvent_t for accurate GPU timing ---
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // Untimed warm-up launch. The first launch of a kernel can include
  // one-time start-up costs (e.g. lazy module loading); without this, that
  // cost would be charged to the first block size measured below.
  axpbKernel<<<(n + 255) / 256, 256>>>(d_y, d_x, a, b, n);
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  printf("\n--- Starting Block Size Experiment ---\n");
  printf("Processing %d elements with y = %.1f*x + %.1f\n", n, a, b);

  // --- 6. Experiment with different block sizes ---
  // We test block sizes from 64 to 1024, in powers of 2.
  for (int blockSize = 64; blockSize <= 1024; blockSize *= 2) {
    // --- CUDA Best Practice: Grid Size Calculation ---
    // Ceiling division: enough blocks to cover all 'n' elements even when
    // 'n' is not a multiple of 'blockSize'.
    const int gridSize = (n + blockSize - 1) / blockSize;

    // --- 7. Run Kernel and Measure Time ---
    checkCudaErrors(cudaEventRecord(start));

    axpbKernel<<<gridSize, blockSize>>>(d_y, d_x, a, b, n);
    // Query the launch status immediately so a bad launch configuration is
    // reported as such rather than surfacing through a later API call.
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaEventRecord(stop));

    // --- CUDA Best Practice: Synchronize for Timing ---
    // Block the CPU until the 'stop' event (and therefore the kernel
    // recorded before it) has completed on the GPU.
    checkCudaErrors(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

    printf("Block Size: %4d | Grid Size: %6d | Time: %f ms\n", blockSize,
           gridSize, milliseconds);
  }
  printf("--- Experiment Finished ---\n\n");

  // --- 8. Copy Result from Device to Host ---
  // d_y holds the output of the most recent launch; every launch above
  // writes the same values, so this validates the last configuration.
  checkCudaErrors(cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost));

  // --- 9. Verify Result on Host ---
  printf("Verifying result on host...\n");
  const bool success = axpbSamplesMatch(h_x, h_y, a, b, n);
  if (success) {
    printf("Verification SUCCESSFUL!\n");
  }

  // --- 10. Clean Up Memory ---
  printf("Cleaning up memory...\n");
  free(h_x);
  free(h_y);
  checkCudaErrors(cudaFree(d_x));
  checkCudaErrors(cudaFree(d_y));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));

  // Propagate the verification outcome so scripts can detect a failure.
  return success ? EXIT_SUCCESS : EXIT_FAILURE;
}