#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// --- CUDA Best Practice: Error Checking Macro ---
// Wraps a CUDA runtime call (any expression yielding a cudaError_t).
// The expression is evaluated exactly once. If the result is not cudaSuccess,
// the file, line and CUDA error string are written to stderr and the process
// exits with EXIT_FAILURE.
//
// The internal temporary has a deliberately unusual name so that it cannot
// shadow a caller variable used inside `call` (e.g. checkCudaErrors(err)),
// and `call` is parenthesized so that any expression is accepted as a unit.
#define checkCudaErrors(call)                                                  \
  do {                                                                         \
    const cudaError_t checkCudaErrors_status_ = (call);                        \
    if (checkCudaErrors_status_ != cudaSuccess) {                              \
      fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(checkCudaErrors_status_));                    \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

// --- CONFIGURATION: CHANGE THIS VALUE TO TEST EACH STRATEGY ---
// 1 = Pageable Host Memory (The default, slow)
// 2 = Pinned Host Memory (Synchronous)
// 3 = Pinned Host Memory (Asynchronous, with Streams)
// 4 = Unified Memory (On-Demand Page Faulting)
// 5 = Unified Memory (With Prefetching)
// The value below is only a default: it can be overridden without editing
// this file by defining the macro on the compiler command line
// (e.g. -DSTRATEGY=3).
#ifndef STRATEGY
#define STRATEGY 5
#endif
// --------------------------------------------------------------

// --- CONFIGURATION: CHOOSE KERNEL TO RUN ---
// 0 = Coalesced (Good)
// 1 = Non-Coalesced (Bad)
// Like STRATEGY, this is a default that -DKERNEL_MODE=... can override.
#ifndef KERNEL_MODE
#define KERNEL_MODE 0
#endif
// --------------------------------------------------------------

// --- KERNEL 1: COALESCED (GOOD) ---
// Computes y[i] = a * x[i] + b for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads
// (gridDim.x * blockDim.x), so any grid/block size covers all n elements.
//
// Threads access global memory in a contiguous, aligned pattern.
// e.g., threads 0-31 access x[0]-x[31]. This is ideal.
//
// The index and step are 64-bit on purpose: with a 32-bit int index,
// `i += step` can leave the int range for large n, still compare as
// `i < n`, and then index out of bounds.
__global__ void coalesced_axpb(float *y, const float *x, float a, float b, int n) {
  const long long step = (long long)gridDim.x * blockDim.x;
  long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  for (; i < n; i += step) {
    y[i] = a * x[i] + b;
  }
}

// --- KERNEL 2: NON-COALESCED (BAD) ---
// Computes y[i] = a * x[(i * 32) % n] + b for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads
// (gridDim.x * blockDim.x), so any grid/block size covers all n elements.
//
// Threads access memory in a "strided" pattern.
// e.g., threads 0-31 access x[0], x[32], x[64], ...
// This forces the GPU to make many separate memory transactions.
//
// All index arithmetic is 64-bit on purpose: in 32-bit int, `i * 32`
// exceeds INT_MAX once i reaches 2^26, which would produce a negative
// read index (an out-of-bounds read) for any n larger than 2^26.
__global__ void non_coalesced_axpb(float *y, const float *x, float a, float b, int n) {
  const long long stride = 32; // A common source of non-coalescing
  const long long step = (long long)gridDim.x * blockDim.x;

  for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += step) {
    // This read is non-coalesced. It's a "gather" operation.
    // The write to y[i] is still coalesced, but the read from x will
    // be the bottleneck.
    // n >= 1 whenever the loop body runs (0 <= i < n), so the modulo is safe.
    const long long read_idx = (i * stride) % n; // Jump around in memory
    y[i] = a * x[read_idx] + b;
  }
}

// --- HOST CODE (Main Function) ---
// Runs one axpb pass (y = a*x + b) over n floats using the memory strategy
// selected at compile time by STRATEGY and the kernel selected by
// KERNEL_MODE. The strategy's transfers and kernel are bracketed by CUDA
// events, and the elapsed time plus an effective bandwidth figure are
// printed to stdout.
//
// Returns 0 on success. Any failing CUDA runtime call, rejected kernel
// launch, or failed host allocation terminates the process with
// EXIT_FAILURE.
int main() {
  // --- 1. Problem Setup ---
  int n = 1 << 26; // 67 million elements (large enough to see effects)
  size_t bytes = n * sizeof(float);
  float a = 2.0f;
  float b = 1.0f;
  
  int deviceId;
  checkCudaErrors(cudaGetDevice(&deviceId));

  // --- 2. Host Memory Allocation (Strategies 1-3) ---
  float *h_x = NULL, *h_y = NULL;
  
  // --- 3. Device Memory Allocation (Strategies 1-3) ---
  float *d_x = NULL, *d_y = NULL;

  // --- 4. Unified Memory Allocation (Strategies 4-5) ---
  float *u_x = NULL, *u_y = NULL; // 'u' for Unified

  // --- 5. Stream & Event Setup ---
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  
  // For Strategy 3 (Async Pinned)
  const int num_streams = 4;
  cudaStream_t streams[num_streams];
  for (int i = 0; i < num_streams; i++) {
    checkCudaErrors(cudaStreamCreate(&streams[i]));
  }
  
  // For Strategy 5 (Prefetch)
  cudaStream_t prefetchStream;
  checkCudaErrors(cudaStreamCreate(&prefetchStream));


  // --- 6. LAUNCH KERNEL (Generic setup) ---
  int blockSize = 256;
  int gridSize = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
  
  void (*kernel_to_run)(float*, const float*, float, float, int);
  if (KERNEL_MODE == 0) {
    kernel_to_run = coalesced_axpb;
  } else {
    kernel_to_run = non_coalesced_axpb;
  }
  
// =========================================================================
// STRATEGY 1: PAGEABLE HOST MEMORY (The Default)
// =========================================================================
#if STRATEGY == 1
  printf("Strategy 1: Pageable Host Memory (Synchronous)\n");
  
  // 1. Allocate Host (Pageable)
  h_x = (float *)malloc(bytes);
  h_y = (float *)malloc(bytes);
  if (h_x == NULL || h_y == NULL) {
    // Without this check a failed allocation would be dereferenced below.
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    free(h_x);
    free(h_y);
    exit(EXIT_FAILURE);
  }
  for (int i = 0; i < n; i++) h_x[i] = (float)i;

  // 2. Allocate Device
  checkCudaErrors(cudaMalloc(&d_x, bytes));
  checkCudaErrors(cudaMalloc(&d_y, bytes));

  // 3. Run and Time (Full process)
  checkCudaErrors(cudaEventRecord(start));
  
  // cudaMemcpy is *blocking* and has hidden work:
  // It must first page-lock the host memory, then transfer.
  checkCudaErrors(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice));
  
  kernel_to_run<<<gridSize, blockSize>>>(d_y, d_x, a, b, n);
  // A rejected launch is only reported through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());
  
  checkCudaErrors(cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost));
  
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));
  
  free(h_x);
  free(h_y);
  checkCudaErrors(cudaFree(d_x));
  checkCudaErrors(cudaFree(d_y));

// =========================================================================
// STRATEGY 2: PINNED HOST MEMORY (Synchronous)
// =========================================================================
#elif STRATEGY == 2
  printf("Strategy 2: Pinned Host Memory (Synchronous)\n");

  // 1. Allocate Host (Pinned)
  // The OS is forbidden from paging this memory.
  checkCudaErrors(cudaHostAlloc(&h_x, bytes, cudaHostAllocDefault));
  checkCudaErrors(cudaHostAlloc(&h_y, bytes, cudaHostAllocDefault));
  for (int i = 0; i < n; i++) h_x[i] = (float)i;
  
  // 2. Allocate Device
  checkCudaErrors(cudaMalloc(&d_x, bytes));
  checkCudaErrors(cudaMalloc(&d_y, bytes));
  
  // 3. Run and Time
  checkCudaErrors(cudaEventRecord(start));

  // This cudaMemcpy is *still synchronous*...
  checkCudaErrors(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice));
  
  kernel_to_run<<<gridSize, blockSize>>>(d_y, d_x, a, b, n);
  // A rejected launch is only reported through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());
  
  // ...but it's faster because the driver can skip the page-locking step.
  checkCudaErrors(cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost));
  
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));

  checkCudaErrors(cudaFreeHost(h_x));
  checkCudaErrors(cudaFreeHost(h_y));
  checkCudaErrors(cudaFree(d_x));
  checkCudaErrors(cudaFree(d_y));
  
// =========================================================================
// STRATEGY 3: PINNED HOST MEMORY (Asynchronous + Streams)
// =========================================================================
#elif STRATEGY == 3
  printf("Strategy 3: Pinned Host Memory (Asynchronous + Streams)\n");
  
  // 1. Allocate Host (Pinned) - Required for Async
  checkCudaErrors(cudaHostAlloc(&h_x, bytes, cudaHostAllocDefault));
  checkCudaErrors(cudaHostAlloc(&h_y, bytes, cudaHostAllocDefault));
  for (int i = 0; i < n; i++) h_x[i] = (float)i;
  
  // 2. Allocate Device
  checkCudaErrors(cudaMalloc(&d_x, bytes));
  checkCudaErrors(cudaMalloc(&d_y, bytes));

  // 3. Run and Time (Pipelined)
  checkCudaErrors(cudaEventRecord(start));

  int chunk_n = n / num_streams;

  for (int i = 0; i < num_streams; i++) {
    int offset = i * chunk_n;
    // The last chunk absorbs the remainder when n is not a multiple of
    // num_streams.
    int c_n = (i == num_streams - 1) ? (n - offset) : chunk_n;
    if (c_n == 0) continue; // nothing to do; a 0-block launch is invalid

    // Byte count and grid size are derived from this chunk's element count
    // so that the copies and the kernel always cover exactly the same range.
    size_t c_bytes = (size_t)c_n * sizeof(float);
    int c_gridSize = (c_n + blockSize - 1) / blockSize;

    // This is the "pipeline":
    // 1. Copy chunk (HtoD) on stream[i]
    checkCudaErrors(cudaMemcpyAsync(d_x + offset, h_x + offset, c_bytes, 
                                   cudaMemcpyHostToDevice, streams[i]));
    
    // 2. Launch kernel on stream[i]
    kernel_to_run<<<c_gridSize, blockSize, 0, streams[i]>>>(
        d_y + offset, d_x + offset, a, b, c_n);
    // A rejected launch is only reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());

    // 3. Copy chunk (DtoH) on stream[i]
    checkCudaErrors(cudaMemcpyAsync(h_y + offset, d_y + offset, c_bytes, 
                                   cudaMemcpyDeviceToHost, streams[i]));
  }
  
  // Wait for ALL streams to finish all work
  checkCudaErrors(cudaDeviceSynchronize());
  
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));

  checkCudaErrors(cudaFreeHost(h_x));
  checkCudaErrors(cudaFreeHost(h_y));
  checkCudaErrors(cudaFree(d_x));
  checkCudaErrors(cudaFree(d_y));

// =========================================================================
// STRATEGY 4: UNIFIED MEMORY (On-Demand Faulting)
// =========================================================================
#elif STRATEGY == 4
  printf("Strategy 4: Unified Memory (On-Demand Faulting)\n");
  
  // 1. Allocate Unified Memory
  // This memory is visible to both CPU and GPU
  checkCudaErrors(cudaMallocManaged(&u_x, bytes));
  checkCudaErrors(cudaMallocManaged(&u_y, bytes));
  
  // 2. Initialize on Host (data is "paged" on the host)
  for (int i = 0; i < n; i++) u_x[i] = (float)i;

  // 3. Run and Time
  checkCudaErrors(cudaEventRecord(start));
  
  // As the kernel runs, it will access u_x[i].
  // The GPU will see this memory is on the host, pause,
  // and migrate the page ("page fault"). This happens
  // for *every single page*, and it is very slow.
  kernel_to_run<<<gridSize, blockSize>>>(u_y, u_x, a, b, n);
  // A rejected launch is only reported through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());
  
  // Wait for kernel to finish
  checkCudaErrors(cudaDeviceSynchronize());
  
  // Now, the CPU tries to read u_y.
  // The memory is on the device, so it page-faults
  // and migrates back to the host.
  // (We'll just touch one element to show)
  // Storing into a volatile keeps the compiler from discarding the read.
  volatile float test = u_y[0]; 
  
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));
  
  (void)test; // Suppress unused variable warning

  checkCudaErrors(cudaFree(u_x));
  checkCudaErrors(cudaFree(u_y));
  
// =========================================================================
// STRATEGY 5: UNIFIED MEMORY (With Prefetching)
// =========================================================================
#elif STRATEGY == 5
  printf("Strategy 5: Unified Memory (With Prefetching)\n");
  
  // 1. Allocate Unified Memory
  checkCudaErrors(cudaMallocManaged(&u_x, bytes));
  checkCudaErrors(cudaMallocManaged(&u_y, bytes));

  // 2. Initialize on Host
  for (int i = 0; i < n; i++) u_x[i] = (float)i;

  // 3. Run and Time
  checkCudaErrors(cudaEventRecord(start));
  
  // --- This is the optimization ---
  // We explicitly tell the driver: "Move all of u_x and u_y to the
  // GPU *right now*, on this stream." This avoids page faults.
  checkCudaErrors(cudaMemPrefetchAsync(u_x, bytes, deviceId, prefetchStream));
  checkCudaErrors(cudaMemPrefetchAsync(u_y, bytes, deviceId, prefetchStream));
  
  // Launch kernel on the *same stream* to ensure it
  // runs *after* the prefetch is complete.
  kernel_to_run<<<gridSize, blockSize, 0, prefetchStream>>>(u_y, u_x, a, b, n);
  // A rejected launch is only reported through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());
  
  // Now, explicitly prefetch the result back to the host
  checkCudaErrors(cudaMemPrefetchAsync(u_y, bytes, cudaCpuDeviceId, prefetchStream));
  
  // Wait for all work on the stream (prefetches, kernel) to finish
  checkCudaErrors(cudaStreamSynchronize(prefetchStream));
  
  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));

  checkCudaErrors(cudaFree(u_x));
  checkCudaErrors(cudaFree(u_y));
  
#else
  // Without a selected strategy the events below are never recorded and the
  // elapsed-time query would fail at runtime; reject it at compile time.
#error "STRATEGY must be 1, 2, 3, 4, or 5"
#endif
// =========================================================================

  // --- 7. Final Timing and Cleanup ---
  float milliseconds = 0;
  checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
  printf("\n--- RESULTS ---\n");
  printf("Kernel Mode:   %s\n", (KERNEL_MODE == 0) ? "COALESCED" : "NON-COALESCED");
  printf("Strategy:      %d\n", STRATEGY);
  printf("Total Time:    %f ms\n", milliseconds);
  printf("Bandwidth (GB/s): %f\n", (bytes * 2.0 / 1e9) / (milliseconds / 1000.0));
  printf("   (Note: Bandwidth calc assumes 1 read + 1 write, valid for axpb)\n");

  // Cleanup streams
  for (int i = 0; i < num_streams; i++) {
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }
  checkCudaErrors(cudaStreamDestroy(prefetchStream));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));

  return 0;
}