Advertisement
phystota

Lab6_1

Oct 25th, 2024
40
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.17 KB | None | 0 0
  1. // MP Scan
  2. // Given a list (lst) of length n
  3. // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
  4. // +
  5. // lst[n-1]}
  6.  
  7. #include <wb.h>
  8. #include <iostream>
  9.  
  10. #define BLOCK_SIZE 512 //@@ You can change this
  11. #define MAX_BLOCKS 65535 // Maximum number of blocks per grid dimension
  12.  
// wbCheck: wraps a CUDA runtime call, logs the failing statement and the CUDA
// error string via wbLog on failure, then returns -1 from the *calling*
// function. Only usable inside a function whose return type accepts -1 (main).
#define wbCheck(stmt)                                                     \
  do {                                                                    \
    cudaError_t err = stmt;                                               \
    if (err != cudaSuccess) {                                             \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
      return -1;                                                          \
    }                                                                     \
  } while (0)
  22.  
  23. __global__ void blockScan(float *input, float *output, int len, float *AuxSum) {
  24.    
  25.   __shared__ float partialSum[2*BLOCK_SIZE];
  26.   unsigned int t = threadIdx.x;
  27.   unsigned int start = 2 * blockIdx.x * blockDim.x;
  28.   if (start + t < len) {
  29.       partialSum[t] = input[start + t];
  30.   } else {
  31.       partialSum[t] = 0.0f;
  32.   }
  33.   if (start + blockDim.x + t < len) {
  34.       partialSum[blockDim.x + t] = input[start + blockDim.x + t];
  35.   } else {
  36.       partialSum[blockDim.x + t] = 0.0f;
  37.   }
  38.   __syncthreads();
  39.  
  40.   int stride = 1;
  41.   while (stride < 2*BLOCK_SIZE) {
  42.     __syncthreads();
  43.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  44.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  45.       partialSum[index] += partialSum[index - stride];
  46.     }
  47.     stride = stride*2;
  48.   }
  49.  
  50.   stride = BLOCK_SIZE/2;
  51.   while (stride > 0) {
  52.     __syncthreads();
  53.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  54.     if ((index + stride) < 2*BLOCK_SIZE){
  55.       partialSum[index+stride] += partialSum[index];
  56.     }
  57.     stride = stride/2;
  58.   }
  59.   if (start + t < len){
  60.     output[start + t] = partialSum[t];
  61.   }
  62.  
  63.   if (threadIdx.x == blockDim.x -1) {
  64.     AuxSum[blockIdx.x] = partialSum[2*blockDim.x - 1];
  65.   }
  66. }
  67.  
  68. /////////////////////// KERNEL 2////////////////////////////
  69.  
  70. __global__ void auxScan(float *AuxSum, int len) {
  71.    
  72.   __shared__ float partialSum[2*BLOCK_SIZE];
  73.   unsigned int t = threadIdx.x;
  74.   unsigned int start = 2 * blockIdx.x * blockDim.x;    
  75.  
  76.   if (start + t < len) {
  77.       partialSum[t] = AuxSum[start + t];
  78.   } else {
  79.       partialSum[t] = 0.0f;
  80.   }
  81.   if (start + blockDim.x + t < len) {
  82.       partialSum[blockDim.x + t] = AuxSum[start + blockDim.x + t];
  83.   } else {
  84.       partialSum[blockDim.x + t] = 0.0f;
  85.   }
  86.   __syncthreads();
  87.  
  88.   int stride = 1;
  89.   while (stride < 2*BLOCK_SIZE) {
  90.     __syncthreads();
  91.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  92.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  93.       partialSum[index] += partialSum[index - stride];
  94.     }
  95.     stride = stride*2;
  96.   }
  97.  
  98.   stride = BLOCK_SIZE/2;
  99.   while (stride > 0) {
  100.     __syncthreads();
  101.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  102.     if ((index + stride) < 2*BLOCK_SIZE){
  103.       partialSum[index+stride] += partialSum[index];
  104.     }
  105.     stride = stride/2;
  106.   }
  107.   if (start + t < len) {
  108.       AuxSum[start + t] = partialSum[t];
  109.   }
  110. }
  111.  
  112. __global__ void sum(float *input, float *output, float *AuxSum, int len) {
  113.  
  114.   unsigned int t = blockDim.x * blockIdx.x + threadIdx.x;
  115.  
  116.   if (t < len && blockIdx.x > 0){
  117.     output[t] = input[t] + AuxSum[blockIdx.x - 1];
  118.   }
  119. }
  120.  
  121. int main(int argc, char **argv) {
  122.   wbArg_t args;
  123.   float *hostInput;  // The input 1D list
  124.   float *hostOutput1; // the final output
  125.   float *deviceInput;
  126.   float *deviceInput1;
  127.   float *deviceOutput;
  128.   float *deviceOutput1;
  129.   float *deviceAuxSum;
  130.   int numElements; // number of elements in the list
  131.  
  132.   args = wbArg_read(argc, argv);
  133.  
  134.   // Import data and create memory on host
  135.   // The number of input elements in the input is numElements
  136.   hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  137.   hostOutput1 = (float *)malloc(numElements * sizeof(float));
  138.  
  139.   int len_aux = ceil(numElements / (2*BLOCK_SIZE));
  140.  
  141.   // Allocate GPU memory.
  142.   wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  143.   wbCheck(cudaMalloc((void **)&deviceInput1, numElements * sizeof(float)));
  144.   wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  145.   wbCheck(cudaMalloc((void **)&deviceOutput1, numElements * sizeof(float)));
  146.   wbCheck(cudaMalloc((void **)&deviceAuxSum, len_aux * sizeof(float)));
  147.  
  148.  
  149.   // Clear output memory.
  150.   wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  151.   wbCheck(cudaMemset(deviceOutput1, 0, numElements * sizeof(float)));  
  152.   wbCheck(cudaMemset(deviceAuxSum, 0, len_aux * sizeof(float)));
  153.  
  154.   // Copy input memory to the GPU.
  155.   wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));              
  156.  
  157.   //@@ Initialize the grid and block dimensions here --- Kernels 1
  158.  
  159.  
  160.   dim3 DimGrid(ceil(numElements / (2*BLOCK_SIZE)), 1, 1);
  161.   dim3 DimBlock(BLOCK_SIZE, 1, 1);
  162.  
  163.   blockScan<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numElements, deviceAuxSum);
  164.   cudaDeviceSynchronize();
  165.                    
  166.   //@@ Initialize the grid and block dimensions here --- Kernel 2
  167.   dim3 AuxGrid(ceil(len_aux / (2*BLOCK_SIZE)), 1, 1);
  168.  
  169.   auxScan<<<AuxGrid, DimBlock>>>(deviceAuxSum, len_aux);
  170.   cudaDeviceSynchronize();  
  171.  
  172.   wbCheck(cudaMemcpy(deviceInput1, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToDevice));
  173.  
  174.   //@@ Initialize the grid and block dimensions here --- Kernel 3
  175.   sum<<<DimGrid, DimBlock>>>(deviceInput1, deviceOutput1, deviceAuxSum, numElements);
  176.   cudaDeviceSynchronize();
  177.  
  178.   // Copying output memory to the CPU from Kernel 3
  179.   wbCheck(cudaMemcpy(hostOutput1, deviceOutput1, numElements * sizeof(float), cudaMemcpyDeviceToHost));
  180.  
  181.   //@@  Free GPU Memory
  182.  
  183.     cudaFree(deviceInput);
  184.     cudaFree(deviceInput1);
  185.     cudaFree(deviceOutput);
  186.     cudaFree(deviceOutput1);
  187.     cudaFree(deviceAuxSum);
  188.  
  189.  
  190.   wbSolution(args, hostOutput1, numElements);
  191.  
  192.   free(hostInput);
  193.   free(hostOutput1);
  194.  
  195.   return 0;
  196. }
  197.  
  198.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement