Advertisement
phystota

Lab6_2

Oct 25th, 2024
42
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.58 KB | None | 0 0
  1. // MP Scan
  2. // Given a list (lst) of length n
  3. // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
  4. // +
  5. // lst[n-1]}
  6.  
  7. #include <wb.h>
  8. #include <iostream>
  9.  
  10. #define BLOCK_SIZE 512 //@@ You can change this
  11. #define MAX_BLOCKS 65535 // Maximum number of blocks per grid dimension
  12.  
  13. #define wbCheck(stmt)                                                     \
  14.   do {                                                                    \
  15.     cudaError_t err = stmt;                                               \
  16.     if (err != cudaSuccess) {                                             \
  17.       wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
  18.       wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
  19.       return -1;                                                          \
  20.     }                                                                     \
  21.   } while (0)
  22.  
  23. ////// KERNEL 1//////////////
  24. __global__ void blockScan(float *input, float *output, int len, float *AuxSum) {
  25.    
  26.   __shared__ float partialSum[2*BLOCK_SIZE];
  27.   unsigned int t = threadIdx.x;
  28.   unsigned int start = 2 * blockIdx.x * blockDim.x;
  29.   if (start + t < len) {
  30.       partialSum[t] = input[start + t];
  31.   } else {
  32.       partialSum[t] = 0.0f;
  33.   }
  34.   if (start + blockDim.x + t < len) {
  35.       partialSum[blockDim.x + t] = input[start + blockDim.x + t];
  36.   } else {
  37.       partialSum[blockDim.x + t] = 0.0f;
  38.   }
  39.   __syncthreads();
  40.  
  41.   int stride = 1;
  42.   while (stride < 2*BLOCK_SIZE) {
  43.     __syncthreads();
  44.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  45.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  46.       partialSum[index] += partialSum[index - stride];
  47.     }
  48.     stride = stride*2;
  49.   }
  50.  
  51.   stride = BLOCK_SIZE/2;
  52.   while (stride > 0) {
  53.     __syncthreads();
  54.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  55.     if ((index + stride) < 2*BLOCK_SIZE){
  56.       partialSum[index+stride] += partialSum[index];
  57.     }
  58.     stride = stride/2;
  59.   }
  60.   if (start + t < len){
  61.     output[start + t] = partialSum[t];
  62.   }
  63.   if (start + blockDim.x + t < len) {
  64.     output[start + blockDim.x + t] = partialSum[blockDim.x + t];
  65.   }
  66.  
  67.   if (threadIdx.x == blockDim.x -1) {
  68.     AuxSum[blockIdx.x] = partialSum[2*blockDim.x - 1];
  69.   }
  70.  
  71. }
  72.  
  73. /////////////////////// KERNEL 2////////////////////////////
  74.  
  75. __global__ void auxScan(float *AuxSum, int len) {
  76.    
  77.   __shared__ float partialSum[2*BLOCK_SIZE];
  78.   unsigned int t = threadIdx.x;
  79.   unsigned int start = 2 * blockIdx.x * blockDim.x;    
  80.  
  81.   if (start + t < len) {
  82.       partialSum[t] = AuxSum[start + t];
  83.   } else {
  84.       partialSum[t] = 0.0f;
  85.   }
  86.   if (start + blockDim.x + t < len) {
  87.       partialSum[blockDim.x + t] = AuxSum[start + blockDim.x + t];
  88.   } else {
  89.       partialSum[blockDim.x + t] = 0.0f;
  90.   }
  91.   __syncthreads();
  92.  
  93.   int stride = 1;
  94.   while (stride < 2*BLOCK_SIZE) {
  95.     __syncthreads();
  96.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  97.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  98.       partialSum[index] += partialSum[index - stride];
  99.     }
  100.     stride = stride*2;
  101.   }
  102.  
  103.   stride = BLOCK_SIZE/2;
  104.   while (stride > 0) {
  105.     __syncthreads();
  106.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  107.     if ((index + stride) < 2*BLOCK_SIZE){
  108.       partialSum[index+stride] += partialSum[index];
  109.     }
  110.     stride = stride/2;
  111.   }
  112.   if (start + t < len) {
  113.       AuxSum[start + t] = partialSum[t];
  114.   }
  115.   if (start + t + blockDim.x < len) {
  116.       AuxSum[start + blockDim.x + t] = partialSum[t + blockDim.x];
  117.   }  
  118. }
  119.  
  120. __global__ void sum(float *input, float *output, float *AuxSum, int len) {
  121.  
  122.   unsigned int t = blockDim.x * blockIdx.x + threadIdx.x;
  123.  
  124.   if (t < len && blockIdx.x > 0){
  125.       output[t] = input[t] + AuxSum[blockIdx.x - 1];
  126.   } else if (t < len) {
  127.      output[t] = input[t];  // Copy elements from first block without modification
  128.   }
  129. }
  130.  
  131. int main(int argc, char **argv) {
  132.   wbArg_t args;
  133.   float *hostInput;  // The input 1D list
  134.   float *hostOutput1; // the final output
  135.   float *deviceInput;
  136.   float *deviceInput1;
  137.   float *deviceOutput;
  138.   float *deviceOutput1;
  139.   float *deviceAuxSum;
  140.   int numElements; // number of elements in the list
  141.  
  142.   args = wbArg_read(argc, argv);
  143.  
  144.   // Import data and create memory on host
  145.   // The number of input elements in the input is numElements
  146.   hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  147.   hostOutput1 = (float *)malloc(numElements * sizeof(float));
  148.  
  149.   int len_aux = (numElements + (2*BLOCK_SIZE) - 1) / (2*BLOCK_SIZE);
  150.  
  151.   // Allocate GPU memory.
  152.   wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  153.   wbCheck(cudaMalloc((void **)&deviceInput1, numElements * sizeof(float)));
  154.   wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  155.   wbCheck(cudaMalloc((void **)&deviceOutput1, numElements * sizeof(float)));
  156.   wbCheck(cudaMalloc((void **)&deviceAuxSum, len_aux * sizeof(float)));
  157.  
  158.  
  159.   // Clear output memory.
  160.   wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  161.   wbCheck(cudaMemset(deviceOutput1, 0, numElements * sizeof(float)));  
  162.   wbCheck(cudaMemset(deviceAuxSum, 0, len_aux * sizeof(float)));
  163.  
  164.   // Copy input memory to the GPU.
  165.   wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));              
  166.  
  167.   //@@ Initialize the grid and block dimensions here --- Kernels 1
  168.  
  169.  
  170.   dim3 DimGrid((numElements + (2*BLOCK_SIZE) - 1) / (2*BLOCK_SIZE), 1, 1);
  171.   dim3 DimBlock(BLOCK_SIZE, 1, 1);
  172.  
  173.   blockScan<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numElements, deviceAuxSum);
  174.   cudaDeviceSynchronize();
  175.                    
  176.   //@@ Initialize the grid and block dimensions here --- Kernel 2
  177.   dim3 AuxGrid((len_aux + (2*BLOCK_SIZE) - 1) / (2*BLOCK_SIZE), 1, 1);
  178.  
  179.   auxScan<<<AuxGrid, DimBlock>>>(deviceAuxSum, len_aux);
  180.   cudaDeviceSynchronize();  
  181.  
  182.   wbCheck(cudaMemcpy(deviceInput1, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToDevice));
  183.  
  184.   //@@ Initialize the grid and block dimensions here --- Kernel 3
  185.   sum<<<DimGrid, DimBlock>>>(deviceInput1, deviceOutput1, deviceAuxSum, numElements);
  186.   cudaDeviceSynchronize();
  187.  
  188.   // Copying output memory to the CPU from Kernel 3
  189.   wbCheck(cudaMemcpy(hostOutput1, deviceOutput1, numElements * sizeof(float), cudaMemcpyDeviceToHost));
  190.  
  191.   //@@  Free GPU Memory
  192.  
  193.     cudaFree(deviceInput);
  194.     cudaFree(deviceInput1);
  195.     cudaFree(deviceOutput);
  196.     cudaFree(deviceOutput1);
  197.     cudaFree(deviceAuxSum);
  198.  
  199.  
  200.   wbSolution(args, hostOutput1, numElements);
  201.  
  202.   free(hostInput);
  203.   free(hostOutput1);
  204.  
  205.   return 0;
  206. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement