Advertisement
phystota

Lab6_final

Oct 26th, 2024 (edited)
52
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.98 KB | None | 0 0
  1. // MP Scan
  2. // Given a list (lst) of length n
  3. // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
  4. // +
  5. // lst[n-1]}
  6.  
  7. #include <wb.h>
  8. #include <iostream>
  9.  
  10. #define BLOCK_SIZE 512 //@@ You can change this
  11. #define MAX_BLOCKS 65535 // Maximum number of blocks per grid dimension
  12.  
  13. #define wbCheck(stmt)                                                     \
  14.   do {                                                                    \
  15.     cudaError_t err = stmt;                                               \
  16.     if (err != cudaSuccess) {                                             \
  17.       wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
  18.       wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
  19.       return -1;                                                          \
  20.     }                                                                     \
  21.   } while (0)
  22.  
  23. ////// KERNEL 1//////////////
  24. __global__ void blockScan(float *input, float *output, int len, float *AuxSum) {
  25.    
  26.   __shared__ float partialSum[2*BLOCK_SIZE];
  27.   unsigned int t = threadIdx.x;
  28.   unsigned int start = 2 * blockIdx.x * blockDim.x;
  29.   if (start + t < len) {
  30.       partialSum[t] = input[start + t];
  31.   } else {
  32.       partialSum[t] = 0.0f;
  33.   }
  34.   if (start + blockDim.x + t < len) {
  35.       partialSum[blockDim.x + t] = input[start + blockDim.x + t];
  36.   } else {
  37.       partialSum[blockDim.x + t] = 0.0f;
  38.   }
  39.   __syncthreads();
  40.  
  41.   int stride = 1;
  42.   while (stride < 2*BLOCK_SIZE) {
  43.     __syncthreads();
  44.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  45.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  46.       partialSum[index] += partialSum[index - stride];
  47.     }
  48.     stride = stride*2;
  49.   }
  50.  __syncthreads();
  51.   stride = BLOCK_SIZE/2;
  52.   while (stride > 0) {
  53.     __syncthreads();
  54.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  55.     if ((index + stride) < 2*BLOCK_SIZE){
  56.       partialSum[index+stride] += partialSum[index];
  57.     }
  58.     stride = stride/2;
  59.   }
  60.  __syncthreads();
  61.   if (start + t < len){
  62.     output[start + t] = partialSum[t];
  63.   }
  64.   if (start + blockDim.x + t < len) {
  65.     output[start + blockDim.x + t] = partialSum[blockDim.x + t];
  66.   }
  67.  
  68.   if (threadIdx.x == blockDim.x -1) {
  69.     AuxSum[blockIdx.x] = partialSum[2*blockDim.x - 1];
  70.   }
  71.  
  72. }
  73.  
  74. /////////////////////// KERNEL 2////////////////////////////
  75.  
  76. __global__ void auxScan(float *AuxSum, int len) {
  77.    
  78.   __shared__ float partialSum[2*BLOCK_SIZE];
  79.   unsigned int t = threadIdx.x;
  80.   unsigned int start = 2 * blockIdx.x * blockDim.x;    
  81.  
  82.   if (start + t < len) {
  83.       partialSum[t] = AuxSum[start + t];
  84.   } else {
  85.       partialSum[t] = 0.0f;
  86.   }
  87.   if (start + blockDim.x + t < len) {
  88.       partialSum[blockDim.x + t] = AuxSum[start + blockDim.x + t];
  89.   } else {
  90.       partialSum[blockDim.x + t] = 0.0f;
  91.   }
  92.   __syncthreads();
  93.  
  94.   int stride = 1;
  95.   while (stride < 2*BLOCK_SIZE) {
  96.     __syncthreads();
  97.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  98.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  99.       partialSum[index] += partialSum[index - stride];
  100.     }
  101.     stride = stride*2;
  102.   }
  103.    __syncthreads();
  104.  
  105.   stride = BLOCK_SIZE/2;
  106.   while (stride > 0) {
  107.     __syncthreads();
  108.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  109.     if ((index + stride) < 2*BLOCK_SIZE){
  110.       partialSum[index+stride] += partialSum[index];
  111.     }
  112.     stride = stride/2;
  113.   }
  114.    __syncthreads();
  115.  
  116.   if (start + t < len) {
  117.       AuxSum[start + t] = partialSum[t];
  118.   }
  119.   if (start + t + blockDim.x < len) {
  120.       AuxSum[start + blockDim.x + t] = partialSum[t + blockDim.x];
  121.   }  
  122. }
  123.  
  124. __global__ void sum(float *input, float *output, float *AuxSum, int len) {
  125.    
  126.     unsigned int t = threadIdx.x;
  127.     unsigned int start = 2 * blockDim.x * blockIdx.x;
  128.     if (blockIdx.x == 0){
  129.       if (t < len){
  130.         output[t] = input[t];
  131.       }
  132.       if (t + blockDim.x < len){
  133.         output[t + blockDim.x] = input[t + blockDim.x];
  134.       }
  135.     }
  136.     else {
  137.       if(t + start < len){
  138.         output[t + start] = input[t + start] + AuxSum[blockIdx.x-1];
  139.       }
  140.       if (t + start + blockDim.x < len){
  141.         output[t + start + blockDim.x] = input[t + start + blockDim.x] + AuxSum[blockIdx.x-1];
  142.       }
  143.     }
  144. }
  145.  
  146. int main(int argc, char **argv) {
  147.   wbArg_t args;
  148.   float *hostInput;  // The input 1D list
  149.   float *hostOutput1; // the final output
  150.   float *deviceInput;
  151.   float *deviceInput1;
  152.   float *deviceOutput;
  153.   float *deviceOutput1;
  154.   float *deviceAuxSum;
  155.   int numElements; // number of elements in the list
  156.  
  157.   args = wbArg_read(argc, argv);
  158.  
  159.   // Import data and create memory on host
  160.   // The number of input elements in the input is numElements
  161.   hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  162.   hostOutput1 = (float *)malloc(numElements * sizeof(float));
  163.  
  164.   int len_aux = ceil(1.0*numElements/(2*BLOCK_SIZE));
  165.  
  166.   // Allocate GPU memory.
  167.   wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  168.   wbCheck(cudaMalloc((void **)&deviceInput1, numElements * sizeof(float)));
  169.   wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  170.   wbCheck(cudaMalloc((void **)&deviceOutput1, numElements * sizeof(float)));
  171.   wbCheck(cudaMalloc((void **)&deviceAuxSum, len_aux * sizeof(float)));
  172.  
  173.  
  174.   // Clear output memory.
  175.   wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  176.   wbCheck(cudaMemset(deviceOutput1, 0, numElements * sizeof(float)));  
  177.   wbCheck(cudaMemset(deviceAuxSum, 0, len_aux * sizeof(float)));
  178.  
  179.   // Copy input memory to the GPU.
  180.   wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));              
  181.  
  182.   //@@ Initialize the grid and block dimensions here --- Kernels 1
  183.  
  184.  
  185.   dim3 DimGrid(ceil(1.0*numElements/(2*BLOCK_SIZE)), 1, 1);
  186.   dim3 DimBlock(BLOCK_SIZE, 1, 1);
  187.  
  188.   blockScan<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numElements, deviceAuxSum);
  189.   wbCheck(cudaGetLastError());
  190.   cudaDeviceSynchronize();
  191.  
  192.   //@@ Initialize the grid and block dimensions here --- Kernel 2
  193.   dim3 AuxGrid(ceil(1.0*len_aux/ (2*BLOCK_SIZE)), 1, 1);
  194.  
  195.   auxScan<<<AuxGrid, DimBlock>>>(deviceAuxSum, len_aux);
  196.  
  197.   wbCheck(cudaGetLastError());
  198.   cudaDeviceSynchronize();  
  199.  
  200.  
  201.   wbCheck(cudaMemcpy(deviceInput1, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToDevice));
  202.  
  203.   //@@ Initialize the grid and block dimensions here --- Kernel 3
  204.   sum<<<DimGrid, DimBlock>>>(deviceInput1, deviceOutput1, deviceAuxSum, numElements);
  205.   wbCheck(cudaGetLastError());
  206.   cudaDeviceSynchronize();
  207.  
  208.   // Copying output memory to the CPU from Kernel 3
  209.   wbCheck(cudaMemcpy(hostOutput1, deviceOutput1, numElements * sizeof(float), cudaMemcpyDeviceToHost));
  210.  
  211.  
  212.   //@@  Free GPU Memory
  213.  
  214.     cudaFree(deviceInput);
  215.     cudaFree(deviceInput1);
  216.     cudaFree(deviceOutput);
  217.     cudaFree(deviceOutput1);
  218.     cudaFree(deviceAuxSum);
  219.  
  220.  
  221.   wbSolution(args, hostOutput1, numElements);
  222.  
  223.   free(hostInput);
  224.   free(hostOutput1);
  225.  
  226.   return 0;
  227. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement