Advertisement
phystota

Lab6_1

Oct 25th, 2024
56
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.17 KB | None | 0 0
  1. // MP Scan
  2. // Given a list (lst) of length n
  3. // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
  4. // +
  5. // lst[n-1]}
  6.  
  7. #include <wb.h>
  8. #include <iostream>
  9.  
  10. #define BLOCK_SIZE 512 //@@ You can change this
  11. #define MAX_BLOCKS 65535 // Maximum number of blocks per grid dimension
  12.  
  13. #define wbCheck(stmt)                                                     \
  14.   do {                                                                    \
  15.     cudaError_t err = stmt;                                               \
  16.     if (err != cudaSuccess) {                                             \
  17.       wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
  18.       wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
  19.       return -1;                                                          \
  20.     }                                                                     \
  21.   } while (0)
  22.  
  23. __global__ void blockScan(float *input, float *output, int len, float *AuxSum) {
  24.    
  25.   __shared__ float partialSum[2*BLOCK_SIZE];
  26.   unsigned int t = threadIdx.x;
  27.   unsigned int start = 2 * blockIdx.x * blockDim.x;
  28.   if (start + t < len) {
  29.       partialSum[t] = input[start + t];
  30.   } else {
  31.       partialSum[t] = 0.0f;
  32.   }
  33.   if (start + blockDim.x + t < len) {
  34.       partialSum[blockDim.x + t] = input[start + blockDim.x + t];
  35.   } else {
  36.       partialSum[blockDim.x + t] = 0.0f;
  37.   }
  38.   __syncthreads();
  39.  
  40.   int stride = 1;
  41.   while (stride < 2*BLOCK_SIZE) {
  42.     __syncthreads();
  43.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  44.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  45.       partialSum[index] += partialSum[index - stride];
  46.     }
  47.     stride = stride*2;
  48.   }
  49.  
  50.   stride = BLOCK_SIZE/2;
  51.   while (stride > 0) {
  52.     __syncthreads();
  53.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  54.     if ((index + stride) < 2*BLOCK_SIZE){
  55.       partialSum[index+stride] += partialSum[index];
  56.     }
  57.     stride = stride/2;
  58.   }
  59.   if (start + t < len){
  60.     output[start + t] = partialSum[t];
  61.   }
  62.  
  63.   if (threadIdx.x == blockDim.x -1) {
  64.     AuxSum[blockIdx.x] = partialSum[2*blockDim.x - 1];
  65.   }
  66. }
  67.  
  68. /////////////////////// KERNEL 2////////////////////////////
  69.  
  70. __global__ void auxScan(float *AuxSum, int len) {
  71.    
  72.   __shared__ float partialSum[2*BLOCK_SIZE];
  73.   unsigned int t = threadIdx.x;
  74.   unsigned int start = 2 * blockIdx.x * blockDim.x;    
  75.  
  76.   if (start + t < len) {
  77.       partialSum[t] = AuxSum[start + t];
  78.   } else {
  79.       partialSum[t] = 0.0f;
  80.   }
  81.   if (start + blockDim.x + t < len) {
  82.       partialSum[blockDim.x + t] = AuxSum[start + blockDim.x + t];
  83.   } else {
  84.       partialSum[blockDim.x + t] = 0.0f;
  85.   }
  86.   __syncthreads();
  87.  
  88.   int stride = 1;
  89.   while (stride < 2*BLOCK_SIZE) {
  90.     __syncthreads();
  91.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  92.     if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
  93.       partialSum[index] += partialSum[index - stride];
  94.     }
  95.     stride = stride*2;
  96.   }
  97.  
  98.   stride = BLOCK_SIZE/2;
  99.   while (stride > 0) {
  100.     __syncthreads();
  101.     int index = (threadIdx.x + 1) * stride * 2 - 1;
  102.     if ((index + stride) < 2*BLOCK_SIZE){
  103.       partialSum[index+stride] += partialSum[index];
  104.     }
  105.     stride = stride/2;
  106.   }
  107.   if (start + t < len) {
  108.       AuxSum[start + t] = partialSum[t];
  109.   }
  110. }
  111.  
  112. __global__ void sum(float *input, float *output, float *AuxSum, int len) {
  113.  
  114.   unsigned int t = blockDim.x * blockIdx.x + threadIdx.x;
  115.  
  116.   if (t < len && blockIdx.x > 0){
  117.     output[t] = input[t] + AuxSum[blockIdx.x - 1];
  118.   }
  119. }
  120.  
  121. int main(int argc, char **argv) {
  122.   wbArg_t args;
  123.   float *hostInput;  // The input 1D list
  124.   float *hostOutput1; // the final output
  125.   float *deviceInput;
  126.   float *deviceInput1;
  127.   float *deviceOutput;
  128.   float *deviceOutput1;
  129.   float *deviceAuxSum;
  130.   int numElements; // number of elements in the list
  131.  
  132.   args = wbArg_read(argc, argv);
  133.  
  134.   // Import data and create memory on host
  135.   // The number of input elements in the input is numElements
  136.   hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  137.   hostOutput1 = (float *)malloc(numElements * sizeof(float));
  138.  
  139.   int len_aux = ceil(numElements / (2*BLOCK_SIZE));
  140.  
  141.   // Allocate GPU memory.
  142.   wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  143.   wbCheck(cudaMalloc((void **)&deviceInput1, numElements * sizeof(float)));
  144.   wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  145.   wbCheck(cudaMalloc((void **)&deviceOutput1, numElements * sizeof(float)));
  146.   wbCheck(cudaMalloc((void **)&deviceAuxSum, len_aux * sizeof(float)));
  147.  
  148.  
  149.   // Clear output memory.
  150.   wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  151.   wbCheck(cudaMemset(deviceOutput1, 0, numElements * sizeof(float)));  
  152.   wbCheck(cudaMemset(deviceAuxSum, 0, len_aux * sizeof(float)));
  153.  
  154.   // Copy input memory to the GPU.
  155.   wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));              
  156.  
  157.   //@@ Initialize the grid and block dimensions here --- Kernels 1
  158.  
  159.  
  160.   dim3 DimGrid(ceil(numElements / (2*BLOCK_SIZE)), 1, 1);
  161.   dim3 DimBlock(BLOCK_SIZE, 1, 1);
  162.  
  163.   blockScan<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numElements, deviceAuxSum);
  164.   cudaDeviceSynchronize();
  165.                    
  166.   //@@ Initialize the grid and block dimensions here --- Kernel 2
  167.   dim3 AuxGrid(ceil(len_aux / (2*BLOCK_SIZE)), 1, 1);
  168.  
  169.   auxScan<<<AuxGrid, DimBlock>>>(deviceAuxSum, len_aux);
  170.   cudaDeviceSynchronize();  
  171.  
  172.   wbCheck(cudaMemcpy(deviceInput1, deviceOutput, numElements * sizeof(float), cudaMemcpyDeviceToDevice));
  173.  
  174.   //@@ Initialize the grid and block dimensions here --- Kernel 3
  175.   sum<<<DimGrid, DimBlock>>>(deviceInput1, deviceOutput1, deviceAuxSum, numElements);
  176.   cudaDeviceSynchronize();
  177.  
  178.   // Copying output memory to the CPU from Kernel 3
  179.   wbCheck(cudaMemcpy(hostOutput1, deviceOutput1, numElements * sizeof(float), cudaMemcpyDeviceToHost));
  180.  
  181.   //@@  Free GPU Memory
  182.  
  183.     cudaFree(deviceInput);
  184.     cudaFree(deviceInput1);
  185.     cudaFree(deviceOutput);
  186.     cudaFree(deviceOutput1);
  187.     cudaFree(deviceAuxSum);
  188.  
  189.  
  190.   wbSolution(args, hostOutput1, numElements);
  191.  
  192.   free(hostInput);
  193.   free(hostOutput1);
  194.  
  195.   return 0;
  196. }
  197.  
  198.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement