Advertisement
phystota

lab5

Oct 16th, 2024
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.97 KB | None | 0 0
  1. // MP5 Reduction
  2. // Input: A list of numbers of length n
  3. // Output: Sum of the list = list[0] + list[1] + ... + list[n-1];
  4.  
  5. #include <wb.h>
  6.  
  7. #define BLOCK_SIZE 512 //@@ This value is not fixed and you can adjust it according to the situation
  8.  
  9. #define wbCheck(stmt)                                                     \
  10.   do {                                                                    \
  11.     cudaError_t err = stmt;                                               \
  12.     if (err != cudaSuccess) {                                             \
  13.       wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
  14.       wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
  15.       return -1;                                                          \
  16.     }                                                                     \
  17.   } while (0)
  18.  
  19. __global__ void total(float *input, float *output, int len) {
  20.   //@@ Load a segment of the input vector into shared memory
  21.   //@@ Traverse the reduction tree
  22.   //@@ Write the computed sum of the block to the output vector at the correct index
  23. __shared__ float partialSum[2*BLOCK_SIZE];
  24.  
  25.  
  26. unsigned int t = threadIdx.x;
  27. unsigned int start = 2*blockIdx.x * blockDim.x;
  28.  
  29. if (start + t < len){
  30.   partialSum[t] = input[start + t];
  31. }
  32. else {
  33.   partialSum[t] = 0.0;
  34. }
  35. if (start + blockDim.x + t < len){
  36.   partialSum[blockDim.x + t] = input[start + blockDim.x + t];
  37. }
  38. else {
  39.   partialSum[blockDim.x + t] = 0.0;
  40. }
  41.  
  42.  
  43. for (unsigned int stride = blockDim.x; stride >=1; stride >>= 1){
  44.   __syncthreads();
  45.   if (t < stride){
  46.     partialSum[t] +=partialSum[t + stride];
  47.   }
  48. }
  49. if (t==0) {
  50.   output[blockIdx.x] = partialSum[t];
  51. }
  52.  
  53. }
  54.  
  55. int main(int argc, char **argv) {
  56.   int ii;
  57.   wbArg_t args;
  58.   float *hostInput;  // The input 1D list
  59.   float *hostOutput; // The output list
  60.   //@@ Initialize device input and output pointers
  61.  
  62.   int numInputElements;  // number of elements in the input list
  63.   int numOutputElements; // number of elements in the output list
  64.  
  65.   args = wbArg_read(argc, argv);
  66.  
  67.   //Import data and create memory on host
  68.   hostInput =
  69.       (float *)wbImport(wbArg_getInputFile(args, 0), &numInputElements);
  70.  
  71.   numOutputElements = numInputElements / (BLOCK_SIZE << 1);
  72.   if (numInputElements % (BLOCK_SIZE << 1)) {
  73.     numOutputElements++;
  74.   }
  75.   hostOutput = (float *)malloc(numOutputElements * sizeof(float));
  76.  
  77.   // The number of input elements in the input is numInputElements
  78.   // The number of output elements in the output is numOutputElements
  79.  
  80.   //@@ Allocate GPU memory
  81.   float *deviceInput, *deviceOutput;
  82.   cudaMalloc((void **)&deviceInput, numInputElements*sizeof(float));
  83.   cudaMalloc((void **)&deviceOutput, numOutputElements*sizeof(float));  
  84.  
  85.   //@@ Copy input memory to the GPU
  86.   cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float), cudaMemcpyHostToDevice);
  87.  
  88.   //@@ Initialize the grid and block dimensions here
  89.  
  90.   dim3 DimGrid(ceil(numInputElements/(2.0*BLOCK_SIZE)), 1, 1);
  91.   dim3 DimBlock(BLOCK_SIZE, 1, 1);
  92.  
  93.  
  94.   //@@ Launch the GPU Kernel and perform CUDA computation
  95.  
  96.   total<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numInputElements);  
  97.  
  98.   cudaDeviceSynchronize();  
  99.   //@@ Copy the GPU output memory back to the CPU
  100.  
  101.   cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float), cudaMemcpyDeviceToHost);
  102.  
  103.   /********************************************************************
  104.    * Reduce output vector on the host
  105.    * NOTE: One could also perform the reduction of the output vector
  106.    * recursively and support any size input.
  107.    * For simplicity, we do not require that for this lab.
  108.    ********************************************************************/
  109.   for (ii = 1; ii < numOutputElements; ii++) {
  110.     hostOutput[0] += hostOutput[ii];
  111.   }
  112.  
  113.   //@@ Free the GPU memory
  114.  
  115.   cudaFree(deviceInput);
  116.   cudaFree(deviceOutput);
  117.  
  118.  
  119.   wbSolution(args, hostOutput, 1);
  120.  
  121.   free(hostInput);
  122.   free(hostOutput);
  123.  
  124.   return 0;
  125. }
  126.  
  127.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement