Advertisement
phystota

lab5_1

Oct 18th, 2024
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.18 KB | None | 0 0
// MP5 Reduction
// Input: A num list of length n
// Output: Sum of the list = list[0] + list[1] + ... + list[n-1];

#include <wb.h>

// Threads per block. Each block reduces 2*BLOCK_SIZE input elements,
// because every thread loads two elements into shared memory.
#define BLOCK_SIZE 512 //@@ This value is not fixed and you can adjust it according to the situation

// Error-checking wrapper for CUDA runtime calls: logs the failing
// statement and the CUDA error string, then returns -1 from the caller.
// do/while(0) makes the macro safe to use as a single statement.
#define wbCheck(stmt)                                                     \
  do {                                                                    \
    cudaError_t err = stmt;                                               \
    if (err != cudaSuccess) {                                             \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
      return -1;                                                          \
    }                                                                     \
  } while (0)
  19. __global__ void total(float *input, float *output, int len) {
  20.   //@@ Load a segment of the input vector into shared memory
  21.   //@@ Traverse the reduction tree
  22.   //@@ Write the computed sum of the block to the output vector at the correct index
  23. __shared__ float partialSum[2*BLOCK_SIZE];
  24.  
  25.  
  26. unsigned int t = threadIdx.x;
  27. unsigned int start = 2 * blockIdx.x * blockDim.x;
  28.  
  29. if (start + t < len){
  30.   partialSum[t] = input[start + t];
  31. }
  32. else {
  33.   partialSum[t] = 0.0;
  34. }
  35. if (start + blockDim.x + t < len){
  36.   partialSum[blockDim.x + t] = input[start + blockDim.x + t];
  37. }
  38. else {
  39.   partialSum[blockDim.x + t] = 0.0;
  40. }
  41.  
  42. __syncthreads();
  43. for (unsigned int stride = blockDim.x; stride >=1; stride /= 2){
  44.   __syncthreads();
  45.   if (t < stride){
  46.     partialSum[t] +=partialSum[t + stride];
  47.   }
  48. }
  49. if (t==0) {
  50.   output[blockIdx.x] = partialSum[t];
  51. }
  52.  
  53. }
  54.  
  55. int main(int argc, char **argv) {
  56.   int ii;
  57.   wbArg_t args;
  58.   float *hostInput;  // The input 1D list
  59.   float *hostOutput; // The output list
  60.   //@@ Initialize device input and output pointers
  61.  
  62.   int numInputElements;  // number of elements in the input list
  63.   int numOutputElements; // number of elements in the output list
  64.  
  65.   args = wbArg_read(argc, argv);
  66.  
  67.   //Import data and create memory on host
  68.   hostInput =
  69.       (float *)wbImport(wbArg_getInputFile(args, 0), &numInputElements);
  70.  
  71.   for (int k = 0; k < ceil(numInputElements/2048*65535); k++){
  72.  
  73.   numOutputElements = numInputElements / (BLOCK_SIZE << 1);
  74.   if (numInputElements % (BLOCK_SIZE << 1)) {
  75.     numOutputElements++;
  76.   }
  77.   hostOutput = (float *)malloc(numOutputElements * sizeof(float));
  78.  
  79.   // The number of input elements in the input is numInputElements
  80.   // The number of output elements in the output is numOutputElements
  81.  
  82.  
  83.  
  84.   //@@ Allocate GPU memory
  85.   float *deviceInput, *deviceOutput;
  86.   cudaMalloc((void **)&deviceInput, numInputElements*sizeof(float));
  87.   cudaMalloc((void **)&deviceOutput, numOutputElements*sizeof(float));  
  88.  
  89.  
  90.  
  91.   //@@ Copy input memory to the GPU
  92.   cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float), cudaMemcpyHostToDevice);
  93.  
  94.   //@@ Initialize the grid and block dimensions here
  95.  
  96.   dim3 DimGrid(ceil(numInputElements/(2.0*BLOCK_SIZE)), 1, 1);
  97.   dim3 DimBlock(BLOCK_SIZE, 1, 1);
  98.  
  99.  
  100.   //@@ Launch the GPU Kernel and perform CUDA computation
  101.  
  102.   total<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numInputElements);  
  103.  
  104.   cudaDeviceSynchronize();  
  105.   //@@ Copy the GPU output memory back to the CPU
  106.  
  107.   cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float), cudaMemcpyDeviceToHost);
  108.  
  109.   /********************************************************************
  110.    * Reduce output vector on the host
  111.    * NOTE: One could also perform the reduction of the output vector
  112.    * recursively and support any size input.
  113.    * For simplicity, we do not require that for this lab.
  114.    ********************************************************************/
  115.  
  116.  
  117.   //@@ Free the GPU memory
  118.  
  119.   cudaFree(deviceInput);
  120.   cudaFree(deviceOutput);
  121.  
  122.   numInputElements = numOutputElements;
  123.   memcpy(hostInput, hostOutput, numOutputElements *sizeof(float));
  124.  
  125.   }
  126.  
  127.   for (ii = 1; ii < numOutputElements; ii++) {
  128.     hostOutput[0] += hostOutput[ii];
  129.   }
  130.  
  131.   wbSolution(args, hostOutput, 1);
  132.  
  133.   free(hostInput);
  134.   free(hostOutput);
  135.  
  136.   return 0;
  137. }
  138.  
  139.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement