Advertisement
homer512

cuda inscan numeric check version 2

Aug 10th, 2024
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /*
  2.  * Test code related to
  3.  * https://stackoverflow.com/questions/78855895
  4.  * Compile with nvcc -lcurand -lnpps file.cu
  5.  */
  6.  
  7. #include <cuda_runtime.h>
  8. #include <curand.h>
  9. #include <npp.h>
  10.  
  11. #include <thrust/device_vector.h>
  12. #include <thrust/scan.h>
  13.  
  14. #include <algorithm>
  15. // using std::max
  16. #include <cstdio>
  17. // using std::printf
  18.  
  19.  
  20. int main()
  21. {
  22.     const int vecsize = 1024 * 1024;
  23.     const int iterations = 10000;
  24.     float* rndvec, *normalized, *inscan, *diffs, *hostvars;
  25.     for(float** vec: {&rndvec, &normalized, &inscan, &diffs})
  26.         if(cudaMalloc(vec, vecsize * sizeof(float)))
  27.             return 1;
  28.     /*
  29.      * nppsSum_32f and nppsMinMax_32f store their results in device memory.
  30.      * We use pinned host memory accessible by both to retrieve it without memcpy.
  31.      * But we need a cudaDeviceSynchronize() before accessing the values
  32.      */
  33.     if(cudaHostAlloc(&hostvars, 3 * sizeof(float), 0))
  34.         return 2;
  35.     float* sum = hostvars, *min = hostvars + 1, *max = hostvars + 2;
  36.     curandGenerator_t gen;
  37.     if(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS)
  38.         return 3;
  39.     if(curandSetPseudoRandomGeneratorSeed(gen, 0xbad1dea) != CURAND_STATUS_SUCCESS)
  40.         return 4;
  41.     int sumbufsize, minmaxbufsize;
  42.     if(nppsSumGetBufferSize_32f(vecsize, &sumbufsize))
  43.         return 5;
  44.     if(nppsMinMaxGetBufferSize_32f(vecsize - 1, &minmaxbufsize))
  45.         return 6;
  46.     Npp8u* nppbuf;
  47.     if(cudaMalloc(&nppbuf, std::max({sumbufsize, minmaxbufsize})))
  48.         return 7;
  49.     for(int i = 0; i < iterations; ++i) {
  50.         /* Generate random values range (0..1] */
  51.         if(curandGenerateUniform(gen, rndvec, vecsize) != CURAND_STATUS_SUCCESS)
  52.             return 8;
  53.         /* We want range [0..1) so as a clutch, we subtract the minimum */
  54.         if(nppsMinMax_32f(rndvec, vecsize, min, max, nppbuf))
  55.             return 9;
  56.         if(cudaDeviceSynchronize())
  57.             return 10;
  58.         if(nppsSubC_32f(rndvec, *min, rndvec, vecsize))
  59.             return 11;
  60.         /* Normalize so that their sum is 1.0 */
  61.         if(nppsSum_32f(rndvec, vecsize, sum, nppbuf))
  62.             return 12;
  63.         cudaDeviceSynchronize();
  64.         if(nppsNormalize_32f(rndvec, normalized, vecsize, 0.f /*subtracted*/, *sum /*divided*/))
  65.             return 13;
  66.         /* verify normalization */
  67.         if(nppsSum_32f(normalized, vecsize, sum, nppbuf))
  68.             return 14;
  69.         /* perform inscan, a.k.a. partial_sum */
  70.         thrust::device_ptr<float> normalized_ptr(normalized);
  71.         thrust::device_ptr<float> inscan_ptr(inscan);
  72.         thrust::inclusive_scan(normalized_ptr, normalized_ptr + vecsize, inscan_ptr);
  73. #     if 1
  74.         /*
  75.          * Optional fix: Filter numerical noise. We know the value should never decrease,
  76.          * so we just force that
  77.          */
  78.         thrust::inclusive_scan(inscan_ptr, inscan_ptr + vecsize, inscan_ptr, thrust::maximum<float>{});
  79. #     endif
  80.         /* compute pairwise differences of adjacent values */
  81.         if(nppsSub_32f(inscan, inscan + 1, diffs, vecsize - 1))
  82.             return 15;
  83.         if(nppsMinMax_32f(diffs, vecsize - 1, min, max, nppbuf))
  84.             return 16;
  85.         cudaDeviceSynchronize();
  86.         std::printf("mindiff %g maxdiff %g sum %g\n", *min, *max, *sum);
  87.         if(*min < 0.f) /* abort of the result ever gets negative */
  88.             return 17;
  89.     }
  90.     std::puts("No error detected");
  91. }
  92.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement