Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * Test code related to
- * https://stackoverflow.com/questions/78855895
- * Compile with nvcc -lcurand -lnpps file.cu
- */
- #include <cuda_runtime.h>
- #include <curand.h>
- #include <npp.h>
- #include <thrust/device_vector.h>
- #include <thrust/scan.h>
- #include <algorithm>
- // using std::max
- #include <cstdio>
- // using std::printf
/*
 * Stress test for the monotonicity of a GPU prefix sum.
 *
 * Each iteration draws vecsize uniform random floats with cuRAND, shifts them
 * by their minimum, normalizes them so that they sum to 1, computes the
 * inclusive scan with Thrust and checks that no pair of adjacent scan values
 * decreases.
 *
 * Exit status:
 *   0      all iterations passed
 *   17     a negative difference between adjacent scan values was found
 *   other  identifies the CUDA / cuRAND / NPP call that failed (see the
 *          individual return statements)
 */
int main()
{
    const int vecsize = 1024 * 1024;
    const int iterations = 10000;
    float* rndvec, *normalized, *inscan, *diffs, *hostvars;
    for(float** vec: {&rndvec, &normalized, &inscan, &diffs})
        if(cudaMalloc(vec, vecsize * sizeof(float)))
            return 1;
    /*
     * nppsSum_32f and nppsMinMax_32f store their results through a pointer
     * that the device writes to. We hand them pinned host memory so the
     * results can be read without an explicit memcpy (assumes the device can
     * address pinned host allocations directly, i.e. unified addressing).
     * A cudaDeviceSynchronize() is required before the host reads the values.
     */
    if(cudaHostAlloc(&hostvars, 3 * sizeof(float), 0))
        return 2;
    float* sum = hostvars, *min = hostvars + 1, *max = hostvars + 2;
    curandGenerator_t gen;
    if(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS)
        return 3;
    /* Fixed seed so that every run sees the same sequence */
    if(curandSetPseudoRandomGeneratorSeed(gen, 0xbad1dea) != CURAND_STATUS_SUCCESS)
        return 4;
    int sumbufsize, minmaxbufsize;
    if(nppsSumGetBufferSize_32f(vecsize, &sumbufsize))
        return 5;
    /*
     * nppsMinMax_32f is called with both vecsize and vecsize - 1 elements
     * below, so the scratch buffer has to be sized for the larger of the two.
     */
    if(nppsMinMaxGetBufferSize_32f(vecsize, &minmaxbufsize))
        return 6;
    /* One scratch buffer shared by the Sum and MinMax calls */
    Npp8u* nppbuf;
    if(cudaMalloc(&nppbuf, std::max({sumbufsize, minmaxbufsize})))
        return 7;
    for(int i = 0; i < iterations; ++i) {
        /* Generate random values range (0..1] */
        if(curandGenerateUniform(gen, rndvec, vecsize) != CURAND_STATUS_SUCCESS)
            return 8;
        /* We want range [0..1) so as a crutch, we subtract the minimum */
        if(nppsMinMax_32f(rndvec, vecsize, min, max, nppbuf))
            return 9;
        /* *min is read on the host below, so the device must be done first */
        if(cudaDeviceSynchronize())
            return 10;
        if(nppsSubC_32f(rndvec, *min, rndvec, vecsize))
            return 11;
        /* Normalize so that their sum is 1.0 */
        if(nppsSum_32f(rndvec, vecsize, sum, nppbuf))
            return 12;
        /* *sum is read on the host below; do not continue on a failed sync */
        if(cudaDeviceSynchronize())
            return 18;
        if(nppsNormalize_32f(rndvec, normalized, vecsize, 0.f /*subtracted*/, *sum /*divided*/))
            return 13;
        /* verify normalization: *sum now receives the sum of the normalized values */
        if(nppsSum_32f(normalized, vecsize, sum, nppbuf))
            return 14;
        /* perform inscan, a.k.a. partial_sum */
        thrust::device_ptr<float> normalized_ptr(normalized);
        thrust::device_ptr<float> inscan_ptr(inscan);
        thrust::inclusive_scan(normalized_ptr, normalized_ptr + vecsize, inscan_ptr);
#if 1
        /*
         * Optional fix: Filter numerical noise. We know the value should never decrease,
         * so we just force that with a running maximum
         */
        thrust::inclusive_scan(inscan_ptr, inscan_ptr + vecsize, inscan_ptr, thrust::maximum<float>{});
#endif
        /* compute pairwise differences of adjacent values */
        if(nppsSub_32f(inscan, inscan + 1, diffs, vecsize - 1))
            return 15;
        if(nppsMinMax_32f(diffs, vecsize - 1, min, max, nppbuf))
            return 16;
        /* *min, *max and *sum are all read on the host right after this */
        if(cudaDeviceSynchronize())
            return 19;
        std::printf("mindiff %g maxdiff %g sum %g\n", *min, *max, *sum);
        if(*min < 0.f) /* abort if the result ever gets negative */
            return 17;
    }
    std::puts("No error detected");
    /*
     * Best-effort cleanup on the success path. The early-return paths above
     * leave the release of their resources to process teardown.
     */
    curandDestroyGenerator(gen);
    cudaFree(nppbuf);
    for(float* vec: {rndvec, normalized, inscan, diffs})
        cudaFree(vec);
    cudaFreeHost(hostvars);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement