// cuda inscan numeric check version 2

/*
 * Test code related to
 * https://stackoverflow.com/questions/78855895
 * Compile with nvcc -lcurand -lnpps file.cu
 */

#include <cuda_runtime.h>
#include <curand.h>
#include <npp.h>

#include <thrust/device_vector.h>
#include <thrust/scan.h>

#include <algorithm>
// using std::max
#include <cstdio>
// using std::printf


/*
 * Numerical stress test for an inclusive scan (partial sum) on the GPU.
 *
 * Each iteration:
 *   1. fills a vector with uniform random floats,
 *   2. shifts it by its minimum and divides it by its sum,
 *   3. computes the inclusive scan of the normalized vector (optionally
 *      followed by a running maximum to filter numerical noise),
 *   4. computes the differences of adjacent scan values and prints their
 *      minimum and maximum together with the sum of the normalized vector.
 *
 * Exit status: 0 if no negative difference was seen in any iteration.
 * 1..16, 18 and 19 identify the setup step or library call that failed;
 * 17 means a negative difference (a decreasing scan) was detected.
 */
int main()
{
    const int vecsize = 1024 * 1024;
    const int iterations = 10000;
    float* rndvec, *normalized, *inscan, *diffs, *hostvars;
    for(float** vec: {&rndvec, &normalized, &inscan, &diffs})
        if(cudaMalloc(vec, vecsize * sizeof(float)))
            return 1;
    /*
     * nppsSum_32f and nppsMinMax_32f store their results in device memory.
     * We use pinned host memory accessible by both to retrieve it without memcpy.
     * But we need a cudaDeviceSynchronize() before accessing the values
     */
    if(cudaHostAlloc(&hostvars, 3 * sizeof(float), 0))
        return 2;
    float* sum = hostvars, *min = hostvars + 1, *max = hostvars + 2;
    curandGenerator_t gen;
    if(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS)
        return 3;
    /* Fixed seed so that every run sees the same random sequence */
    if(curandSetPseudoRandomGeneratorSeed(gen, 0xbad1dea) != CURAND_STATUS_SUCCESS)
        return 4;
    int sumbufsize, minmaxbufsize;
    if(nppsSumGetBufferSize_32f(vecsize, &sumbufsize))
        return 5;
    /*
     * nppsMinMax_32f is called below with both vecsize and vecsize - 1
     * elements using the same scratch buffer, so query the size for the
     * longer signal (assumed to be sufficient for the shorter one as well).
     */
    if(nppsMinMaxGetBufferSize_32f(vecsize, &minmaxbufsize))
        return 6;
    /* One scratch buffer shared by the sum and min/max reductions */
    Npp8u* nppbuf;
    if(cudaMalloc(&nppbuf, std::max(sumbufsize, minmaxbufsize)))
        return 7;
    for(int i = 0; i < iterations; ++i) {
        /* Generate random values range (0..1] */
        if(curandGenerateUniform(gen, rndvec, vecsize) != CURAND_STATUS_SUCCESS)
            return 8;
        /* We want range [0..1) so as a crutch, we subtract the minimum */
        if(nppsMinMax_32f(rndvec, vecsize, min, max, nppbuf))
            return 9;
        /* *min is read on the host below, so the reduction must have finished */
        if(cudaDeviceSynchronize())
            return 10;
        if(nppsSubC_32f(rndvec, *min, rndvec, vecsize))
            return 11;
        /* Normalize so that their sum is 1.0 */
        if(nppsSum_32f(rndvec, vecsize, sum, nppbuf))
            return 12;
        /* *sum is read on the host below; a failed sync would leave it stale */
        if(cudaDeviceSynchronize())
            return 18;
        if(nppsNormalize_32f(rndvec, normalized, vecsize, 0.f /*subtracted*/, *sum /*divided*/))
            return 13;
        /* verify normalization; the result is printed after the final sync */
        if(nppsSum_32f(normalized, vecsize, sum, nppbuf))
            return 14;
        /* perform inscan, a.k.a. partial_sum */
        thrust::device_ptr<float> normalized_ptr(normalized);
        thrust::device_ptr<float> inscan_ptr(inscan);
        thrust::inclusive_scan(normalized_ptr, normalized_ptr + vecsize, inscan_ptr);
#     if 1
        /*
         * Optional fix: Filter numerical noise. We know the value should never decrease,
         * so we just force that with an in-place running maximum
         */
        thrust::inclusive_scan(inscan_ptr, inscan_ptr + vecsize, inscan_ptr, thrust::maximum<float>{});
#     endif
        /* compute pairwise differences of adjacent values */
        if(nppsSub_32f(inscan, inscan + 1, diffs, vecsize - 1))
            return 15;
        if(nppsMinMax_32f(diffs, vecsize - 1, min, max, nppbuf))
            return 16;
        /* *min, *max and *sum are read on the host below */
        if(cudaDeviceSynchronize())
            return 19;
        std::printf("mindiff %g maxdiff %g sum %g\n", *min, *max, *sum);
        if(*min < 0.f) /* abort if the result ever gets negative */
            return 17;
    }
    std::puts("No error detected");
    /*
     * Release everything acquired above. Return values are ignored because
     * the test itself has already passed; the early error returns rely on
     * process teardown instead.
     */
    curandDestroyGenerator(gen);
    cudaFree(nppbuf);
    cudaFreeHost(hostvars);
    for(float* vec: {rndvec, normalized, inscan, diffs})
        cudaFree(vec);
    return 0;
}