phystota

sparse_final

Nov 14th, 2024
#include <wb.h>

#define wbCheck(stmt)                                                     \
  do {                                                                    \
    cudaError_t err = stmt;                                               \
    if (err != cudaSuccess) {                                             \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                         \
      wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));      \
      return -1;                                                          \
    }                                                                     \
  } while (0)
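// Note (added): wbCheck is defined above but never used in this paste. A minimal
// usage sketch, wrapping a CUDA runtime call so failures are logged and abort main:
//   wbCheck(cudaMalloc((void **)&deviceVector, sizeof(float) * dim));
//   wbCheck(cudaMemcpy(deviceVector, hostVector, sizeof(float) * dim, cudaMemcpyHostToDevice));
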
__global__ void spmvJDSKernel(float *out, int *matColStart, int *matCols,
                              int *matRowPerm, int *matRows,
                              float *matData, float *vec, int dim) {
  //@@ invoke spmv kernel for jds format
  // One thread per (permuted) row.
  unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < dim) {
    float sum = 0.0f;
    for (int i = 0; i < matRows[row]; i++) {
      unsigned int col = matCols[matColStart[i] + row];
      float val = matData[matColStart[i] + row]; // must be float; an integer type would truncate the value
      sum += val * vec[col];
    }
    // No __syncthreads() needed here: each thread accumulates and writes only
    // its own output element, so no data is shared between threads.
    out[matRowPerm[row]] = sum;
  }
}

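// Added sketch (not part of the original paste): a host-side reference SpMV over
// the same JDS arrays, mirroring the kernel's indexing. The name spmvJDSHost is
// hypothetical; it can be run on the host arrays produced by CSRToJDS to
// spot-check the GPU result.
static void spmvJDSHost(float *out, int *matColStart, int *matCols,
                        int *matRowPerm, int *matRows, float *matData,
                        float *vec, int dim) {
  for (int row = 0; row < dim; row++) {
    float sum = 0.0f;
    // matRows[row] nonzeros in sorted row `row`; element i of that row lives
    // at offset matColStart[i] + row in matData / matCols.
    for (int i = 0; i < matRows[row]; i++) {
      sum += matData[matColStart[i] + row] * vec[matCols[matColStart[i] + row]];
    }
    out[matRowPerm[row]] = sum; // scatter the result back to the original row order
  }
}
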
static void spmvJDS(float *out, int *matColStart, int *matCols,
                    int *matRowPerm, int *matRows, float *matData,
                    float *vec, int dim) {
  //@@ invoke spmv kernel for jds format
  // 512 threads per block; round the grid size up so every row gets a thread.
  unsigned int blockDim = 512;
  unsigned int gridDim = ceil(1.0 * dim / blockDim);

  spmvJDSKernel<<<gridDim, blockDim>>>(out, matColStart, matCols, matRowPerm,
                                       matRows, matData, vec, dim);
}

int main(int argc, char **argv) {
  wbArg_t args;
  int *hostCSRCols;
  int *hostCSRRows;
  float *hostCSRData;
  int *hostJDSColStart;
  int *hostJDSCols;
  int *hostJDSRowPerm;
  int *hostJDSRows;
  float *hostJDSData;
  float *hostVector;
  float *hostOutput;
  int *deviceJDSColStart;
  int *deviceJDSCols;
  int *deviceJDSRowPerm;
  int *deviceJDSRows;
  float *deviceJDSData;
  float *deviceVector;
  float *deviceOutput;
  int dim, ncols, nrows, ndata;
  int maxRowNNZ;

  args = wbArg_read(argc, argv);

  // Import data and create memory on host
  wbTime_start(Generic, "Importing data and creating memory on host");
  hostCSRCols = (int *)wbImport(wbArg_getInputFile(args, 0), &ncols, "Integer");
  hostCSRRows = (int *)wbImport(wbArg_getInputFile(args, 1), &nrows, "Integer");
  hostCSRData = (float *)wbImport(wbArg_getInputFile(args, 2), &ndata, "Real");
  hostVector = (float *)wbImport(wbArg_getInputFile(args, 3), &dim, "Real");

  hostOutput = (float *)malloc(sizeof(float) * dim);

  CSRToJDS(dim, hostCSRRows, hostCSRCols, hostCSRData, &hostJDSRowPerm, &hostJDSRows,
           &hostJDSColStart, &hostJDSCols, &hostJDSData); // converting from CSR to JDS
  maxRowNNZ = hostJDSRows[0];
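
  // Added note (inferred from the kernel indexing and the line above): CSRToJDS
  // sorts rows by decreasing nonzero count, so hostJDSRows[0] is the longest row
  // and maxRowNNZ is the number of "jagged diagonals". hostJDSColStart therefore
  // needs maxRowNNZ entries, hostJDSCols / hostJDSData hold all ndata nonzeros,
  // and hostJDSRowPerm maps each sorted row back to its original row index.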
  // Allocate GPU memory.
  cudaMalloc((void **)&deviceJDSColStart, sizeof(int) * maxRowNNZ);
  cudaMalloc((void **)&deviceJDSCols, sizeof(int) * ndata);
  cudaMalloc((void **)&deviceJDSRowPerm, sizeof(int) * dim);
  cudaMalloc((void **)&deviceJDSRows, sizeof(int) * dim);
  cudaMalloc((void **)&deviceJDSData, sizeof(float) * ndata);

  cudaMalloc((void **)&deviceVector, sizeof(float) * dim);
  cudaMalloc((void **)&deviceOutput, sizeof(float) * dim);

  // Copy input memory to the GPU.
  cudaMemcpy(deviceJDSColStart, hostJDSColStart, sizeof(int) * maxRowNNZ,
             cudaMemcpyHostToDevice);
  cudaMemcpy(deviceJDSCols, hostJDSCols, sizeof(int) * ndata, cudaMemcpyHostToDevice);
  cudaMemcpy(deviceJDSRowPerm, hostJDSRowPerm, sizeof(int) * dim, cudaMemcpyHostToDevice);
  cudaMemcpy(deviceJDSRows, hostJDSRows, sizeof(int) * dim, cudaMemcpyHostToDevice);
  cudaMemcpy(deviceJDSData, hostJDSData, sizeof(float) * ndata, cudaMemcpyHostToDevice);
  cudaMemcpy(deviceVector, hostVector, sizeof(float) * dim, cudaMemcpyHostToDevice);

  // Perform CUDA computation
  spmvJDS(deviceOutput, deviceJDSColStart, deviceJDSCols, deviceJDSRowPerm, deviceJDSRows,
          deviceJDSData, deviceVector, dim);
  cudaDeviceSynchronize();

  // Copy output memory to the CPU
  cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * dim, cudaMemcpyDeviceToHost);

  // Free GPU Memory
  cudaFree(deviceVector);
  cudaFree(deviceOutput);
  cudaFree(deviceJDSColStart);
  cudaFree(deviceJDSCols);
  cudaFree(deviceJDSRowPerm);
  cudaFree(deviceJDSRows);
  cudaFree(deviceJDSData);

  wbSolution(args, hostOutput, dim);

  free(hostCSRCols);
  free(hostCSRRows);
  free(hostCSRData);
  free(hostVector);
  free(hostOutput);
  free(hostJDSColStart);
  free(hostJDSCols);
  free(hostJDSRowPerm);
  free(hostJDSRows);
  free(hostJDSData);

  return 0;
}