Advertisement
phystota

Sparsing_skeleton

Nov 13th, 2024
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.07 KB | None | 0 0
  1. #include <wb.h>
  2.  
  // Evaluates a CUDA runtime call exactly once. On failure it logs the text of
  // the failing statement and the CUDA error string through wbLog, then executes
  // `return -1` in the enclosing function, so it may only be used inside
  // functions whose return type accepts -1 (e.g. main).
  //
  // The argument is parenthesized and the temporary uses a macro-private name so
  // that expressions containing commas/operators, or a caller variable that
  // happens to be called `err`, are evaluated as written.
  #define wbCheck(stmt)                                                        \
    do {                                                                       \
      cudaError_t wbCheckErr_ = (stmt);                                        \
      if (wbCheckErr_ != cudaSuccess) {                                        \
        wbLog(ERROR, "Failed to run stmt ", #stmt);                            \
        wbLog(ERROR, "Got CUDA error ...  ",                                   \
              cudaGetErrorString(wbCheckErr_));                                \
        return -1;                                                             \
      }                                                                        \
    } while (0)
  12.  
  // Sparse matrix-vector product for a matrix stored in JDS (jagged diagonal
  // storage) form: each thread accumulates the dot product of one stored row
  // with `vec` and writes it to the row's original position in `out`.
  //
  // Parameters (all pointers must be device pointers):
  //   out         - result vector; element matRowPerm[r] receives stored row r.
  //   matColStart - offset of the first element of each jagged diagonal.
  //   matCols     - column index of every stored non-zero.
  //   matRowPerm  - original row index of each stored row.
  //   matRows     - number of non-zeros in each stored row.
  //   matData     - value of every stored non-zero.
  //   vec         - dense input vector.
  //   dim         - number of rows.
  //
  // NOTE(review): the indexing assumes the layout produced by the CSRToJDS
  // helper used in main: rows sorted by non-increasing non-zero count, and
  // diagonal `sec` stored contiguously starting at matColStart[sec] so that
  // stored row r lives at matColStart[sec] + r. CSRToJDS is not visible in this
  // file - confirm against its implementation.
  //
  // Launch: 1-D grid of 1-D blocks. The grid-stride loop makes any grid size
  // correct, including grids that do not cover `dim` in a single pass.
  __global__ void spmvJDSKernel(float *out, int *matColStart, int *matCols,
                                int *matRowPerm, int *matRows,
                                float *matData, float *vec, int dim) {
    const int stride = gridDim.x * blockDim.x;
    for (int row = blockIdx.x * blockDim.x + threadIdx.x; row < dim;
         row += stride) {
      const int rowNNZ = matRows[row];
      float dot = 0.0f;
      // Walk the jagged diagonals; neighbouring threads read neighbouring
      // elements of each diagonal.
      for (int sec = 0; sec < rowNNZ; ++sec) {
        const int idx = matColStart[sec] + row;
        dot += matData[idx] * vec[matCols[idx]];
      }
      // Undo the row permutation so the output is in original row order.
      out[matRowPerm[row]] = dot;
    }
  }
  18.  
  // Host-side launcher for spmvJDSKernel. All pointer arguments are forwarded
  // unchanged to the kernel and must therefore be device pointers; `dim` is the
  // number of matrix rows. The launch is asynchronous: the caller must
  // synchronize before reading `out`.
  static void spmvJDS(float *out, int *matColStart, int *matCols,
                      int *matRowPerm, int *matRows, float *matData,
                      float *vec, int dim) {
    // Nothing to compute, and a grid with zero blocks is an invalid launch.
    if (dim <= 0) {
      return;
    }

    const int threadsPerBlock = 256;
    // Ceil-division written so it cannot overflow for large `dim`.
    const int numBlocks = (dim - 1) / threadsPerBlock + 1;

    spmvJDSKernel<<<numBlocks, threadsPerBlock>>>(out, matColStart, matCols,
                                                  matRowPerm, matRows, matData,
                                                  vec, dim);

    // Kernel launches do not return a status. Peek (rather than get) so the
    // error is reported here but remains visible to a caller that checks it.
    const cudaError_t launchErr = cudaPeekAtLastError();
    if (launchErr != cudaSuccess) {
      wbLog(ERROR, "Failed to launch spmvJDSKernel: ",
            cudaGetErrorString(launchErr));
    }
  }
  25.  
  // Driver: imports a CSR matrix (column indices, row array, values) and a dense
  // vector from the four input files, converts the matrix to JDS with CSRToJDS,
  // runs the SpMV on the GPU through spmvJDS, and hands the result to wbSolution.
  //
  // Returns 0 on success and -1 if a host allocation or any CUDA call fails
  // (CUDA failures are logged by wbCheck before returning).
  int main(int argc, char **argv) {
    wbArg_t args;
    int *hostCSRCols;
    int *hostCSRRows;
    float *hostCSRData;
    int *hostJDSColStart;
    int *hostJDSCols;
    int *hostJDSRowPerm;
    int *hostJDSRows;
    float *hostJDSData;
    float *hostVector;
    float *hostOutput;
    int *deviceJDSColStart;
    int *deviceJDSCols;
    int *deviceJDSRowPerm;
    int *deviceJDSRows;
    float *deviceJDSData;
    float *deviceVector;
    float *deviceOutput;
    int dim, ncols, nrows, ndata;
    int maxRowNNZ;

    args = wbArg_read(argc, argv);

    // Import data and create memory on host
    hostCSRCols = (int *)wbImport(wbArg_getInputFile(args, 0), &ncols, "Integer");
    hostCSRRows = (int *)wbImport(wbArg_getInputFile(args, 1), &nrows, "Integer");
    hostCSRData = (float *)wbImport(wbArg_getInputFile(args, 2), &ndata, "Real");
    hostVector = (float *)wbImport(wbArg_getInputFile(args, 3), &dim, "Real");

    hostOutput = (float *)malloc(sizeof(float) * dim);
    // malloc(0) may legitimately return NULL, so only treat NULL as a failure
    // when a non-empty buffer was requested.
    if (hostOutput == NULL && dim > 0) {
      wbLog(ERROR, "Failed to allocate host output buffer");
      return -1;
    }

    CSRToJDS(dim, hostCSRRows, hostCSRCols, hostCSRData, &hostJDSRowPerm, &hostJDSRows,
             &hostJDSColStart, &hostJDSCols, &hostJDSData);
    // The first JDS row count is used as the number of jagged diagonals; do not
    // read it when there are no rows at all.
    maxRowNNZ = (dim > 0) ? hostJDSRows[0] : 0;

    // Allocate GPU memory.
    wbCheck(cudaMalloc((void **)&deviceJDSColStart, sizeof(int) * maxRowNNZ));
    wbCheck(cudaMalloc((void **)&deviceJDSCols, sizeof(int) * ndata));
    wbCheck(cudaMalloc((void **)&deviceJDSRowPerm, sizeof(int) * dim));
    wbCheck(cudaMalloc((void **)&deviceJDSRows, sizeof(int) * dim));
    wbCheck(cudaMalloc((void **)&deviceJDSData, sizeof(float) * ndata));

    wbCheck(cudaMalloc((void **)&deviceVector, sizeof(float) * dim));
    wbCheck(cudaMalloc((void **)&deviceOutput, sizeof(float) * dim));

    // Copy input memory to the GPU.
    wbCheck(cudaMemcpy(deviceJDSColStart, hostJDSColStart, sizeof(int) * maxRowNNZ,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceJDSCols, hostJDSCols, sizeof(int) * ndata,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceJDSRowPerm, hostJDSRowPerm, sizeof(int) * dim,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceJDSRows, hostJDSRows, sizeof(int) * dim,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceJDSData, hostJDSData, sizeof(float) * ndata,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceVector, hostVector, sizeof(float) * dim,
                       cudaMemcpyHostToDevice));

    // Perform CUDA computation
    spmvJDS(deviceOutput, deviceJDSColStart, deviceJDSCols, deviceJDSRowPerm, deviceJDSRows,
            deviceJDSData, deviceVector, dim);
    // Launch-configuration errors surface through cudaGetLastError; faults
    // raised while the kernel runs surface at the synchronizing call.
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    // Copy output memory to the CPU
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * dim,
                       cudaMemcpyDeviceToHost));

    // Free GPU Memory
    wbCheck(cudaFree(deviceVector));
    wbCheck(cudaFree(deviceOutput));
    wbCheck(cudaFree(deviceJDSColStart));
    wbCheck(cudaFree(deviceJDSCols));
    wbCheck(cudaFree(deviceJDSRowPerm));
    wbCheck(cudaFree(deviceJDSRows));
    wbCheck(cudaFree(deviceJDSData));

    wbSolution(args, hostOutput, dim);

    free(hostCSRCols);
    free(hostCSRRows);
    free(hostCSRData);
    free(hostVector);
    free(hostOutput);
    free(hostJDSColStart);
    free(hostJDSCols);
    free(hostJDSRowPerm);
    free(hostJDSRows);
    free(hostJDSData);

    return 0;
  }
  118.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement