phystota

cublas_1

Nov 28th, 2024
#include <cmath>
#include <iostream>
#include <cublas_v2.h>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512

// Error checking macro for cuBLAS
#define CUBLAS_CHECK(call) \
    do { \
        cublasStatus_t status = call; \
        if (status != CUBLAS_STATUS_SUCCESS) { \
            std::cerr << "cuBLAS error at " << __FILE__ << ":" << __LINE__ \
                      << " (status " << status << ")" << std::endl; \
            exit(1); \
        } \
    } while(0)

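// The raw CUDA calls below (cudaMalloc/cudaMemcpy and kernel launches) are
// unchecked. A matching macro one could wrap them with -- a minimal sketch,
// not part of the original paste:
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
                      << ": " << cudaGetErrorString(err) << std::endl; \
            exit(1); \
        } \
    } while(0)
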
// The matrix unrolling (im2col) kernel: it writes the unrolled matrix
// directly in the column-major layout cuBLAS expects, so no extra
// transpose pass is needed before the GEMM.
__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                      const int Batch, const int Channel,
                                      const int Height, const int Width,
                                      const int K) {
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_2d(i1, i0) output[(i0) * (Channel * K * K) + i1]  // column-major indexing

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const int batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    if (c >= Channel || hw_pos >= (Height_out * Width_out) || batch_idx >= Batch) return;

    const int h_out = hw_pos / Width_out;
    const int w_out = hw_pos % Width_out;
    const int w_unroll = batch_idx * Height_out * Width_out + h_out * Width_out + w_out;

    // Write in column-major format for cuBLAS
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            const int h_unroll = c * K * K + p * K + q;
            out_2d(h_unroll, w_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_2d
}
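
// Layout sketch (hypothetical sizes, not from the original paste): with
// Channel = 1, K = 2, Batch = 1 and a 3x3 input, Height_out = Width_out = 2,
// so the unrolled matrix is (Channel*K*K) x (Batch*Height_out*Width_out) = 4x4.
// Each column is one 2x2 input patch, stored contiguously because out_2d()
// indexes column-major:
//
//   input (3x3):      unrolled (4x4, one column per output pixel):
//     a b c             a b d e
//     d e f             b c e f
//     g h i             d e g h
//                       e f h i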

// The permute kernel reshapes the column-major GEMM result into the
// NCHW output tensor.
__global__ void matrix_permute_kernel(const float *input, float *output,
                                    const int Map_out, const int Batch,
                                    const int Height_out, const int Width_out) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int total_elements = Batch * Height_out * Width_out;

    if (tid < total_elements) {
        const int b = tid / (Height_out * Width_out);
        const int h = (tid / Width_out) % Height_out;
        const int w = tid % Width_out;

        // GEMM output is column-major: element (m, tid) lives at input[tid * Map_out + m]
        #define in_2d(i1, i0) input[(i0) * Map_out + i1]
        #define out_4d(n, c, h, w) output[(n) * (Map_out * Height_out * Width_out) + (c) * (Height_out * Width_out) + (h) * Width_out + w]

        for (int m = 0; m < Map_out; m++) {
            out_4d(b, m, h, w) = in_2d(m, tid);
        }

        #undef in_2d
        #undef out_4d
    }
}
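
// Index trace (hypothetical sizes, not from the original paste): with
// Map_out = 2, Batch = 1 and a 2x2 output, the GEMM result is 2x4
// column-major, so element (m, tid) sits at input[tid * 2 + m]. Thread
// tid = 3 (b = 0, h = 1, w = 1) copies input[6] -> output[3] for m = 0 and
// input[7] -> output[7] for m = 1, i.e. one output pixel across all feature maps.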

// Lazily creates and caches a single cuBLAS handle so the (expensive)
// cublasCreate() runs only once per process. Note: this lazy init is not thread-safe.
class CuBLASHandler {
private:
    static cublasHandle_t handle;
    static bool initialized;

public:
    static cublasHandle_t& getHandle() {
        if (!initialized) {
            CUBLAS_CHECK(cublasCreate(&handle));
            initialized = true;
        }
        return handle;
    }

    static void destroy() {
        if (initialized) {
            CUBLAS_CHECK(cublasDestroy(handle));
            initialized = false;
        }
    }
};

cublasHandle_t CuBLASHandler::handle;
bool CuBLASHandler::initialized = false;

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
    const float *host_mask, float **device_output_ptr, float **device_input_ptr,
    float **device_mask_ptr, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // Allocate device buffers (size_t casts guard against int overflow on large tensors)
    cudaMalloc((void**)device_input_ptr, (size_t)Batch * Channel * Height * Width * sizeof(float));
    cudaMalloc((void**)device_mask_ptr, (size_t)Map_out * Channel * K * K * sizeof(float));
    cudaMalloc((void**)device_output_ptr, (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float));

    cudaMemcpy(*device_input_ptr, host_input, (size_t)Batch * Channel * Height * Width * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(*device_mask_ptr, host_mask, (size_t)Map_out * Channel * K * K * sizeof(float), cudaMemcpyHostToDevice);
}

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
    const float *device_mask, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    float *unrolled_matrix, *matmul_output;
    cudaMalloc((void**)&unrolled_matrix, (size_t)Height_unrolled * Width_unrolled * sizeof(float));
    cudaMalloc((void**)&matmul_output, (size_t)Map_out * Width_unrolled * sizeof(float));

    // Unroll the input: 4 x 256 x 1 = 1024 threads per block (the CUDA maximum)
    dim3 blockDim(4, 256, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,
        (Batch + blockDim.z - 1) / blockDim.z
    );

    matrix_unrolling_kernel<<<gridDim, blockDim>>>(
        device_input, unrolled_matrix,
        Batch, Channel, Height, Width, K
    );

    // Matrix multiplication via cuBLAS. The mask is stored row-major as
    // Map_out x (Channel*K*K); column-major cuBLAS therefore sees it as a
    // (Channel*K*K) x Map_out matrix, and CUBLAS_OP_T recovers the intended operand.
    cublasHandle_t handle = CuBLASHandler::getHandle();
    float alpha = 1.0f, beta = 0.0f;

    CUBLAS_CHECK(cublasSgemm(
        handle,
        CUBLAS_OP_T,    // transpose the mask matrix (see above)
        CUBLAS_OP_N,    // unrolled matrix is already column-major
        Map_out,        // M: rows of op(A) and of C
        Width_unrolled, // N: cols of op(B) and of C
        Height_unrolled,// K: cols of op(A) = rows of op(B)
        &alpha,
        device_mask,    // A matrix
        Height_unrolled,// LDA: leading dimension of A
        unrolled_matrix,// B matrix
        Height_unrolled,// LDB: leading dimension of B
        &beta,
        matmul_output,  // C matrix
        Map_out         // LDC: leading dimension of C
    ));
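
    // Dimension check (hypothetical sizes, not from the original paste): with
    // Map_out = 4, Channel = 3, K = 5 and Batch*Height_out*Width_out = 1000,
    // op(A) is 4 x 75, B is 75 x 1000, and C = op(A)*B is 4 x 1000 column-major --
    // exactly the (Map_out x output-pixel) layout the permute kernel expects.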

    // Reshape output
    const int total_threads = Batch * Height_out * Width_out;
    const int num_blocks = (total_threads + BLOCK_SIZE - 1) / BLOCK_SIZE;

    matrix_permute_kernel<<<num_blocks, BLOCK_SIZE>>>(
        matmul_output, device_output,
        Map_out, Batch, Height_out, Width_out
    );
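
    // Launch-error checks one could add here (a sketch using the CUDA_CHECK
    // macro suggested above; not in the original paste):
    // CUDA_CHECK(cudaGetLastError());
    // CUDA_CHECK(cudaDeviceSynchronize());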

    cudaFree(matmul_output);
    cudaFree(unrolled_matrix);
}

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
    float *device_input, float *device_mask, const int Batch, const int Map_out,
    const int Channel, const int Height, const int Width, const int K)
{
    // Calculate output size (size_t avoids narrowing/overflow for large batches)
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy result back to host
    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);

    // Clean up cuBLAS
    CuBLASHandler::destroy();
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout << "Device " << dev << " name: " << deviceProp.name << std::endl;
        std::cout << "Computational capabilities: " << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Max Global memory size: " << deviceProp.totalGlobalMem << std::endl;
        std::cout << "Max Constant memory size: " << deviceProp.totalConstMem << std::endl;
        std::cout << "Max Shared memory size per block: " << deviceProp.sharedMemPerBlock << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Max block dimensions: " << deviceProp.maxThreadsDim[0] << " x, " << deviceProp.maxThreadsDim[1] << " y, " << deviceProp.maxThreadsDim[2] << " z" << std::endl;
        std::cout << "Max grid dimensions: " << deviceProp.maxGridSize[0] << " x, " << deviceProp.maxGridSize[1] << " y, " << deviceProp.maxGridSize[2] << " z" << std::endl;
        std::cout << "Warp Size: " << deviceProp.warpSize << std::endl;
    }
}
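
// Minimal host-side driver sketch (hypothetical sizes and data, not part of
// the original paste), assuming gpu-new-forward.h declares GPUInterface with
// the member functions defined above:
//
//   #include <vector>
//   int main() {
//       const int Batch = 1, Map_out = 4, Channel = 3, Height = 32, Width = 32, K = 5;
//       const int Height_out = Height - K + 1, Width_out = Width - K + 1;
//       std::vector<float> input(Batch * Channel * Height * Width, 1.0f);
//       std::vector<float> mask(Map_out * Channel * K * K, 0.5f);
//       std::vector<float> output(Batch * Map_out * Height_out * Width_out);
//       float *d_out, *d_in, *d_mask;
//       GPUInterface gpu;
//       gpu.conv_forward_gpu_prolog(output.data(), input.data(), mask.data(),
//                                   &d_out, &d_in, &d_mask,
//                                   Batch, Map_out, Channel, Height, Width, K);
//       gpu.conv_forward_gpu(d_out, d_in, d_mask,
//                            Batch, Map_out, Channel, Height, Width, K);
//       gpu.conv_forward_gpu_epilog(output.data(), d_out, d_in, d_mask,
//                                   Batch, Map_out, Channel, Height, Width, K);
//       return 0;
//   }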