Advertisement
phystota

cublas_batch_optimization

Nov 28th, 2024
49
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 8.19 KB | None | 0 0
  1. #include <cmath>
  2. #include <iostream>
  3. #include <cublas_v2.h>
  4. #include "gpu-new-forward.h"
  5.  
  6. #define TILE_WIDTH 16
  7. #define BLOCK_SIZE 512
  8.  
  9. __global__ void matrix_unrolling_kernel(const float *input, float *output,
  10.                                       const int Batch, const int Channel,
  11.                                       const int Height, const int Width,
  12.                                       const int K) {
  13.     #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
  14.     #define unroll_3d(i2, i1, i0) output[(i2) * (Height_unroll * Width_unroll) + (i1) * Width_unroll + i0]
  15.  
  16.     const int Height_out = Height - K + 1;
  17.     const int Width_out = Width - K + 1;
  18.     const int Height_unroll = Channel * K * K;
  19.     const int Width_unroll = Height_out * Width_out;
  20.  
  21.     const int c = blockIdx.x * blockDim.x + threadIdx.x;
  22.     const int hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
  23.     const int batch_idx = blockIdx.z * blockDim.z + threadIdx.z;
  24.  
  25.     if (c >= Channel || hw_pos >= Height_out * Width_out || batch_idx >= Batch) {
  26.         return;
  27.     }
  28.  
  29.     const int h_out = hw_pos / Width_out;
  30.     const int w_out = hw_pos % Width_out;
  31.     const int w_base = c * K * K;
  32.  
  33.     // Unroll the input into a matrix suitable for batched matrix multiplication
  34.     for (int p = 0; p < K; p++) {
  35.         for (int q = 0; q < K; q++) {
  36.             int h_unroll = w_base + p * K + q;
  37.             unroll_3d(batch_idx, h_unroll, h_out * Width_out + w_out) =
  38.                 in_4d(batch_idx, c, h_out + p, w_out + q);
  39.         }
  40.     }
  41.  
  42.     #undef in_4d
  43.     #undef unroll_3d
  44. }
  45.  
  46. __global__ void matrix_permute_kernel(const float *input, float *output,
  47.                                     const int Map_out, const int Batch,
  48.                                     const int Height_out, const int Width_out) {
  49.     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  50.     const int b = blockIdx.y;
  51.    
  52.     const int image_size = Height_out * Width_out;
  53.    
  54.     if (idx < image_size && b < Batch) {
  55.         for (int m = 0; m < Map_out; m++) {
  56.             output[b * Map_out * image_size + m * image_size + idx] =
  57.                 input[b * Map_out * image_size + m * image_size + idx];
  58.         }
  59.     }
  60. }
  61.  
  62. __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
  63.     const float *host_mask, float **device_output_ptr, float **device_input_ptr,
  64.     float **device_mask_ptr, const int Batch, const int Map_out, const int Channel,
  65.     const int Height, const int Width, const int K)
  66. {
  67.     const int Height_out = Height - K + 1;
  68.     const int Width_out = Width - K + 1;
  69.    
  70.     const int input_size = Batch * Channel * Height * Width * sizeof(float);
  71.     const int mask_size = Map_out * Channel * K * K * sizeof(float);
  72.     const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);
  73.  
  74.     // Allocate device memory
  75.     cudaMalloc((void**)device_input_ptr, input_size);
  76.     cudaMalloc((void**)device_mask_ptr, mask_size);
  77.     cudaMalloc((void**)device_output_ptr, output_size);
  78.  
  79.     // Transfer input data to device
  80.     cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
  81.     cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
  82. }
  83.  
  84. __host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
  85.     const float *device_mask, const int Batch, const int Map_out, const int Channel,
  86.     const int Height, const int Width, const int K)
  87. {
  88.     // Calculate dimensions
  89.     const int Height_out = Height - K + 1;
  90.     const int Width_out = Width - K + 1;
  91.     const int Height_unroll = Channel * K * K;
  92.     const int Width_unroll = Height_out * Width_out;
  93.  
  94.     // Allocate memory for intermediate results
  95.     float *unrolled_input = nullptr;
  96.     float *matmul_output = nullptr;
  97.    
  98.     const size_t unrolled_size = (size_t)Batch * Height_unroll * Width_unroll * sizeof(float);
  99.     const size_t matmul_size = (size_t)Batch * Map_out * Width_unroll * sizeof(float);
  100.    
  101.     cudaMalloc(&unrolled_input, unrolled_size);
  102.     cudaMalloc(&matmul_output, matmul_size);
  103.  
  104.     // Configure and launch unrolling kernel
  105.     dim3 blockDim(8, 16, 4);  // Optimized for better occupancy
  106.     dim3 gridDim(
  107.         (Channel + blockDim.x - 1) / blockDim.x,
  108.         ((Height_out * Width_out) + blockDim.y - 1) / blockDim.y,
  109.         (Batch + blockDim.z - 1) / blockDim.z
  110.     );
  111.  
  112.     matrix_unrolling_kernel<<<gridDim, blockDim>>>(
  113.         device_input, unrolled_input,
  114.         Batch, Channel, Height, Width, K
  115.     );
  116.  
  117.     // Create and configure cuBLAS handle
  118.     cublasHandle_t handle;
  119.     cublasCreate(&handle);
  120.    
  121.     // Set up parameters for strided batched GEMM
  122.     const float alpha = 1.0f;
  123.     const float beta = 0.0f;
  124.    
  125.     // Calculate strides for batched operation
  126.     const long long int strideA = 0;  // Mask is shared across batches
  127.     const long long int strideB = Height_unroll * Width_unroll;  // Stride between input matrices
  128.     const long long int strideC = Map_out * Width_unroll;        // Stride between output matrices
  129.    
  130.     // Perform batched matrix multiplication
  131.     // Note: cuBLAS uses column-major order, so we transpose the operation
  132.     // C[b] = A * B[b], where b is the batch index
  133.     cublasSgemmStridedBatched(handle,
  134.         CUBLAS_OP_N,    // No operation on A (mask)
  135.         CUBLAS_OP_N,    // No operation on B (unrolled input)
  136.         Width_unroll,   // M: number of rows of B and C
  137.         Map_out,        // N: number of columns of A and C
  138.         Height_unroll,  // K: number of columns of B and rows of A
  139.         &alpha,
  140.         unrolled_input, Width_unroll, strideB,  // Matrix B (input)
  141.         device_mask, Height_unroll, strideA,    // Matrix A (mask)
  142.         &beta,
  143.         matmul_output, Width_unroll, strideC,   // Matrix C (output)
  144.         Batch                                    // Number of matrices
  145.     );
  146.  
  147.     // Configure and launch permute kernel
  148.     dim3 permute_block(BLOCK_SIZE);
  149.     dim3 permute_grid((Height_out * Width_out + BLOCK_SIZE - 1) / BLOCK_SIZE, Batch);
  150.    
  151.     matrix_permute_kernel<<<permute_grid, permute_block>>>(
  152.         matmul_output, device_output,
  153.         Map_out, Batch, Height_out, Width_out
  154.     );
  155.  
  156.     // Cleanup
  157.     cublasDestroy(handle);
  158.     cudaFree(unrolled_input);
  159.     cudaFree(matmul_output);
  160. }
  161.  
  162. __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
  163.     float *device_input, float *device_mask, const int Batch, const int Map_out,
  164.     const int Channel, const int Height, const int Width, const int K)
  165. {
  166.     const int Height_out = Height - K + 1;
  167.     const int Width_out = Width - K + 1;
  168.     const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);
  169.  
  170.     // Copy result back to host
  171.     cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);
  172.  
  173.     // Free device memory
  174.     cudaFree(device_output);
  175.     cudaFree(device_input);
  176.     cudaFree(device_mask);
  177. }
  178.  
  179. __host__ void GPUInterface::get_device_properties()
  180. {
  181.     int deviceCount;
  182.     cudaGetDeviceCount(&deviceCount);
  183.  
  184.     for(int dev = 0; dev < deviceCount; dev++)
  185.     {
  186.         cudaDeviceProp deviceProp;
  187.         cudaGetDeviceProperties(&deviceProp, dev);
  188.  
  189.         std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
  190.         std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
  191.         std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
  192.         std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
  193.         std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
  194.         std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
  195.         std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
  196.         std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
  197.         std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
  198.     }
  199. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement