Advertisement
phystota

sbatch_5000_fused_fp16_optimization

Dec 7th, 2024
34
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 7.80 KB | None | 0 0
  1. #include <cmath>
  2. #include <iostream>
  3. #include <cuda_fp16.h>
  4. #include "gpu-new-forward.h"
  5.  
  6. #define TILE_WIDTH 16
  7. #define BLOCK_SIZE 512
  8.  
  9. // Helper function to convert FP32 to FP16 on GPU
  10. __global__ void convertFP32ToFP16(const float* input, half* output, int size) {
  11.     int idx = blockIdx.x * blockDim.x + threadIdx.x;
  12.     if (idx < size) {
  13.         output[idx] = __float2half(input[idx]);
  14.     }
  15. }
  16.  
  17. // Helper function to convert FP16 to FP32 on GPU
  18. __global__ void convertFP16ToFP32(const half* input, float* output, int size) {
  19.     int idx = blockIdx.x * blockDim.x + threadIdx.x;
  20.     if (idx < size) {
  21.         output[idx] = __half2float(input[idx]);
  22.     }
  23. }
  24.  
  25. __global__ void fused_unroll_matmul_kernel_fp16(const half2 *input, const half2 *mask, half2 *output,
  26.                                                const int Batch, const int Map_out, const int Channel,
  27.                                                const int Height, const int Width, const int K) {
  28.     const int Height_out = Height - K + 1;
  29.     const int Width_out = Width - K + 1;
  30.     const int H_unroll = Channel * K * K;
  31.     const int W_unroll = Height_out * Width_out;
  32.    
  33.     // Using half2 for shared memory to enable vectorized loads
  34.     __shared__ half2 shared_mask[TILE_WIDTH][TILE_WIDTH];
  35.     __shared__ half2 shared_input[TILE_WIDTH][TILE_WIDTH];
  36.    
  37.     const int tx = threadIdx.x;
  38.     const int ty = threadIdx.y;
  39.     const int row = blockIdx.y * TILE_WIDTH + ty;
  40.     const int col = blockIdx.x * TILE_WIDTH + tx;
  41.     const int batch_idx = blockIdx.z;
  42.    
  43.     // Use half2 for accumulation to leverage hardware capabilities
  44.     half2 acc = __float2half2_rn(0.0f);
  45.    
  46.     const int numTiles = (H_unroll / 2 + TILE_WIDTH - 1) / TILE_WIDTH;
  47.    
  48.     for(int tile = 0; tile < numTiles; tile++) {
  49.         if(row < Map_out && (tile * TILE_WIDTH + tx) < (H_unroll/2)) {
  50.             shared_mask[ty][tx] = mask[row * (H_unroll/2) + tile * TILE_WIDTH + tx];
  51.         } else {
  52.             shared_mask[ty][tx] = __float2half2_rn(0.0f);
  53.         }
  54.        
  55.         if(col < W_unroll && (tile * TILE_WIDTH + ty) < (H_unroll/2)) {
  56.             int unfoldedIdx = (tile * TILE_WIDTH + ty) * 2;
  57.             int c = unfoldedIdx / (K * K);
  58.             int pixelOffset = unfoldedIdx % (K * K);
  59.             int kh = pixelOffset / K;
  60.             int kw = pixelOffset % K;
  61.             int h_out = col / Width_out;
  62.             int w_out = col % Width_out;
  63.            
  64.             // Load two consecutive elements using half2
  65.             shared_input[ty][tx] = input[
  66.                 batch_idx * (Channel * Height * Width / 2) +
  67.                 c * (Height * Width / 2) +
  68.                 (h_out + kh) * Width / 2 +
  69.                 (w_out + kw) / 2
  70.             ];
  71.         } else {
  72.             shared_input[ty][tx] = __float2half2_rn(0.0f);
  73.         }
  74.        
  75.         __syncthreads();
  76.        
  77.         if(row < Map_out && col < W_unroll) {
  78.             for(int k = 0; k < TILE_WIDTH; k++) {
  79.                 // Use half2 arithmetic operations
  80.                 acc = __hadd2(acc, __hmul2(shared_mask[ty][k], shared_input[k][tx]));
  81.             }
  82.         }
  83.        
  84.         __syncthreads();
  85.     }
  86.    
  87.     if(row < Map_out && col < W_unroll) {
  88.         int h_out = col / Width_out;
  89.         int w_out = col % Width_out;
  90.         output[
  91.             batch_idx * (Map_out * Height_out * Width_out / 2) +
  92.             row * (Height_out * Width_out / 2) +
  93.             h_out * Width_out / 2 +
  94.             w_out / 2
  95.         ] = acc;
  96.     }
  97. }
  98.  
  99. __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
  100.                                                   const float *host_mask, float **device_output_ptr,
  101.                                                   float **device_input_ptr, float **device_mask_ptr,
  102.                                                   const int Batch, const int Map_out, const int Channel,
  103.                                                   const int Height, const int Width, const int K) {
  104.     const int Height_out = Height - K + 1;
  105.     const int Width_out = Width - K + 1;
  106.    
  107.     // Allocate FP32 memory on device first
  108.     cudaMalloc((void**)device_input_ptr, Batch * Channel * Height * Width * sizeof(float));
  109.     cudaMalloc((void**)device_mask_ptr, Map_out * Channel * K * K * sizeof(float));
  110.     cudaMalloc((void**)device_output_ptr, Batch * Map_out * Height_out * Width_out * sizeof(float));
  111.  
  112.     // Copy FP32 data to device
  113.     cudaMemcpy(*device_input_ptr, host_input, Batch * Channel * Height * Width * sizeof(float), cudaMemcpyHostToDevice);
  114.     cudaMemcpy(*device_mask_ptr, host_mask, Map_out * Channel * K * K * sizeof(float), cudaMemcpyHostToDevice);
  115.  
  116.     // Allocate FP16 memory on device
  117.     half *device_input_fp16, *device_mask_fp16, *device_output_fp16;
  118.     const int input_size = Batch * Channel * Height * Width;
  119.     const int mask_size = Map_out * Channel * K * K;
  120.     const int output_size = Batch * Map_out * Height_out * Width_out;
  121.    
  122.     cudaMalloc((void**)&device_input_fp16, input_size * sizeof(half));
  123.     cudaMalloc((void**)&device_mask_fp16, mask_size * sizeof(half));
  124.     cudaMalloc((void**)&device_output_fp16, output_size * sizeof(half));
  125.  
  126.     // Convert on GPU using helper kernels
  127.     dim3 blockDim(256);
  128.     dim3 gridDim((input_size + 255) / 256);
  129.     convertFP32ToFP16<<<gridDim, blockDim>>>(*device_input_ptr, device_input_fp16, input_size);
  130.    
  131.     gridDim.x = (mask_size + 255) / 256;
  132.     convertFP32ToFP16<<<gridDim, blockDim>>>(*device_mask_ptr, device_mask_fp16, mask_size);
  133.  
  134.     // Store FP16 pointers
  135.     *device_input_ptr = (float*)device_input_fp16;
  136.     *device_mask_ptr = (float*)device_mask_fp16;
  137.     *device_output_ptr = (float*)device_output_fp16;
  138. }
  139.  
  140. __host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
  141.                                            const float *device_mask, const int Batch, const int Map_out,
  142.                                            const int Channel, const int Height, const int Width, const int K) {
  143.     const int Height_out = Height - K + 1;
  144.     const int Width_out = Width - K + 1;
  145.     const int W_unroll = Height_out * Width_out;
  146.    
  147.     dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
  148.     dim3 gridDim(
  149.         (W_unroll + TILE_WIDTH - 1) / TILE_WIDTH,
  150.         (Map_out + TILE_WIDTH - 1) / TILE_WIDTH,
  151.         Batch
  152.     );
  153.    
  154.     // Cast to half2 pointers for vectorized operations
  155.     fused_unroll_matmul_kernel_fp16<<<gridDim, blockDim>>>(
  156.         (half2*)device_input, (half2*)device_mask, (half2*)device_output,
  157.         Batch, Map_out, Channel, Height, Width, K
  158.     );
  159. }
  160.  
  161. __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
  162.                                                   float *device_input, float *device_mask,
  163.                                                   const int Batch, const int Map_out, const int Channel,
  164.                                                   const int Height, const int Width, const int K) {
  165.     const int Height_out = Height - K + 1;
  166.     const int Width_out = Width - K + 1;
  167.     const int output_size = Batch * Map_out * Height_out * Width_out;
  168.  
  169.     // Allocate temporary FP32 buffer on device
  170.     float* device_output_fp32;
  171.     cudaMalloc((void**)&device_output_fp32, output_size * sizeof(float));
  172.  
  173.     // Convert back to FP32 on GPU
  174.     dim3 blockDim(256);
  175.     dim3 gridDim((output_size + 255) / 256);
  176.     convertFP16ToFP32<<<gridDim, blockDim>>>((half*)device_output, device_output_fp32, output_size);
  177.  
  178.     // Copy final FP32 results back to host
  179.     cudaMemcpy(host_output, device_output_fp32, output_size * sizeof(float), cudaMemcpyDeviceToHost);
  180.  
  181.     // Clean up
  182.     cudaFree(device_output_fp32);
  183.     cudaFree((half*)device_output);
  184.     cudaFree((half*)device_input);
  185.     cudaFree((half*)device_mask);
  186. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement