Advertisement
phystota

kernel_fusion_final

Nov 29th, 2024
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 7.42 KB | None | 0 0
  1. #include <cmath>
  2. #include <iostream>
  3. #include "gpu-new-forward.h"
  4.  
  5. #define TILE_WIDTH 16
  6. #define BLOCK_SIZE 256
  7.  
// Fused "unroll + matmul" forward-convolution kernel.
//
// Implements the convolution as a GEMM: for each image b,
//   output[b] = mask_matrix * unrolled_input[b]
// where the mask is viewed as a (Map_out x Channel*K*K) matrix and the input
// is unrolled on the fly (im2col) into a (Channel*K*K x Height_out*Width_out)
// matrix. The unrolled matrix is never materialized in global memory; each
// TILE_WIDTH x TILE_WIDTH tile of it is built directly in shared memory.
//
// Expected launch configuration (see conv_forward_gpu):
//   blockDim = (TILE_WIDTH, TILE_WIDTH, 1)
//   gridDim  = (ceil(W_unroll/TILE_WIDTH), ceil(Map_out/TILE_WIDTH), Batch)
//
// Assumes "valid" convolution, stride 1, no padding: Height_out = Height-K+1.
__global__ void fused_unroll_matmul_kernel(const float *input, const float *mask, float *output,
                                         const int Batch, const int Map_out, const int Channel,
                                         const int Height, const int Width, const int K) {
    // Output spatial dims and the shapes of the implicit GEMM operands.
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int H_unroll = Channel * K * K;        // reduction (shared) dimension
    const int W_unroll = Height_out * Width_out; // one column per output pixel

    // Shared memory for input and mask tiles
    __shared__ float shared_mask[TILE_WIDTH][TILE_WIDTH];
    __shared__ float shared_input[TILE_WIDTH][TILE_WIDTH];

    // row indexes an output feature map, col indexes an output pixel,
    // blockIdx.z selects the image within the batch.
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE_WIDTH + ty;
    const int col = blockIdx.x * TILE_WIDTH + tx;
    const int batch_idx = blockIdx.z;

    // Each thread accumulates one element of the output
    float acc = 0.0f;

    // Number of TILE_WIDTH-wide steps needed to cover the reduction dimension.
    const int numTiles = (H_unroll + TILE_WIDTH - 1) / TILE_WIDTH;

    // Loop over tiles of the reduction dimension
    for(int tile = 0; tile < numTiles; tile++) {
        // Load mask tile - each thread loads one element. Out-of-range
        // entries are zero-filled so the dot product below needs no bounds
        // checks inside the k-loop.
        if(row < Map_out && (tile * TILE_WIDTH + tx) < H_unroll) {
            shared_mask[ty][tx] = mask[row * H_unroll + tile * TILE_WIDTH + tx];
        } else {
            shared_mask[ty][tx] = 0.0f;
        }

        // Load and transform input data directly into shared memory
        // (this is the on-the-fly im2col step).
        if(col < W_unroll && (tile * TILE_WIDTH + ty) < H_unroll) {
            // Decode the unrolled row index into (channel, kernel-row,
            // kernel-col) and the column index into the output pixel.
            int unfoldedIdx = tile * TILE_WIDTH + ty;
            int c = unfoldedIdx / (K * K);
            int pixelOffset = unfoldedIdx % (K * K);
            int kh = pixelOffset / K;
            int kw = pixelOffset % K;
            int h_out = col / Width_out;
            int w_out = col % Width_out;

            // Load from input with transformed indices:
            // input[batch_idx][c][h_out + kh][w_out + kw]
            shared_input[ty][tx] = input[
                batch_idx * (Channel * Height * Width) +
                c * (Height * Width) +
                (h_out + kh) * Width +
                (w_out + kw)
            ];
        } else {
            shared_input[ty][tx] = 0.0f;
        }

        // Barrier outside any divergent branch: the whole tile must be
        // populated before any thread reads it.
        __syncthreads();

        // Compute partial dot product for this tile
        if(row < Map_out && col < W_unroll) {
            for(int k = 0; k < TILE_WIDTH; k++) {
                acc += shared_mask[ty][k] * shared_input[k][tx];
            }
        }

        // Barrier before the next iteration overwrites the shared tiles.
        __syncthreads();
    }

    // Write output with transformed indices to match the desired layout:
    // output[batch_idx][row][h_out][w_out]
    if(row < Map_out && col < W_unroll) {
        int h_out = col / Width_out;
        int w_out = col % Width_out;
        output[
            batch_idx * (Map_out * Height_out * Width_out) +
            row * (Height_out * Width_out) +
            h_out * Width_out +
            w_out
        ] = acc;
    }
}
  89.  
  90. __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
  91.                                                   const float *host_mask, float **device_output_ptr,
  92.                                                   float **device_input_ptr, float **device_mask_ptr,
  93.                                                   const int Batch, const int Map_out, const int Channel,
  94.                                                   const int Height, const int Width, const int K) {
  95.     // Calculate sizes
  96.     const int Height_out = Height - K + 1;
  97.     const int Width_out = Width - K + 1;
  98.    
  99.     const int input_size = Batch * Channel * Height * Width * sizeof(float);
  100.     const int mask_size = Map_out * Channel * K * K * sizeof(float);
  101.     const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);
  102.  
  103.     cudaMalloc((void**)device_input_ptr, input_size);
  104.     cudaMalloc((void**)device_mask_ptr, mask_size);
  105.     cudaMalloc((void**)device_output_ptr, output_size);
  106.  
  107.     cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
  108.     cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
  109. }
  110.  
  111. __host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
  112.                                            const float *device_mask, const int Batch, const int Map_out,
  113.                                            const int Channel, const int Height, const int Width, const int K) {
  114.     const int Height_out = Height - K + 1;
  115.     const int Width_out = Width - K + 1;
  116.     const int W_unroll = Height_out * Width_out;
  117.    
  118.     // Configure kernel launch parameters
  119.     dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
  120.     dim3 gridDim(
  121.         (W_unroll + TILE_WIDTH - 1) / TILE_WIDTH,     // Width dimension
  122.         (Map_out + TILE_WIDTH - 1) / TILE_WIDTH,      // Height dimension
  123.         Batch                                         // Batch dimension
  124.     );
  125.    
  126.     // Launch fused kernel
  127.     fused_unroll_matmul_kernel<<<gridDim, blockDim>>>(
  128.         device_input, device_mask, device_output,
  129.         Batch, Map_out, Channel, Height, Width, K
  130.     );
  131. }
  132.  
  133. __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
  134.                                                   float *device_input, float *device_mask,
  135.                                                   const int Batch, const int Map_out, const int Channel,
  136.                                                   const int Height, const int Width, const int K) {
  137.     const int Height_out = Height - K + 1;
  138.     const int Width_out = Width - K + 1;
  139.     const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);
  140.  
  141.     cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);
  142.    
  143.     cudaFree(device_output);
  144.     cudaFree(device_input);
  145.     cudaFree(device_mask);
  146. }
  147.  
  148. // Host function: Retrieves and prints device properties
  149. __host__ void GPUInterface::get_device_properties()
  150. {
  151.     int deviceCount;
  152.     cudaGetDeviceCount(&deviceCount);
  153.  
  154.     for(int dev = 0; dev < deviceCount; dev++)
  155.     {
  156.         cudaDeviceProp deviceProp;
  157.         cudaGetDeviceProperties(&deviceProp, dev);
  158.  
  159.         std::cout << "Device " << dev << " name: " << deviceProp.name << std::endl;
  160.         std::cout << "Computational capabilities: " << deviceProp.major << "." << deviceProp.minor << std::endl;
  161.         std::cout << "Max Global memory size: " << deviceProp.totalGlobalMem << std::endl;
  162.         std::cout << "Max Constant memory size: " << deviceProp.totalConstMem << std::endl;
  163.         std::cout << "Max Shared memory size per block: " << deviceProp.sharedMemPerBlock << std::endl;
  164.         std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
  165.         std::cout << "Max block dimensions: " << deviceProp.maxThreadsDim[0] << " x, "
  166.                   << deviceProp.maxThreadsDim[1] << " y, " << deviceProp.maxThreadsDim[2]
  167.                   << " z" << std::endl;
  168.         std::cout << "Max grid dimensions: " << deviceProp.maxGridSize[0] << " x, "
  169.                   << deviceProp.maxGridSize[1] << " y, " << deviceProp.maxGridSize[2]
  170.                   << " z" << std::endl;
  171.         std::cout << "Warp Size: " << deviceProp.warpSize << std::endl;
  172.     }
  173. }
  174.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement