FP_16_optimization
phystota, Nov 19th, 2024 (edited)
#include <cmath>
#include <iostream>
#include <cuda_fp16.h>  // Include for FP16 support
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512

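// Optional error-checking helper (an illustrative addition, not part of the
// original assignment interface): wraps CUDA runtime calls so that allocation,
// copy, or launch failures are reported instead of silently propagating.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(err_) << std::endl;       \
        }                                                                     \
    } while (0)
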
__global__ void convert_float_to_half_kernel(const float *input, __half *output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = __float2half(input[idx]);
    }
}

__global__ void convert_half_to_float_kernel(const __half *input, float *output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = __half2float(input[idx]);
    }
}
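
// Note: convert_half_to_float_kernel is never launched in this file; the matmul
// kernel already accumulates and writes its output in float. It is presumably
// kept as the inverse conversion, e.g. for inspecting intermediate half buffers.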

__global__ void matrix_unrolling_kernel(const __half *input, __half *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K) {
    /*
    Implements the input matrix unrolling kernel.

    Function parameter definitions:
    input - input feature maps (half precision)
    output - unrolled input matrix of shape (Channel * K * K) x (Batch * Height_out * Width_out), half precision
    Batch - batch_size (number of images in x)
    Channel - number of input feature maps
    Height - input height dimension
    Width - input width dimension
    K - kernel height and width (K x K)
    */
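    // Worked example (illustrative numbers only): for Channel = 1, K = 2 and a
    // 3 x 3 input, Height_out = Width_out = 2, so W_unroll = 4 and
    // H_unroll = Channel * K * K = 4. Each image therefore contributes a
    // 4 x 4 block of columns, and the full unrolled matrix is
    // H_unroll x (Batch * W_unroll) = 4 x (4 * Batch), which is the layout
    // indexed by the out_3d macro below.
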
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]

    // Calculate output dimensions
    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;
    const size_t H_unroll = Channel * K * K;
    const size_t W_total_unroll = Batch * W_unroll;

    // Calculate thread indices
    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;        // Channel/map index
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;   // Combined height-width position
    const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;// Batch index

    // Extract height and width positions
    const size_t h_out = hw_pos / Width_out;    // Height position
    const size_t w_out = hw_pos % Width_out;    // Width position

    // Boundary check
    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) {
        return;
    }

    // Calculate position in unrolled matrix
    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = batch_idx * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    // Perform unrolling
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}

// Tiled matrix multiplication kernel. Computes C = AB
// Adapted from the provided FP32 version: A and B are read as __half and staged
// into half-precision shared-memory tiles, while the per-thread accumulator and
// the output C remain in float.
__global__ void matrixMultiplyShared(const __half *A, const __half *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    __shared__ __half tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ __half tileB[TILE_WIDTH][TILE_WIDTH];

    int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;

    int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
    float val = 0.0f;

    for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
        if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
            tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx];
        } else {
            tileA[ty][tx] = __float2half(0.0f);
        }
        if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
            tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
        } else {
            tileB[ty][tx] = __float2half(0.0f);
        }
        __syncthreads();

        if (row < numCRows && col < numCColumns) {
            for (int i = 0; i < TILE_WIDTH; i++) {
                val += __half2float(tileA[ty][i]) * __half2float(tileB[i][tx]);
            }
        }
        __syncthreads();
    }

    if (row < numCRows && col < numCColumns) {
        C[(size_t) row * numCColumns + col] = val;  // size_t index, consistent with the A/B loads above
    }
}
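
// Possible variant (a sketch, not used above): on devices with native half
// arithmetic (compute capability 5.3 or higher), the inner product could
// accumulate in half precision with the fused multiply-add intrinsic from
// <cuda_fp16.h>:
//
//   __half acc = __float2half(0.0f);
//   for (int i = 0; i < TILE_WIDTH; i++) {
//       acc = __hfma(tileA[ty][i], tileB[i][tx], acc);
//   }
//   val = __half2float(acc);
//
// The float accumulator used above trades some of that throughput for less
// rounding error across the long reduction dimension (Channel * K * K).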

// Permutes the matmul result.
// The output feature map after matmul is of shape Map_out x Batch x Height_out x Width_out,
// and we need to permute it into Batch x Map_out x Height_out x Width_out.
// You don't need to modify this kernel.
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                      int Batch, int image_size) {
    int b = blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            output[b * Map_out * image_size + m * image_size + x] =
                    input[m * Batch * image_size + b * image_size + x];
        }
    }
}
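
// Index example (illustrative numbers only): with Batch = 2, Map_out = 3 and
// image_size = S, the value for (m = 1, b = 0, pixel x) is read from
// input[1 * 2 * S + 0 * S + x] = input[2S + x] and written to
// output[0 * 3 * S + 1 * S + x] = output[S + x]; the Batch and Map_out axes
// are swapped while each Height_out x Width_out image stays contiguous.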

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask,
                                                    float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    // Allocate device memory and convert the input and mask to half precision on the device

    // Calculate sizes
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    const int input_size = Batch * Channel * Height * Width;
    const int mask_size = Map_out * Channel * K * K;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Allocate device memory for output (float)
    cudaMalloc((void**)device_output_ptr, output_size * sizeof(float));

    // Allocate device memory for input and mask in float
    float *device_input_float;
    float *device_mask_float;
    cudaMalloc((void**)&device_input_float, input_size * sizeof(float));
    cudaMalloc((void**)&device_mask_float, mask_size * sizeof(float));

    // Copy host input and mask to device input and mask (float)
    cudaMemcpy(device_input_float, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_mask_float, host_mask, mask_size * sizeof(float), cudaMemcpyHostToDevice);

    // Allocate device memory for input and mask in half precision
    __half *device_input_half;
    __half *device_mask_half;
    cudaMalloc((void**)&device_input_half, input_size * sizeof(__half));
    cudaMalloc((void**)&device_mask_half, mask_size * sizeof(__half));

    // Convert input and mask from float to half precision on device
    int threads_per_block = 1024;
    int blocks_per_grid_input = (input_size + threads_per_block - 1) / threads_per_block;
    convert_float_to_half_kernel<<<blocks_per_grid_input, threads_per_block>>>(device_input_float, device_input_half, input_size);

    int blocks_per_grid_mask = (mask_size + threads_per_block - 1) / threads_per_block;
    convert_float_to_half_kernel<<<blocks_per_grid_mask, threads_per_block>>>(device_mask_float, device_mask_half, mask_size);

    // Free the float staging buffers
    cudaFree(device_input_float);
    cudaFree(device_mask_float);

    // Pass back the half precision pointers as float pointers
    *device_input_ptr = (float*)device_input_half;
    *device_mask_ptr = (float*)device_mask_half;
}
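
// Usage sketch for the optional CUDA_CHECK helper defined near the top of this
// file (an illustrative addition, not part of the original code path): the
// allocations, copies, and launches in the prolog could be wrapped like
//
//   CUDA_CHECK(cudaMalloc((void**)&device_input_half, input_size * sizeof(__half)));
//   convert_float_to_half_kernel<<<blocks_per_grid_input, threads_per_block>>>(
//       device_input_float, device_input_half, input_size);
//   CUDA_CHECK(cudaGetLastError());
//
// so that a failed allocation or launch is reported before the forward pass runs.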

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask,
                                             const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    // Reinterpret input and mask pointers as half precision
    const __half *device_input_half = reinterpret_cast<const __half*>(device_input);
    const __half *device_mask_half = reinterpret_cast<const __half*>(device_mask);

    // Allocating temporary storage for unrolling matrix
    __half *unrolled_matrix;  // Pointer to device memory for storing the unrolled matrix
    float *matmul_output;     // Pointer to device memory for storing the result of matrix multiplication
    cudaMalloc((void**)&unrolled_matrix, (size_t) Height_unrolled * Width_unrolled * sizeof(__half));
    cudaMalloc((void**)&matmul_output, (size_t) Map_out * Width_unrolled * sizeof(float));

    // Set the kernel dimensions and call the matrix unrolling kernel.
    dim3 blockDim(4, 256, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,                    // Channel dimension
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,     // Combined Height/Width
        (Batch + blockDim.z - 1) / blockDim.z);                     // Batch dimension

    matrix_unrolling_kernel<<<gridDim, blockDim>>>(device_input_half, unrolled_matrix, Batch, Channel, Height, Width, K);

    // Set the kernel dimensions and call the matmul kernel
    dim3 dimGrid((Width_unrolled - 1)/TILE_WIDTH + 1, (Map_out - 1)/TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    matrixMultiplyShared<<<dimGrid, dimBlock>>>(device_mask_half, unrolled_matrix, matmul_output, Map_out, Height_unrolled, Height_unrolled, Width_unrolled,
                                                Map_out, Width_unrolled);

    // Permute the result of matrix multiplication
    const int out_image_size = Height_out * Width_out;
    dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
    matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(matmul_output, device_output, Map_out, Batch, out_image_size);

    cudaFree(matmul_output);
    cudaFree(unrolled_matrix);
}
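
// Timing sketch (an illustrative addition, not part of the assignment
// interface): CUDA events could bracket the matmul launch above to compare
// this FP16 path against an FP32 baseline, e.g.
//
//   cudaEvent_t start, stop;
//   cudaEventCreate(&start);
//   cudaEventCreate(&stop);
//   cudaEventRecord(start);
//   matrixMultiplyShared<<<dimGrid, dimBlock>>>(/* same arguments as above */);
//   cudaEventRecord(stop);
//   cudaEventSynchronize(stop);
//   float ms = 0.0f;
//   cudaEventElapsedTime(&ms, start, stop);
//   cudaEventDestroy(start);
//   cudaEventDestroy(stop);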

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask,
                                                    const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    // Calculate output size
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Copy the output back to host
    cudaMemcpy(host_output, device_output, output_size * sizeof(float), cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(device_output);
    cudaFree(device_input);  // device_input is __half* cast to float*
    cudaFree(device_mask);   // device_mask is __half* cast to float*
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}