phystota
multiplication_kernel_coarsening
Dec 1st, 2024

#include <cmath>
#include <iostream>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512
#define COARSENING_FACTOR 2

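// Hedged sketch (not in the original paste): a tiny host-side helper that makes the
// coarsening arithmetic explicit. With TILE_WIDTH = 16 and COARSENING_FACTOR = 2,
// each 16x16 thread block produces a 16x32 tile of C, and thread tx owns the two
// adjacent columns baseCol = (bx*16 + tx)*2 and baseCol + 1. The helper name
// coarsened_grid_x is an assumption introduced here for illustration.
static inline int coarsened_grid_x(int numCColumns) {
    // Columns covered per block = TILE_WIDTH * COARSENING_FACTOR.
    return (numCColumns - 1) / (TILE_WIDTH * COARSENING_FACTOR) + 1;
}
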
// Matrix unrolling kernel (unchanged from the uncoarsened version): expands the
// input into a (Channel*K*K) x (Batch*Height_out*Width_out) matrix so that the
// convolution becomes a single matrix multiplication.
__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K) {
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]

    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;

    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) {
        return;
    }

    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = batch_idx * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    // Each thread copies the K*K input values of one (channel, output-pixel) pair
    // into consecutive rows of the unrolled matrix.
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            const size_t h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}

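// Hedged sketch (not part of the original code): a CPU reference that mirrors the
// unrolling kernel's layout, useful for validating the device output on small
// inputs. The name cpu_unroll_reference is hypothetical.
static void cpu_unroll_reference(const float *input, float *output,
                                 int Batch, int Channel, int Height, int Width, int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t W_unroll = (size_t)Height_out * Width_out;
    for (int b = 0; b < Batch; b++)
        for (int c = 0; c < Channel; c++)
            for (int h = 0; h < Height_out; h++)
                for (int w = 0; w < Width_out; w++)
                    for (int p = 0; p < K; p++)
                        for (int q = 0; q < K; q++) {
                            const size_t row = (size_t)c * K * K + p * K + q;
                            const size_t col = (size_t)b * W_unroll + (size_t)h * Width_out + w;
                            output[row * ((size_t)Batch * W_unroll) + col] =
                                input[((size_t)b * Channel + c) * Height * Width
                                      + (size_t)(h + p) * Width + (w + q)];
                        }
}
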
// Coarsened matrix multiplication kernel: each thread computes COARSENING_FACTOR
// adjacent output columns, so a block covers a TILE_WIDTH x (TILE_WIDTH * COARSENING_FACTOR)
// tile of C. The A tile is loaded once per tile iteration and reused for every
// coarsened column; the B tile is reloaded for each one.
__global__ void matrixMultiplySharedCoarsened(const float *A, const float *B, float *C,
                                              int numARows, int numAColumns,
                                              int numBRows, int numBColumns,
                                              int numCRows, int numCColumns)
{
    __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x, by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y;
    int row = by * TILE_WIDTH + ty;
    int baseCol = (bx * TILE_WIDTH + tx) * COARSENING_FACTOR;

    float vals[COARSENING_FACTOR] = {0.0f};

    for (int tileIdx = 0; tileIdx < (numAColumns - 1) / TILE_WIDTH + 1; tileIdx++) {
        // Load the A tile once; it is shared by all COARSENING_FACTOR columns.
        if (row < numARows && tileIdx * TILE_WIDTH + tx < numAColumns) {
            tileA[ty][tx] = A[(size_t)row * numAColumns + tileIdx * TILE_WIDTH + tx];
        } else {
            tileA[ty][tx] = 0;
        }

        for (int c = 0; c < COARSENING_FACTOR; c++) {
            int col = baseCol + c;
            if (tileIdx * TILE_WIDTH + ty < numBRows && col < numBColumns) {
                tileB[ty][tx] = B[(size_t)(tileIdx * TILE_WIDTH + ty) * numBColumns + col];
            } else {
                tileB[ty][tx] = 0;
            }
            // This barrier covers the tileB load (and the tileA load on the first pass).
            __syncthreads();

            if (row < numCRows) {
                for (int k = 0; k < TILE_WIDTH; k++) {
                    vals[c] += tileA[ty][k] * tileB[k][tx];
                }
            }
            // Ensure all threads finish reading before the tiles are overwritten.
            __syncthreads();
        }
    }

    if (row < numCRows) {
        for (int c = 0; c < COARSENING_FACTOR; c++) {
            int col = baseCol + c;
            if (col < numCColumns) {
                // size_t cast avoids 32-bit overflow when numCColumns = Batch * Height_out * Width_out is large.
                C[(size_t)row * numCColumns + col] = vals[c];
            }
        }
    }
}

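// Hedged usage sketch (assumed, not from the original paste): launching the
// coarsened kernel standalone on an M x K_dim by K_dim x N product. Note that the
// grid's x-dimension shrinks by COARSENING_FACTOR because each thread now writes
// that many columns. The helper name launch_coarsened_matmul is hypothetical.
static void launch_coarsened_matmul(const float *dA, const float *dB, float *dC,
                                    int M, int K_dim, int N) {
    dim3 block(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 grid((N - 1) / (TILE_WIDTH * COARSENING_FACTOR) + 1,
              (M - 1) / TILE_WIDTH + 1, 1);
    matrixMultiplySharedCoarsened<<<grid, block>>>(dA, dB, dC, M, K_dim, K_dim, N, M, N);
}
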
// Permute kernel remains the same: reorders the matmul result from
// (Map_out, Batch, image_size) to the (Batch, Map_out, image_size) layout the
// network expects.
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                      int Batch, int image_size) {
    int b = blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            // size_t casts guard against 32-bit overflow for large Batch * image_size.
            output[((size_t)b * Map_out + m) * image_size + x] =
                    input[((size_t)m * Batch + b) * image_size + x];
        }
    }
}

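// Hedged sketch (hypothetical helper, not in the original): the same permutation on
// the host, which makes the layout change explicit and gives a reference to check
// the kernel against on small sizes.
static void cpu_permute_reference(const float *input, float *output,
                                  int Map_out, int Batch, int image_size) {
    for (int m = 0; m < Map_out; m++)
        for (int b = 0; b < Batch; b++)
            for (int x = 0; x < image_size; x++)
                output[((size_t)b * Map_out + m) * image_size + x] =
                    input[((size_t)m * Batch + b) * image_size + x];
}
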
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // Byte counts as size_t to avoid 32-bit overflow for large batches.
    const size_t input_size = (size_t)Batch * Channel * Height * Width * sizeof(float);
    const size_t mask_size = (size_t)Map_out * Channel * K * K * sizeof(float);
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    cudaMalloc((void**)device_input_ptr, input_size);
    cudaMalloc((void**)device_mask_ptr, mask_size);
    cudaMalloc((void**)device_output_ptr, output_size);

    cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
    cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
}

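// Hedged sketch: the paste checks no CUDA return codes; a wrapper like the one
// below (an assumption introduced here, not part of the original interface) would
// surface allocation or copy failures early.
#include <cstdlib>  // for std::exit
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(err_) << std::endl;       \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)
// Example usage: CUDA_CHECK(cudaMalloc((void**)device_input_ptr, input_size));
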
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    // Allocate temporary storage for the unrolled input and the matmul result
    float *unrolled_matrix;
    float *matmul_output;
    cudaMalloc((void**)&unrolled_matrix, (size_t)Batch * Channel * K * K * Height_out * Width_out * sizeof(float));
    cudaMalloc((void**)&matmul_output, (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float));

    // Matrix unrolling
    dim3 unrollBlockDim(4, 256, 1);
    dim3 unrollGridDim(
        (Channel + unrollBlockDim.x - 1) / unrollBlockDim.x,
        (Height_out * Width_out + unrollBlockDim.y - 1) / unrollBlockDim.y,
        (Batch + unrollBlockDim.z - 1) / unrollBlockDim.z
    );

    matrix_unrolling_kernel<<<unrollGridDim, unrollBlockDim>>>(
        device_input, unrolled_matrix, Batch, Channel, Height, Width, K
    );

    // Coarsened matrix multiplication: the grid's x-dimension shrinks by
    // COARSENING_FACTOR since each thread produces that many output columns.
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 dimGrid(
        (Width_unrolled - 1) / (TILE_WIDTH * COARSENING_FACTOR) + 1,
        (Map_out - 1) / TILE_WIDTH + 1,
        1
    );

    matrixMultiplySharedCoarsened<<<dimGrid, dimBlock>>>(
        device_mask, unrolled_matrix, matmul_output,
        Map_out, Height_unrolled,
        Height_unrolled, Width_unrolled,
        Map_out, Width_unrolled
    );

    // Permute the matmul result from (Map_out, Batch, HW) to (Batch, Map_out, HW)
    const int out_image_size = Height_out * Width_out;
    dim3 permuteGridDim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
    matrix_permute_kernel<<<permuteGridDim, BLOCK_SIZE>>>(
        matmul_output, device_output, Map_out, Batch, out_image_size
    );

    // Clean up temporary storage
    cudaFree(matmul_output);
    cudaFree(unrolled_matrix);
}

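// Hedged note (an addition, not in the original): kernel launches are asynchronous,
// so failures in the three kernels above only surface at the next synchronizing
// call. A debug build might add, after the permute launch:
//
//     cudaDeviceSynchronize();
//     cudaError_t err = cudaGetLastError();
//     if (err != cudaSuccess)
//         std::cout << "CUDA kernel error: " << cudaGetErrorString(err) << std::endl;
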
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    // size_t byte count, matching the allocation in the prolog.
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}