constant memory optimization
phystota | Nov 30th, 2024 | C++
#include <cmath>
#include <cstdio>    // fprintf / stderr used by cudaCheckError below
#include <cstdlib>   // exit / EXIT_FAILURE
#include <iostream>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512

inline void cudaCheckError(cudaError_t error, const char *file, int line) {
    if (error != cudaSuccess) {
        fprintf(stderr, "CUDA error at %s:%d code=%u(%s)\n",
                file, line, static_cast<unsigned int>(error),
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}

// Macro to make error checking easier to use
#define CUDA_CHECK(err) cudaCheckError(err, __FILE__, __LINE__)

// Add constant memory for the mask
// Maximum size calculation: typical maximum values might be
// Map_out=16, Channel=4, K=7 -> 16 * 4 * 7 * 7 = 3136 elements
__constant__ float const_mask[4000];  // Conservative size that should handle common cases
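
// Illustrative sizing sketch: the MAX_* names below are assumptions for this particular network
// (they mirror the Map_out=16, Channel=4, K=7 example above) and are not part of the build.
// Deriving the element count from named caps keeps the bound checked in conv_forward_gpu_prolog
// self-documenting:
//
//   #define MAX_MAP_OUT 16
//   #define MAX_CHANNEL 4
//   #define MAX_K       7
//   __constant__ float const_mask[MAX_MAP_OUT * MAX_CHANNEL * MAX_K * MAX_K];  // 3136 floats
//
// Either way, 4000 floats (16 KB) fits comfortably in the 64 KB constant memory bank that
// get_device_properties() reports via totalConstMem.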

__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K) {

    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + (i0)]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + (i0)]

    // Calculate output dimensions
    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;
    const size_t H_unroll = Channel * K * K;
    const size_t W_total_unroll = Batch * W_unroll;

    // Calculate thread indices
    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    // Extract height and width positions
    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    // Boundary check
    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) {
        return;
    }

    // Calculate position in unrolled matrix
    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = batch_idx * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    // Perform unrolling
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}
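
// Worked example of the layout produced above (illustrative values, assuming Channel = 1, K = 2,
// Height = Width = 3, so Height_out = Width_out = 2 and W_unroll = 4): the thread handling
// batch_idx = 0, c = 0, h_out = 0, w_out = 0 writes column w_total_unroll = 0 as
//   out_3d(0, 0) = in_4d(0, 0, 0, 0)   // p = 0, q = 0
//   out_3d(1, 0) = in_4d(0, 0, 0, 1)   // p = 0, q = 1
//   out_3d(2, 0) = in_4d(0, 0, 1, 0)   // p = 1, q = 0
//   out_3d(3, 0) = in_4d(0, 0, 1, 1)   // p = 1, q = 1
// so each output pixel owns one column of the (Channel*K*K) x (Batch*W_unroll) unrolled matrix.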

__global__ void matrixMultiplyShared(const float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    // Declare shared memory for both matrices
    __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];
    __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];

    int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;
    int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
    float val = 0;

    // Process tiles
    for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
        // Load mask from constant memory into shared memory
        if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
            tileA[ty][tx] = const_mask[row * numAColumns + tileId * TILE_WIDTH + tx];
        } else {
            tileA[ty][tx] = 0;
        }

        // Load tile from matrix B into shared memory
        if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
            tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
        } else {
            tileB[ty][tx] = 0;
        }

        // Make sure all threads have loaded their data
        __syncthreads();

        // Compute partial dot product using shared memory
        if (row < numCRows && col < numCColumns) {
            for (int i = 0; i < TILE_WIDTH; i++) {
                val += tileA[ty][i] * tileB[i][tx];
            }
        }

        // Synchronize before next iteration
        __syncthreads();
    }

    // Write final result to global memory
    if (row < numCRows && col < numCColumns) {
        C[row * numCColumns + col] = val;
    }
}
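
// Note on shapes (using the sizes passed in by conv_forward_gpu): this computes C = A * B where
// A is the Map_out x (Channel*K*K) mask held entirely in const_mask, B is the unrolled input of
// size (Channel*K*K) x (Batch*Height_out*Width_out), and C is Map_out x (Batch*Height_out*Width_out).
// Only B and C touch global memory; the tiles of A are served through the constant cache.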

__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                      int Batch, int image_size) {
    int b = blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            output[b * Map_out * image_size + m * image_size + x] =
                    input[m * Batch * image_size + b * image_size + x];
        }
    }
}
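
// Layout note with illustrative numbers: the matmul result is feature-major,
// [Map_out][Batch][image_size], and this kernel rewrites it batch-major,
// [Batch][Map_out][image_size]. For Map_out = 2, Batch = 2, image_size = 4, the element
// (m = 1, b = 0, x = 2) moves from input index 1*2*4 + 0*4 + 2 = 10 to output index
// 0*2*4 + 1*4 + 2 = 6.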

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    // Calculate sizes (size_t avoids integer overflow for large batches)
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    const size_t input_size = (size_t) Batch * Channel * Height * Width * sizeof(float);
    const size_t mask_size = (size_t) Map_out * Channel * K * K * sizeof(float);
    const size_t output_size = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Verify the mask fits in the constant memory allocation
    if (Map_out * Channel * K * K > 4000) {
        std::cerr << "Error: Mask size exceeds constant memory allocation" << std::endl;
        exit(-1);
    }

    // Copy mask to constant memory instead of global memory
    CUDA_CHECK(cudaMemcpyToSymbol(const_mask, host_mask, mask_size));

    // Allocate memory for input and output
    CUDA_CHECK(cudaMalloc((void**)device_input_ptr, input_size));
    CUDA_CHECK(cudaMalloc((void**)device_output_ptr, output_size));

    // Copy input to device
    CUDA_CHECK(cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice));

    // Set mask pointer to NULL since we're using constant memory
    *device_mask_ptr = NULL;
}
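
// Note: with the offset and kind arguments left at their defaults, cudaMemcpyToSymbol copies
// mask_size bytes from host_mask into const_mask starting at offset 0 (host-to-device). Because
// the mask now lives at a compile-time symbol rather than in a cudaMalloc'd buffer, there is no
// device pointer to hand back, which is why *device_mask_ptr is set to NULL above.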

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    float *unrolled_matrix;
    float *matmul_output;
    CUDA_CHECK(cudaMalloc((void**)&unrolled_matrix, (size_t) Batch * Channel * K * K * Height_out * Width_out * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&matmul_output, (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float)));

    // Unroll the input into a (Channel*K*K) x (Batch*Height_out*Width_out) matrix
    dim3 blockDim(4, 256, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,
        (Batch + blockDim.z - 1) / blockDim.z
    );

    matrix_unrolling_kernel<<<gridDim, blockDim>>>(
        device_input, unrolled_matrix,
        Batch, Channel, Height, Width, K
    );

    // Multiply the constant-memory mask by the unrolled matrix
    dim3 dimGrid((Width_unrolled - 1)/TILE_WIDTH + 1, (Map_out - 1)/TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    matrixMultiplyShared<<<dimGrid, dimBlock>>>(
        unrolled_matrix, matmul_output,
        Map_out, Height_unrolled,
        Height_unrolled, Width_unrolled,
        Map_out, Width_unrolled
    );

    // Permute the matmul result from feature-major to batch-major layout
    const int out_image_size = Height_out * Width_out;
    dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
    matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(
        matmul_output, device_output,
        Map_out, Batch, out_image_size
    );

    CUDA_CHECK(cudaFree(matmul_output));
    CUDA_CHECK(cudaFree(unrolled_matrix));
}
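
// Illustrative debugging aid (a sketch, not enabled above): kernel launches return no status of
// their own, so errors from the three launches could be surfaced by adding, just before the
// cudaFree calls,
//   CUDA_CHECK(cudaGetLastError());        // catches invalid launch configurations
//   CUDA_CHECK(cudaDeviceSynchronize());   // reports asynchronous execution errors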

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t output_size = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy the result back to the host and release device buffers
    CUDA_CHECK(cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(device_output));
    CUDA_CHECK(cudaFree(device_input));
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}
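
// Illustrative end-to-end usage sketch (assumes GPUInterface and these method signatures are
// declared in "gpu-new-forward.h"; the layer sizes and fill values are made up): Batch = 2,
// Map_out = 3, Channel = 1, 8x8 inputs, K = 3, so the output is 2 x 3 x 6 x 6.
//
//   std::vector<float> in(2 * 1 * 8 * 8, 1.0f), mask(3 * 1 * 3 * 3, 0.5f), out(2 * 3 * 6 * 6);
//   float *d_out, *d_in, *d_mask;
//   GPUInterface gpu;
//   gpu.conv_forward_gpu_prolog(out.data(), in.data(), mask.data(), &d_out, &d_in, &d_mask,
//                               2, 3, 1, 8, 8, 3);
//   gpu.conv_forward_gpu(d_out, d_in, d_mask, 2, 3, 1, 8, 8, 3);
//   gpu.conv_forward_gpu_epilog(out.data(), d_out, d_in, d_mask, 2, 3, 1, 8, 8, 3);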