Advertisement
phystota

__restrict__final

Nov 29th, 2024
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 9.45 KB | None | 0 0
  1. #include <cmath>
  2. #include <iostream>
  3. #include "gpu-new-forward.h"
  4.  
  5. #define TILE_WIDTH 16
  6. #define BLOCK_SIZE 512
  7.  
  8. __global__ void matrix_unrolling_kernel(const float * __restrict__ input,
  9.                                       float * __restrict__ output,
  10.                                       const int Batch, const int Channel,
  11.                                       const int Height, const int Width,
  12.                                       const int K) {
  13.  
  14.     #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
  15.     #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]
  16.  
  17.     // Calculate output dimensions
  18.     const size_t Height_out = Height - K + 1;
  19.     const size_t Width_out = Width - K + 1;
  20.     const size_t W_unroll = Height_out * Width_out;
  21.     const size_t H_unroll = Channel * K * K;
  22.     const size_t W_total_unroll = Batch * W_unroll;
  23.  
  24.     // Calculate thread indices
  25.     const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
  26.     const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
  27.     const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;
  28.  
  29.     // Extract height and width positions
  30.     const size_t h_out = hw_pos / Width_out;
  31.     const size_t w_out = hw_pos % Width_out;
  32.  
  33.     // Boundary check
  34.     if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) {
  35.         return;
  36.     }
  37.  
  38.     // Calculate position in unrolled matrix
  39.     const size_t w_unroll = h_out * Width_out + w_out;
  40.     const size_t w_total_unroll = batch_idx * W_unroll + w_unroll;
  41.     const size_t w_base = c * K * K;
  42.  
  43.     // Perform unrolling
  44.     for (int p = 0; p < K; p++) {
  45.         for (int q = 0; q < K; q++) {
  46.             int h_unroll = w_base + p * K + q;
  47.             out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
  48.         }
  49.     }
  50.  
  51.     #undef in_4d
  52.     #undef out_3d
  53. }
  54.  
  55. __global__ void matrixMultiplyShared(const float * __restrict__ A,
  56.                                    const float * __restrict__ B,
  57.                                    float * __restrict__ C,
  58.                                    int numARows, int numAColumns,
  59.                                    int numBRows, int numBColumns,
  60.                                    int numCRows, int numCColumns)
  61. {
  62.     __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
  63.     __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];
  64.  
  65.     int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;
  66.     int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
  67.     float val = 0;
  68.  
  69.     for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
  70.         if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
  71.             tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx];
  72.         } else {
  73.             tileA[ty][tx] = 0;
  74.         }
  75.         if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
  76.             tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
  77.         } else {
  78.             tileB[ty][tx] = 0;
  79.         }
  80.         __syncthreads();
  81.  
  82.         if (row < numCRows && col < numCColumns) {
  83.             for (int i = 0; i < TILE_WIDTH; i++) {
  84.                 val += tileA[ty][i] * tileB[i][tx];
  85.             }
  86.         }
  87.         __syncthreads();
  88.     }
  89.  
  90.     if (row < numCRows && col < numCColumns) {
  91.         C[row * numCColumns + col] = val;
  92.     }
  93. }
  94.  
  95. __global__ void matrix_permute_kernel(const float * __restrict__ input,
  96.                                     float * __restrict__ output,
  97.                                     int Map_out, int Batch, int image_size) {
  98.     int b = blockIdx.y;
  99.     int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  100.     if (x < image_size) {
  101.         for (int m = 0; m < Map_out; m++) {
  102.             output[b * Map_out * image_size + m * image_size + x] =
  103.                     input[m * Batch * image_size + b * image_size + x];
  104.         }
  105.     }
  106. }
  107.  
  108. __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output,
  109.                                                   const float *host_input,
  110.                                                   const float *host_mask,
  111.                                                   float **device_output_ptr,
  112.                                                   float **device_input_ptr,
  113.                                                   float **device_mask_ptr,
  114.                                                   const int Batch, const int Map_out,
  115.                                                   const int Channel, const int Height,
  116.                                                   const int Width, const int K)
  117. {
  118.     // Calculate sizes
  119.     const int Height_out = Height - K + 1;
  120.     const int Width_out = Width - K + 1;
  121.    
  122.     const int input_size = Batch * Channel * Height * Width * sizeof(float);
  123.     const int mask_size = Map_out * Channel * K * K * sizeof(float);
  124.     const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);
  125.  
  126.     cudaMalloc((void**)device_input_ptr, input_size);
  127.     cudaMalloc((void**)device_mask_ptr, mask_size);
  128.     cudaMalloc((void**)device_output_ptr, output_size);
  129.  
  130.     cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
  131.     cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
  132. }
  133.  
  134.  
  135. __host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  136. {
  137.     const int Height_out = Height - K + 1;
  138.     const int Width_out = Width - K + 1;
  139.     const int Height_unrolled = Channel * K * K;
  140.     const int Width_unrolled = Batch * Height_out * Width_out;
  141.  
  142.     //allocating temping storage of unrolling matrix
  143.     float *unrolled_matrix;  // Pointer to device memory for storing the unrolled matrix
  144.     float *matmul_output;    // Pointer to device memory for storing the result of matrix multiplication
  145.     cudaMalloc((void**)&unrolled_matrix, (size_t) Batch * Channel * K * K * Height_out * Width_out * sizeof(float));
  146.     cudaMalloc((void**)&matmul_output, (Batch * Map_out * Height_out * Width_out) * sizeof(float));
  147.  
  148.     // TODO: Set the kernel dimensions and call the matrix unrolling kernel.
  149.     // dim3 gridDim((Channel * Width_unroll + BLOCK_SIZE - 1) / BLOCK_SIZE, Batch, 1);
  150.     dim3 blockDim(4,256,1);  
  151.     dim3 gridDim(
  152.     (Channel + blockDim.x - 1) / blockDim.x,                    // Maps dimension
  153.     (Height_out * Width_out + blockDim.y - 1) / blockDim.y,     // Combined Height/Width
  154.     ceil(1.0*Batch/blockDim.z));                                                      // Batch dimension
  155.  
  156.  
  157.     matrix_unrolling_kernel<<<gridDim, blockDim>>>(device_input, unrolled_matrix, Batch, Channel, Height, Width, K);
  158.  
  159.     // TODO: Set the kernel dimensions and call the matmul kernel
  160.     dim3 dimGrid((Width_unrolled - 1)/TILE_WIDTH + 1, (Map_out - 1)/TILE_WIDTH + 1, 1);
  161.     dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
  162.     matrixMultiplyShared<<<dimGrid, dimBlock>>>(device_mask, unrolled_matrix, matmul_output, Map_out, Height_unrolled, Height_unrolled, Width_unrolled,
  163.     Map_out, Width_unrolled);
  164.  
  165.     // Permute the result of matrix multiplication
  166.     const int out_image_size = Height_out * Width_out;
  167.     dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
  168.     matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(matmul_output, device_output, Map_out, Batch, out_image_size);
  169.  
  170.     cudaFree(matmul_output);
  171.     cudaFree(unrolled_matrix);
  172. }
  173.  
  174.  
  175. __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  176. {
  177.  
  178.     // Calculate output size
  179.     const int Height_out = Height - K + 1;
  180.     const int Width_out = Width - K + 1;
  181.     const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);
  182.  
  183.     // TODO: Copy the output back to host
  184.     cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);
  185.  
  186.     // TODO: Free device memory
  187.     cudaFree(device_output);
  188.     cudaFree(device_input);
  189.     cudaFree(device_mask);
  190. }
  191.  
  192.  
  193. __host__ void GPUInterface::get_device_properties()
  194. {
  195.     int deviceCount;
  196.     cudaGetDeviceCount(&deviceCount);
  197.  
  198.     for(int dev = 0; dev < deviceCount; dev++)
  199.     {
  200.         cudaDeviceProp deviceProp;
  201.         cudaGetDeviceProperties(&deviceProp, dev);
  202.  
  203.         std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
  204.         std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
  205.         std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
  206.         std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
  207.         std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
  208.         std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
  209.         std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
  210.         std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
  211.         std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
  212.     }
  213. }
  214.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement