Advertisement
phystota

m3-forward_skeleton

Nov 19th, 2024
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 7.31 KB | None | 0 0
#include <cmath>
#include <cstdlib>
#include <iostream>
#include "gpu-new-forward.h"
  4.  
  5. #define TILE_WIDTH 16
  6. #define BLOCK_SIZE 256
  7.  
  8. __global__ void matrix_unrolling_kernel(const float *input, float *output,
  9.                                         const int Batch, const int Channel,
  10.                                         const int Height, const int Width,
  11.                                         const int K) {
  12.     /*
  13.     Modify this function to implement the input matrix unrolling kernel.
  14.  
  15.     Function paramter definitions:
  16.     input - input
  17.     output - output
  18.     Batch - batch_size (number of images in x)
  19.     Channel - number of input feature maps
  20.     Height - input height dimension
  21.     Width - input width dimension
  22.     K - kernel height and width (K x K)
  23.     */
  24.     const int Height_out = Height - K + 1;
  25.     const int Width_out = Width - K + 1;
  26.     (void)Height_out; // silence declared but never referenced warning. remove this line when you start working
  27.     (void)Width_out; // silence declared but never referenced warning. remove this line when you start working
  28.  
  29.     // We have some nice #defs for you below to simplify indexing. Feel free to use them, or create your own.
  30.     // An example use of these macros:
  31.     // float a = in_4d(0,0,0,0)
  32.  
  33.     #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
  34.  
  35.     // TODO: Insert your input matrix unrolling kernel code here
  36.    
  37.  
  38.     #undef in_4d
  39. }
  40.  
  41. // Tiled matrix multiplication kernel. Computes C = AB
  42. // You don't need to modify this kernel.
  43. __global__ void matrixMultiplyShared(const float *A, const float *B, float *C,
  44.                                      int numARows, int numAColumns,
  45.                                      int numBRows, int numBColumns,
  46.                                      int numCRows, int numCColumns)
  47. {
  48.     __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
  49.     __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];
  50.  
  51.     int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;
  52.  
  53.     int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
  54.     float val = 0;
  55.  
  56.     for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
  57.         if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
  58.             tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx];
  59.         } else {
  60.             tileA[ty][tx] = 0;
  61.         }
  62.         if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
  63.             tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
  64.         } else {
  65.             tileB[ty][tx] = 0;
  66.         }
  67.         __syncthreads();
  68.  
  69.         if (row < numCRows && col < numCColumns) {
  70.             for (int i = 0; i < TILE_WIDTH; i++) {
  71.                 val += tileA[ty][i] * tileB[i][tx];
  72.             }
  73.         }
  74.         __syncthreads();
  75.     }
  76.  
  77.     if (row < numCRows && col < numCColumns) {
  78.         C[row * numCColumns + col] = val;
  79.     }
  80. }
  81.  
  82. // Permutes the matmul result.
  83. // The output feature map after matmul is of shape Map_out x Batch x Height_out x Width_out,
  84. // and we need to permute it into Batch x Map_out x Height_out x Width_out.
  85. // You don't need to modify this kernel.
  86. __global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
  87.                                       int Batch, int image_size) {
  88.     int b = blockIdx.y;
  89.     int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  90.     if (x < image_size) {
  91.         for (int m = 0; m < Map_out; m++) {
  92.             output[b * Map_out * image_size + m * image_size + x] =
  93.                     input[m * Batch * image_size + b * image_size + x];
  94.         }
  95.     }
  96. }
  97.  
  98. __host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  99. {
  100.     // TODO: Allocate memory and copy over the relevant data structures to the GPU
  101.  
  102.     // We pass double pointers for you to initialize the relevant device pointers,
  103.     //  which are passed to the other two functions.
  104.  
  105.     // Useful snippet for error checking
  106.     // cudaError_t error = cudaGetLastError();
  107.     // if(error != cudaSuccess)
  108.     // {
  109.     //     std::cout<<"CUDA error: "<<cudaGetErrorString(error)<<std::endl;
  110.     //     exit(-1);
  111.     // }
  112.  
  113. }
  114.  
  115.  
  116. __host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  117. {
  118.     const int Height_out = Height - K + 1;
  119.     const int Width_out = Width - K + 1;
  120.     const int Height_unrolled = Channel * K * K;
  121.     const int Width_unrolled = Batch * Height_out * Width_out;
  122.  
  123.     float *unrolled_matrix;  // Pointer to device memory for storing the unrolled matrix
  124.     float *matmul_output;    // Pointer to device memory for storing the result of matrix multiplication
  125.     cudaMalloc((void**)&unrolled_matrix, (size_t) Batch * Channel * K * K * Height_out * Width_out * sizeof(float));
  126.     cudaMalloc((void**)&matmul_output, (Batch * Map_out * Height_out * Width_out) * sizeof(float));
  127.  
  128.     // TODO: Set the kernel dimensions and call the matrix unrolling kernel.
  129.  
  130.     // TODO: Set the kernel dimensions and call the matmul kernel
  131.  
  132.     // Permute the result of matrix multiplication
  133.     const int out_image_size = Height_out * Width_out;
  134.     dim3 permute_kernel_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
  135.     matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(
  136.         matmul_output, device_output, Map_out, Batch, out_image_size
  137.     );
  138.  
  139.     cudaFree(matmul_output);
  140.     cudaFree(unrolled_matrix);
  141. }
  142.  
  143.  
  144. __host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
  145. {
  146.     // TODO: Copy the output back to host
  147.  
  148.     // TODO: Free device memory
  149.  
  150. }
  151.  
  152.  
  153. __host__ void GPUInterface::get_device_properties()
  154. {
  155.     int deviceCount;
  156.     cudaGetDeviceCount(&deviceCount);
  157.  
  158.     for(int dev = 0; dev < deviceCount; dev++)
  159.     {
  160.         cudaDeviceProp deviceProp;
  161.         cudaGetDeviceProperties(&deviceProp, dev);
  162.  
  163.         std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
  164.         std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
  165.         std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
  166.         std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
  167.         std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
  168.         std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
  169.         std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
  170.         std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
  171.         std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
  172.     }
  173. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement