phystota
STREAMS??
Nov 26th, 2024

#include <cmath>
#include <cstring>
#include <iostream>
#include <algorithm>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512
#define NUM_STREAMS 4 // Number of CUDA streams

// Global variables shared between prolog and epilog
float *host_input_pinned;   // pinned (page-locked) staging buffer for the input
float *host_output_pinned;  // pinned (page-locked) staging buffer for the output
cudaStream_t streams[NUM_STREAMS];

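// im2col-style unrolling kernel: each thread handles one (channel, output pixel)
// pair for one image in the current chunk and copies the K x K input patch into
// a column of the unrolled matrix, so the convolution becomes a single GEMM.
// batch_start lets each CUDA stream unroll only its own slice of the batch.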
__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K, const int batch_start) {
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]

    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;
    const size_t H_unroll = Channel * K * K;

    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = batch_start + blockIdx.z;

    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= (batch_start + Batch)) {
        return;
    }

    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = (batch_idx - batch_start) * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}

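// Tiled shared-memory GEMM: C = A * B, where A is the (Map_out x Channel*K*K)
// filter matrix and B is the unrolled input produced by the kernel above.
// Each thread block computes one TILE_WIDTH x TILE_WIDTH tile of C.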
__global__ void matrixMultiplyShared(const float *A, const float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

    int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;

    int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
    float val = 0;

    for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
        // Cooperatively load one tile of A and one tile of B into shared memory,
        // padding with zeros past the matrix edges.
        if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
            tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx];
        } else {
            tileA[ty][tx] = 0;
        }
        if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
            tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
        } else {
            tileB[ty][tx] = 0;
        }
        __syncthreads();

        if (row < numCRows && col < numCColumns) {
            for (int i = 0; i < TILE_WIDTH; i++) {
                val += tileA[ty][i] * tileB[i][tx];
            }
        }
        __syncthreads();
    }

    if (row < numCRows && col < numCColumns) {
        C[(size_t) row * numCColumns + col] = val;  // size_t cast to match the A/B indexing above
    }
}

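// The GEMM above leaves the result laid out as (Map_out, batch_size * image_size).
// This kernel permutes it into the expected (Batch, Map_out, image_size) layout,
// again restricted to the chunk that starts at batch_start.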
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                      int Batch, int image_size, int batch_start) {
    int b = batch_start + blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            output[(size_t) b * Map_out * image_size + (size_t) m * image_size + x] =
                    input[(size_t) m * Batch * image_size + (size_t) (b - batch_start) * image_size + x];
        }
    }
}

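// Runs the full convolution pipeline (unroll -> GEMM -> permute) for one chunk
// of the batch, with every kernel enqueued on the caller-supplied stream.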
__host__ void conv_forward_gpu_part(float *device_output, const float *device_input, const float *device_mask,
                                    const int batch_start, const int batch_size,
                                    const int Map_out, const int Channel, const int Height, const int Width, const int K,
                                    cudaStream_t stream)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = batch_size * Height_out * Width_out;

    // Allocate temporary storage for the unrolled matrix and the matmul output.
    // Note: cudaMalloc/cudaFree are blocking host calls, which limits how much
    // work from different streams can actually overlap here.
    float *unrolled_matrix;
    float *matmul_output;
    cudaMalloc((void**)&unrolled_matrix, (size_t) Height_unrolled * Width_unrolled * sizeof(float));
    cudaMalloc((void**)&matmul_output, (size_t) Map_out * Width_unrolled * sizeof(float));

    // Launch matrix unrolling kernel
    dim3 blockDim(16, 16, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,
        batch_size
    );
    matrix_unrolling_kernel<<<gridDim, blockDim, 0, stream>>>(
        device_input, unrolled_matrix,
        batch_size, Channel, Height, Width, K, batch_start
    );

    // Launch matrix multiplication kernel: (Map_out x Height_unrolled) * (Height_unrolled x Width_unrolled)
    dim3 dimGrid((Width_unrolled - 1)/TILE_WIDTH + 1, (Map_out - 1)/TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    matrixMultiplyShared<<<dimGrid, dimBlock, 0, stream>>>(
        device_mask, unrolled_matrix, matmul_output,
        Map_out, Height_unrolled,
        Height_unrolled, Width_unrolled,
        Map_out, Width_unrolled
    );

    // Launch matrix permute kernel
    const int out_image_size = Height_out * Width_out;
    dim3 permute_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, batch_size, 1);
    matrix_permute_kernel<<<permute_grid_dim, BLOCK_SIZE, 0, stream>>>(
        matmul_output, device_output, Map_out, batch_size, out_image_size, batch_start
    );

    // Free temporary device memory
    cudaFree(unrolled_matrix);
    cudaFree(matmul_output);
}

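// The prolog does all of the work for this implementation: it stages the input in
// pinned host memory, creates NUM_STREAMS streams, splits the batch into chunks,
// and for each chunk enqueues async H2D copy -> kernels -> async D2H copy on its
// own stream so transfers and compute from different chunks are able to overlap.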
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask,
                                                    float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    // Calculate sizes
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t input_size = (size_t) Batch * Channel * Height * Width * sizeof(float);
    const size_t output_size = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);
    const size_t mask_size = (size_t) Map_out * Channel * K * K * sizeof(float);

    // Allocate pinned host memory (required for truly asynchronous memcpy)
    cudaMallocHost((void**)&host_input_pinned, input_size);
    cudaMallocHost((void**)&host_output_pinned, output_size);
    memcpy(host_input_pinned, host_input, input_size);

    // Allocate device memory
    cudaMalloc((void**)device_input_ptr, input_size);
    cudaMalloc((void**)device_output_ptr, output_size);
    cudaMalloc((void**)device_mask_ptr, mask_size);

    // Copy mask to device
    cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);

    // Create CUDA streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamCreate(&streams[i]);
    }

    // Divide the batch into chunks, one per stream
    int chunk_size = (Batch + NUM_STREAMS - 1) / NUM_STREAMS;

    for (int i = 0; i < NUM_STREAMS; i++) {
        int batch_start = i * chunk_size;
        int batch_end = std::min(batch_start + chunk_size, Batch);
        int batch_size = batch_end - batch_start;

        if (batch_size > 0) {
            size_t input_offset = (size_t) batch_start * Channel * Height * Width;
            size_t output_offset = (size_t) batch_start * Map_out * Height_out * Width_out;
            size_t input_chunk_size = (size_t) batch_size * Channel * Height * Width * sizeof(float);
            size_t output_chunk_size = (size_t) batch_size * Map_out * Height_out * Width_out * sizeof(float);

            // Asynchronously copy this chunk's input data to the device
            cudaMemcpyAsync(*device_input_ptr + input_offset, host_input_pinned + input_offset,
                            input_chunk_size, cudaMemcpyHostToDevice, streams[i]);

            // Launch the unroll/matmul/permute kernels for this chunk in the same stream
            conv_forward_gpu_part(*device_output_ptr, *device_input_ptr, *device_mask_ptr,
                                  batch_start, batch_size, Map_out, Channel, Height, Width, K, streams[i]);

            // Asynchronously copy this chunk's output back to pinned host memory
            cudaMemcpyAsync(host_output_pinned + output_offset, *device_output_ptr + output_offset,
                            output_chunk_size, cudaMemcpyDeviceToHost, streams[i]);
        }
    }

    // Synchronize all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamSynchronize(streams[i]);
    }

    // Copy the output from pinned memory to the original host_output buffer
    memcpy((void*)host_output, host_output_pinned, output_size);
}

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask,
                                             const int Batch, const int Map_out, const int Channel,
                                             const int Height, const int Width, const int K)
{
    // Intentionally empty: all of the work is handled within conv_forward_gpu_prolog
}

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input,
                                                    float *device_mask, const int Batch, const int Map_out,
                                                    const int Channel, const int Height, const int Width, const int K)
{
    // Free device memory
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);

    // Free pinned host memory
    cudaFreeHost(host_input_pinned);
    cudaFreeHost(host_output_pinned);

    // Destroy streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamDestroy(streams[i]);
    }
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}