phystota

tensor_cores_doesn't_work

Dec 6th, 2024
#include <cmath>
#include <cstdlib>      // Required for exit()
#include <iostream>
#include <cuda_fp16.h>  // Required for half precision operations
#include <mma.h>        // Required for tensor core (WMMA) operations
#include "gpu-new-forward.h"

using namespace nvcuda;  // Namespace for tensor core operations

#define TILE_WIDTH 16   // Tile width compatible with tensor cores
#define BLOCK_SIZE 512  // Block size for other kernels

// Error checking macro
#define CUDA_CHECK_ERROR(call)                                                        \
    do {                                                                              \
        cudaError_t err = call;                                                       \
        if (err != cudaSuccess) {                                                     \
            std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << " - "     \
                      << cudaGetErrorString(err) << std::endl;                        \
            exit(EXIT_FAILURE);                                                       \
        }                                                                             \
    } while (0)

// Original matrix unrolling kernel (unchanged)
__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K) {
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]

    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;

    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= Batch) {
        return;
    }

    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = batch_idx * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}

// Helper kernel to convert float to half precision (same dense layout)
__global__ void convertToHalf(half *out, const float *in, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        out[idx] = __float2half(in[idx]);
    }
}

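// Added helper (not in the original paste): the WMMA path below loads full
// 16x16 tiles, so every matrix dimension must be rounded up to a multiple of 16.
// This is a minimal sketch of a padding-aware conversion: it copies a dense
// row-major float matrix (rows x cols) into a row-major half matrix
// (rows_pad x cols_pad) and zero-fills the padded border so the extra rows and
// columns do not affect the matrix product.
__global__ void convertToHalfPadded(half *out, const float *in,
                                    int rows, int cols,
                                    int rows_pad, int cols_pad) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < rows_pad && col < cols_pad) {
        float value = (row < rows && col < cols) ? in[(size_t)row * cols + col] : 0.0f;
        out[(size_t)row * cols_pad + col] = __float2half(value);
    }
}
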
// Matrix multiplication kernel using tensor cores (WMMA).
// Computes C = A * B where A is numARows x numAColumns, B is numBRows x numBColumns,
// and C is numCRows x numCColumns, all row-major. All dimensions are assumed to be
// padded to multiples of 16 (see convertToHalfPadded above), so every fragment
// load and store touches a full, in-bounds 16x16 tile.
__global__ void matrixMultiplyTensorCores(const half *A, const half *B, float *C,
                                         int numARows, int numAColumns,
                                         int numBRows, int numBColumns,
                                         int numCRows, int numCColumns)
{
    // Tensor core operation dimensions (16x16x16 half-precision MMA)
    const int WMMA_M = 16;
    const int WMMA_N = 16;
    const int WMMA_K = 16;

    // One warp computes one 16x16 tile of C. The launch uses blockDim.x as a
    // multiple of warpSize, so warps along x index tile columns of C and
    // threadIdx.y indexes tile rows of C.
    int warpN = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;  // tile column of C
    int warpM = blockIdx.y * blockDim.y + threadIdx.y;               // tile row of C

    // Declare fragments (A and B are both row-major in memory)
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;

    // Initialize the accumulator to zero
    wmma::fill_fragment(acc_frag, 0.0f);

    // Starting row (in A and C) and starting column (in B and C) of this warp's tile
    int aRow = warpM * WMMA_M;
    int bCol = warpN * WMMA_N;

    // Out-of-range warps exit as a whole, so this early return does not split
    // a warp across the collective WMMA operations below.
    if (aRow >= numCRows || bCol >= numCColumns) {
        return;
    }

    // Loop over the shared dimension in 16-wide steps
    for (int k = 0; k < numAColumns; k += WMMA_K) {
        // Load one 16x16 tile of A and one 16x16 tile of B
        wmma::load_matrix_sync(a_frag, A + (size_t)aRow * numAColumns + k, numAColumns);
        wmma::load_matrix_sync(b_frag, B + (size_t)k * numBColumns + bCol, numBColumns);

        // Accumulate the tile product
        wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }

    // Store the 16x16 output tile
    wmma::store_matrix_sync(C + (size_t)aRow * numCColumns + bCol, acc_frag,
                            numCColumns, wmma::mem_row_major);
}

// Matrix permute kernel: reorders the matmul result from a
// (Map_out x Batch*image_size) row-major matrix with leading dimension
// input_ld into the (Batch x Map_out x image_size) output layout.
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                      int Batch, int image_size, int input_ld) {
    int b = blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            output[(size_t)b * Map_out * image_size + (size_t)m * image_size + x] =
                    input[(size_t)m * input_ld + (size_t)b * image_size + x];
        }
    }
}

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output,
    const float *host_input, const float *host_mask, float **device_output_ptr,
    float **device_input_ptr, float **device_mask_ptr, const int Batch,
    const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    // Calculate output dimensions
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // Calculate memory sizes (size_t arithmetic to avoid integer overflow)
    const size_t input_size = static_cast<size_t>(Batch) * Channel * Height * Width * sizeof(float);
    const size_t mask_size = static_cast<size_t>(Map_out) * Channel * K * K * sizeof(float);
    const size_t output_size = static_cast<size_t>(Batch) * Map_out * Height_out * Width_out * sizeof(float);

    // Allocate device memory with error checking
    CUDA_CHECK_ERROR(cudaMalloc((void**)device_input_ptr, input_size));
    CUDA_CHECK_ERROR(cudaMalloc((void**)device_mask_ptr, mask_size));
    CUDA_CHECK_ERROR(cudaMalloc((void**)device_output_ptr, output_size));

    // Copy input data to device with error checking
    CUDA_CHECK_ERROR(cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice));
    CUDA_CHECK_ERROR(cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice));
}

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
    const float *device_mask, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;                 // rows of B / columns of A
    const int Width_unrolled = Batch * Height_out * Width_out;   // columns of B and C

    // WMMA works on 16x16x16 tiles, so pad every matrix dimension up to a
    // multiple of 16. The padded regions are zero-filled and therefore do not
    // change the result.
    const int M_pad = ((Map_out + 15) / 16) * 16;          // padded rows of A and C
    const int K_pad = ((Height_unrolled + 15) / 16) * 16;  // padded shared dimension
    const int N_pad = ((Width_unrolled + 15) / 16) * 16;   // padded columns of B and C

    // Allocate intermediate matrices
    float *unrolled_matrix;  // dense float im2col matrix (Height_unrolled x Width_unrolled)
    float *matmul_output;    // padded float result of A*B (M_pad x N_pad)
    half *A_half, *B_half;   // zero-padded half-precision copies for the tensor cores

    size_t unrolled_size = static_cast<size_t>(Height_unrolled) * Width_unrolled * sizeof(float);
    size_t matmul_size = static_cast<size_t>(M_pad) * N_pad * sizeof(float);

    CUDA_CHECK_ERROR(cudaMalloc((void**)&unrolled_matrix, unrolled_size));
    CUDA_CHECK_ERROR(cudaMalloc((void**)&matmul_output, matmul_size));

    // Allocate half-precision (padded) memory
    size_t A_half_size = static_cast<size_t>(M_pad) * K_pad * sizeof(half);
    size_t B_half_size = static_cast<size_t>(K_pad) * N_pad * sizeof(half);
    CUDA_CHECK_ERROR(cudaMalloc((void**)&A_half, A_half_size));
    CUDA_CHECK_ERROR(cudaMalloc((void**)&B_half, B_half_size));

    // Set dimensions for unrolling
    dim3 blockDim_unroll(16, 16, 1);
    dim3 gridDim_unroll(
        (Channel + blockDim_unroll.x - 1) / blockDim_unroll.x,
        (Height_out * Width_out + blockDim_unroll.y - 1) / blockDim_unroll.y,
        Batch
    );

    // Perform matrix unrolling (im2col) into the dense float matrix
    matrix_unrolling_kernel<<<gridDim_unroll, blockDim_unroll>>>(device_input, unrolled_matrix,
        Batch, Channel, Height, Width, K);
    CUDA_CHECK_ERROR(cudaGetLastError());
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());

    // Convert the mask (A: Map_out x Height_unrolled) and the unrolled input
    // (B: Height_unrolled x Width_unrolled) to half precision, zero-padding
    // them to the tile-aligned sizes expected by the tensor-core kernel.
    dim3 convBlock(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 convGridA((K_pad + convBlock.x - 1) / convBlock.x,
                   (M_pad + convBlock.y - 1) / convBlock.y, 1);
    convertToHalfPadded<<<convGridA, convBlock>>>(A_half, device_mask,
        Map_out, Height_unrolled, M_pad, K_pad);
    CUDA_CHECK_ERROR(cudaGetLastError());

    dim3 convGridB((N_pad + convBlock.x - 1) / convBlock.x,
                   (K_pad + convBlock.y - 1) / convBlock.y, 1);
    convertToHalfPadded<<<convGridB, convBlock>>>(B_half, unrolled_matrix,
        Height_unrolled, Width_unrolled, K_pad, N_pad);
    CUDA_CHECK_ERROR(cudaGetLastError());
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());

    // Set dimensions for the tensor-core matrix multiplication: one warp per
    // 16x16 tile of C, with warps along x covering columns and threadIdx.y
    // covering rows. blockDim.x must be a multiple of the warp size (32).
    dim3 dimBlock(128, 4, 1);
    const int colsPerBlock = 16 * (dimBlock.x / 32);  // 64 columns of C per block
    const int rowsPerBlock = 16 * dimBlock.y;         // 64 rows of C per block
    dim3 dimGrid((N_pad + colsPerBlock - 1) / colsPerBlock,
                 (M_pad + rowsPerBlock - 1) / rowsPerBlock, 1);

    // Perform matrix multiplication using tensor cores on the padded matrices
    matrixMultiplyTensorCores<<<dimGrid, dimBlock>>>(A_half, B_half, matmul_output,
        M_pad, K_pad,
        K_pad, N_pad,
        M_pad, N_pad);
    CUDA_CHECK_ERROR(cudaGetLastError());
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());

    // Permute the (unpadded part of the) result into the output layout.
    // N_pad is the leading dimension of matmul_output.
    const int out_image_size = Height_out * Width_out;
    dim3 permute_kernel_grid_dim((out_image_size + BLOCK_SIZE - 1) / BLOCK_SIZE, Batch, 1);
    matrix_permute_kernel<<<permute_kernel_grid_dim, BLOCK_SIZE>>>(matmul_output,
        device_output, Map_out, Batch, out_image_size, N_pad);
    CUDA_CHECK_ERROR(cudaGetLastError());
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());

    // Free intermediate buffers
    CUDA_CHECK_ERROR(cudaFree(matmul_output));
    CUDA_CHECK_ERROR(cudaFree(unrolled_matrix));
    CUDA_CHECK_ERROR(cudaFree(A_half));
    CUDA_CHECK_ERROR(cudaFree(B_half));
}

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch,
    const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t output_size = static_cast<size_t>(Batch) * Map_out * Height_out * Width_out * sizeof(float);

    // Copy output back to host with error checking
    CUDA_CHECK_ERROR(cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost));

    // Free device memory with error checking
    CUDA_CHECK_ERROR(cudaFree(device_output));
    CUDA_CHECK_ERROR(cudaFree(device_input));
    CUDA_CHECK_ERROR(cudaFree(device_mask));
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    CUDA_CHECK_ERROR(cudaGetDeviceCount(&deviceCount));

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        CUDA_CHECK_ERROR(cudaGetDeviceProperties(&deviceProp, dev));

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}

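// Added sketch (not part of the original GPUInterface): WMMA tensor-core
// operations require a device of compute capability 7.0 (Volta) or newer.
// A hypothetical helper like this could be used to verify support before
// taking the tensor-core path.
__host__ bool deviceSupportsTensorCores(int dev)
{
    cudaDeviceProp prop;
    CUDA_CHECK_ERROR(cudaGetDeviceProperties(&prop, dev));
    return prop.major >= 7;  // sm_70+ exposes the half-precision WMMA MMA path
}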