#include <cmath>
#include <iostream>
#include <cuda_fp16.h>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512
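
// Illustrative addition, not part of the original paste: the CUDA runtime calls
// below do not check their return codes. A minimal error-checking macro such as
// this hypothetical CUDA_CHECK could wrap the cudaMalloc/cudaMemcpy calls (and a
// cudaGetLastError() check could follow each kernel launch) to surface failures
// early. This is a sketch, assuming aborting on error is acceptable here.
#include <cstdlib>
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t cuda_check_err = (call);                                   \
        if (cuda_check_err != cudaSuccess) {                                   \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(cuda_check_err)            \
                      << std::endl;                                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)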
// Helper kernel to convert FP32 to FP16 on the GPU
__global__ void convertFP32ToFP16(const float* input, half* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = __float2half(input[idx]);
    }
}

// Helper kernel to convert FP16 to FP32 on the GPU
__global__ void convertFP16ToFP32(const half* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = __half2float(input[idx]);
    }
}
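
// Illustrative addition, not part of the original paste: a hypothetical host
// wrapper that factors out the grid-size arithmetic used when launching the
// conversion kernels in the prolog and epilog below. The name
// launchConvertFP32ToFP16 and the 256-thread block size are assumptions of this
// sketch, chosen to match the launches already in this file.
inline void launchConvertFP32ToFP16(const float *src, half *dst, int size) {
    dim3 blockDim(256);
    dim3 gridDim((size + blockDim.x - 1) / blockDim.x);
    convertFP32ToFP16<<<gridDim, blockDim>>>(src, dst, size);
}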
// Fused unroll + matrix-multiply convolution kernel operating on FP16 data.
// The reduction (unroll) dimension H_unroll = Channel * K * K is processed two
// elements at a time as half2 pairs; this assumes H_unroll is even so that a
// pair never straddles two rows of the mask matrix.
__global__ void fused_unroll_matmul_kernel_fp16(const half2 *input, const half2 *mask, half2 *output,
                                                const int Batch, const int Map_out, const int Channel,
                                                const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int H_unroll = Channel * K * K;
    const int W_unroll = Height_out * Width_out;

    // Shared-memory tiles hold half2 pairs to enable vectorized arithmetic
    __shared__ half2 shared_mask[TILE_WIDTH][TILE_WIDTH];
    __shared__ half2 shared_input[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE_WIDTH + ty;   // output feature map
    const int col = blockIdx.x * TILE_WIDTH + tx;   // output pixel: h_out * Width_out + w_out
    const int batch_idx = blockIdx.z;

    const half *input_h = reinterpret_cast<const half *>(input);

    // Accumulate two partial sums per thread, one in each half2 lane
    half2 acc = __float2half2_rn(0.0f);
    const int numTiles = (H_unroll / 2 + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int tile = 0; tile < numTiles; tile++) {
        // The mask is laid out row-major as Map_out x H_unroll, so consecutive
        // unroll elements of one row are contiguous and can be read as half2.
        if (row < Map_out && (tile * TILE_WIDTH + tx) < (H_unroll / 2)) {
            shared_mask[ty][tx] = mask[row * (H_unroll / 2) + tile * TILE_WIDTH + tx];
        } else {
            shared_mask[ty][tx] = __float2half2_rn(0.0f);
        }

        if (col < W_unroll && (tile * TILE_WIDTH + ty) < (H_unroll / 2)) {
            const int h_out = col / Width_out;
            const int w_out = col % Width_out;
            const int u0 = (tile * TILE_WIDTH + ty) * 2;   // first unroll row of this half2 pair
            // The two unrolled input elements of a pair are generally not
            // adjacent in memory, so gather them individually and pack them.
            half pair[2];
            for (int i = 0; i < 2; i++) {
                const int u = u0 + i;
                const int c = u / (K * K);
                const int pixelOffset = u % (K * K);
                const int kh = pixelOffset / K;
                const int kw = pixelOffset % K;
                pair[i] = input_h[batch_idx * (Channel * Height * Width) +
                                  c * (Height * Width) +
                                  (h_out + kh) * Width +
                                  (w_out + kw)];
            }
            shared_input[ty][tx] = __halves2half2(pair[0], pair[1]);
        } else {
            shared_input[ty][tx] = __float2half2_rn(0.0f);
        }
        __syncthreads();

        if (row < Map_out && col < W_unroll) {
            for (int k = 0; k < TILE_WIDTH; k++) {
                // half2 arithmetic: multiply-accumulate both lanes at once
                acc = __hadd2(acc, __hmul2(shared_mask[ty][k], shared_input[k][tx]));
            }
        }
        __syncthreads();
    }

    if (row < Map_out && col < W_unroll) {
        // Reduce the two lanes into a single scalar and store one half per output element
        half *output_h = reinterpret_cast<half *>(output);
        output_h[batch_idx * (Map_out * Height_out * Width_out) +
                 row * (Height_out * Width_out) +
                 col] = __hadd(__low2half(acc), __high2half(acc));
    }
}
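
// Worked example (hypothetical layer sizes, for illustration only): with
// Channel = 4, K = 5, Height = Width = 32, the kernel sees
//   Height_out = Width_out = 32 - 5 + 1 = 28,
//   H_unroll   = 4 * 5 * 5 = 100   (even, as the half2 packing assumes),
//   W_unroll   = 28 * 28   = 784,
// so each thread block multiplies a TILE_WIDTH x TILE_WIDTH tile of the
// Map_out x 100 mask matrix against the 100 x 784 unrolled input, iterating
// over numTiles = (100/2 + 15) / 16 = 4 half2 tiles along the reduction axis.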
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
                                                    const float *host_mask, float **device_output_ptr,
                                                    float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    const int input_size = Batch * Channel * Height * Width;
    const int mask_size = Map_out * Channel * K * K;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Stage the FP32 input and mask on the device so they can be converted on the GPU
    float *device_input_fp32, *device_mask_fp32;
    cudaMalloc((void**)&device_input_fp32, input_size * sizeof(float));
    cudaMalloc((void**)&device_mask_fp32, mask_size * sizeof(float));
    cudaMemcpy(device_input_fp32, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_mask_fp32, host_mask, mask_size * sizeof(float), cudaMemcpyHostToDevice);

    // Allocate the FP16 buffers that the convolution kernel actually uses
    half *device_input_fp16, *device_mask_fp16, *device_output_fp16;
    cudaMalloc((void**)&device_input_fp16, input_size * sizeof(half));
    cudaMalloc((void**)&device_mask_fp16, mask_size * sizeof(half));
    cudaMalloc((void**)&device_output_fp16, output_size * sizeof(half));

    // Convert FP32 -> FP16 on the GPU using the helper kernels
    dim3 blockDim(256);
    dim3 gridDim((input_size + 255) / 256);
    convertFP32ToFP16<<<gridDim, blockDim>>>(device_input_fp32, device_input_fp16, input_size);
    gridDim.x = (mask_size + 255) / 256;
    convertFP32ToFP16<<<gridDim, blockDim>>>(device_mask_fp32, device_mask_fp16, mask_size);
    cudaDeviceSynchronize();

    // The FP32 staging buffers are no longer needed; free them to avoid leaks
    cudaFree(device_input_fp32);
    cudaFree(device_mask_fp32);

    // Hand the FP16 pointers back through the FP32-typed interface
    *device_input_ptr = (float*)device_input_fp16;
    *device_mask_ptr = (float*)device_mask_fp16;
    *device_output_ptr = (float*)device_output_fp16;
}
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
                                             const float *device_mask, const int Batch, const int Map_out,
                                             const int Channel, const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int W_unroll = Height_out * Width_out;

    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 gridDim(
        (W_unroll + TILE_WIDTH - 1) / TILE_WIDTH,
        (Map_out + TILE_WIDTH - 1) / TILE_WIDTH,
        Batch
    );

    // The device pointers hold FP16 data (set up in the prolog);
    // cast them to half2 pointers for vectorized operations
    fused_unroll_matmul_kernel_fp16<<<gridDim, blockDim>>>(
        (half2*)device_input, (half2*)device_mask, (half2*)device_output,
        Batch, Map_out, Channel, Height, Width, K
    );
}
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
                                                    float *device_input, float *device_mask,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Allocate a temporary FP32 buffer on the device
    float* device_output_fp32;
    cudaMalloc((void**)&device_output_fp32, output_size * sizeof(float));

    // Convert the results back to FP32 on the GPU
    dim3 blockDim(256);
    dim3 gridDim((output_size + 255) / 256);
    convertFP16ToFP32<<<gridDim, blockDim>>>((half*)device_output, device_output_fp32, output_size);

    // Copy the final FP32 results back to the host
    cudaMemcpy(host_output, device_output_fp32, output_size * sizeof(float), cudaMemcpyDeviceToHost);

    // Clean up all device allocations
    cudaFree(device_output_fp32);
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);
}