#include <cmath>
#include <iostream>
#include <cuda_fp16.h>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512
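
// Illustrative addition, not part of the original paste: the CUDA runtime calls
// below do not check their return codes. A minimal error-checking macro such as
// this hypothetical CUDA_CHECK could wrap the cudaMalloc/cudaMemcpy calls (and a
// cudaGetLastError() check could follow each kernel launch) to surface failures
// early. This is a sketch, assuming aborting on error is acceptable here.
#include <cstdlib>
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t cuda_check_err = (call);                                   \
        if (cuda_check_err != cudaSuccess) {                                   \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(cuda_check_err)            \
                      << std::endl;                                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)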
// Helper kernel to convert FP32 to FP16 on the GPU
__global__ void convertFP32ToFP16(const float* input, half* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = __float2half(input[idx]);
    }
}

// Helper kernel to convert FP16 to FP32 on the GPU
__global__ void convertFP16ToFP32(const half* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = __half2float(input[idx]);
    }
}
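
// Illustrative addition, not part of the original paste: a hypothetical host
// wrapper that factors out the grid-size arithmetic used when launching the
// conversion kernels in the prolog and epilog below. The name
// launchConvertFP32ToFP16 and the 256-thread block size are assumptions of this
// sketch, chosen to match the launches already in this file.
inline void launchConvertFP32ToFP16(const float *src, half *dst, int size) {
    dim3 blockDim(256);
    dim3 gridDim((size + blockDim.x - 1) / blockDim.x);
    convertFP32ToFP16<<<gridDim, blockDim>>>(src, dst, size);
}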
// Fused unroll + matrix-multiply convolution kernel operating on FP16 data.
// The reduction (unroll) dimension H_unroll = Channel * K * K is processed two
// elements at a time as half2 pairs; this assumes H_unroll is even so that a
// pair never straddles two rows of the mask matrix.
__global__ void fused_unroll_matmul_kernel_fp16(const half2 *input, const half2 *mask, half2 *output,
                                                const int Batch, const int Map_out, const int Channel,
                                                const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int H_unroll = Channel * K * K;
    const int W_unroll = Height_out * Width_out;

    // Shared-memory tiles hold half2 pairs to enable vectorized arithmetic
    __shared__ half2 shared_mask[TILE_WIDTH][TILE_WIDTH];
    __shared__ half2 shared_input[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE_WIDTH + ty;   // output feature map
    const int col = blockIdx.x * TILE_WIDTH + tx;   // output pixel: h_out * Width_out + w_out
    const int batch_idx = blockIdx.z;

    const half *input_h = reinterpret_cast<const half *>(input);

    // Accumulate two partial sums per thread, one in each half2 lane
    half2 acc = __float2half2_rn(0.0f);
    const int numTiles = (H_unroll / 2 + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int tile = 0; tile < numTiles; tile++) {
        // The mask is laid out row-major as Map_out x H_unroll, so consecutive
        // unroll elements of one row are contiguous and can be read as half2.
        if (row < Map_out && (tile * TILE_WIDTH + tx) < (H_unroll / 2)) {
            shared_mask[ty][tx] = mask[row * (H_unroll / 2) + tile * TILE_WIDTH + tx];
        } else {
            shared_mask[ty][tx] = __float2half2_rn(0.0f);
        }

        if (col < W_unroll && (tile * TILE_WIDTH + ty) < (H_unroll / 2)) {
            const int h_out = col / Width_out;
            const int w_out = col % Width_out;
            const int u0 = (tile * TILE_WIDTH + ty) * 2;   // first unroll row of this half2 pair
            // The two unrolled input elements of a pair are generally not
            // adjacent in memory, so gather them individually and pack them.
            half pair[2];
            for (int i = 0; i < 2; i++) {
                const int u = u0 + i;
                const int c = u / (K * K);
                const int pixelOffset = u % (K * K);
                const int kh = pixelOffset / K;
                const int kw = pixelOffset % K;
                pair[i] = input_h[batch_idx * (Channel * Height * Width) +
                                  c * (Height * Width) +
                                  (h_out + kh) * Width +
                                  (w_out + kw)];
            }
            shared_input[ty][tx] = __halves2half2(pair[0], pair[1]);
        } else {
            shared_input[ty][tx] = __float2half2_rn(0.0f);
        }
        __syncthreads();

        if (row < Map_out && col < W_unroll) {
            for (int k = 0; k < TILE_WIDTH; k++) {
                // half2 arithmetic: multiply-accumulate both lanes at once
                acc = __hadd2(acc, __hmul2(shared_mask[ty][k], shared_input[k][tx]));
            }
        }
        __syncthreads();
    }

    if (row < Map_out && col < W_unroll) {
        // Reduce the two lanes into a single scalar and store one half per output element
        half *output_h = reinterpret_cast<half *>(output);
        output_h[batch_idx * (Map_out * Height_out * Width_out) +
                 row * (Height_out * Width_out) +
                 col] = __hadd(__low2half(acc), __high2half(acc));
    }
}
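
// Worked example (hypothetical layer sizes, for illustration only): with
// Channel = 4, K = 5, Height = Width = 32, the kernel sees
//   Height_out = Width_out = 32 - 5 + 1 = 28,
//   H_unroll   = 4 * 5 * 5 = 100   (even, as the half2 packing assumes),
//   W_unroll   = 28 * 28   = 784,
// so each thread block multiplies a TILE_WIDTH x TILE_WIDTH tile of the
// Map_out x 100 mask matrix against the 100 x 784 unrolled input, iterating
// over numTiles = (100/2 + 15) / 16 = 4 half2 tiles along the reduction axis.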
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
                                                    const float *host_mask, float **device_output_ptr,
                                                    float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    const int input_size = Batch * Channel * Height * Width;
    const int mask_size = Map_out * Channel * K * K;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Stage the FP32 input and mask on the device so they can be converted on the GPU
    float *device_input_fp32, *device_mask_fp32;
    cudaMalloc((void**)&device_input_fp32, input_size * sizeof(float));
    cudaMalloc((void**)&device_mask_fp32, mask_size * sizeof(float));
    cudaMemcpy(device_input_fp32, host_input, input_size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(device_mask_fp32, host_mask, mask_size * sizeof(float), cudaMemcpyHostToDevice);

    // Allocate the FP16 buffers that the convolution kernel actually uses
    half *device_input_fp16, *device_mask_fp16, *device_output_fp16;
    cudaMalloc((void**)&device_input_fp16, input_size * sizeof(half));
    cudaMalloc((void**)&device_mask_fp16, mask_size * sizeof(half));
    cudaMalloc((void**)&device_output_fp16, output_size * sizeof(half));

    // Convert FP32 -> FP16 on the GPU using the helper kernels
    dim3 blockDim(256);
    dim3 gridDim((input_size + 255) / 256);
    convertFP32ToFP16<<<gridDim, blockDim>>>(device_input_fp32, device_input_fp16, input_size);
    gridDim.x = (mask_size + 255) / 256;
    convertFP32ToFP16<<<gridDim, blockDim>>>(device_mask_fp32, device_mask_fp16, mask_size);
    cudaDeviceSynchronize();

    // The FP32 staging buffers are no longer needed; free them to avoid leaks
    cudaFree(device_input_fp32);
    cudaFree(device_mask_fp32);

    // Hand the FP16 pointers back through the FP32-typed interface
    *device_input_ptr = (float*)device_input_fp16;
    *device_mask_ptr = (float*)device_mask_fp16;
    *device_output_ptr = (float*)device_output_fp16;
}
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
                                             const float *device_mask, const int Batch, const int Map_out,
                                             const int Channel, const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int W_unroll = Height_out * Width_out;

    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 gridDim(
        (W_unroll + TILE_WIDTH - 1) / TILE_WIDTH,
        (Map_out + TILE_WIDTH - 1) / TILE_WIDTH,
        Batch
    );

    // The device pointers hold FP16 data (set up in the prolog);
    // cast them to half2 pointers for vectorized operations
    fused_unroll_matmul_kernel_fp16<<<gridDim, blockDim>>>(
        (half2*)device_input, (half2*)device_mask, (half2*)device_output,
        Batch, Map_out, Channel, Height, Width, K
    );
}
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
                                                    float *device_input, float *device_mask,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int output_size = Batch * Map_out * Height_out * Width_out;

    // Allocate a temporary FP32 buffer on the device
    float* device_output_fp32;
    cudaMalloc((void**)&device_output_fp32, output_size * sizeof(float));

    // Convert the results back to FP32 on the GPU
    dim3 blockDim(256);
    dim3 gridDim((output_size + 255) / 256);
    convertFP16ToFP32<<<gridDim, blockDim>>>((half*)device_output, device_output_fp32, output_size);

    // Copy the final FP32 results back to the host
    cudaMemcpy(host_output, device_output_fp32, output_size * sizeof(float), cudaMemcpyDeviceToHost);

    // Clean up all device allocations
    cudaFree(device_output_fp32);
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);
}