// FP16_working

#include <cmath>
#include <cstdlib>
#include <iostream>
#include "gpu-new-forward.h"
#include <cuda_fp16.h> // Added for FP16 support

// Edge length of the square shared-memory tiles in matrixMultiplyShared_fp16;
// that kernel is launched with TILE_WIDTH x TILE_WIDTH threads per block.
#define TILE_WIDTH 16
// Threads per block for matrix_permute_kernel. The value is used both in the
// kernel's index arithmetic and in its launch configuration.
#define BLOCK_SIZE 512

// Constant-memory buffer for an FP16 copy of the convolution mask, filled by
// conv_forward_gpu_prolog through cudaMemcpyToSymbol.
// NOTE(review): no kernel visible in this file reads this buffer; the GEMM
// kernel is handed the global-memory mask instead.
// The capacity of 3136 halves is presumably 16 * 4 * 7 * 7
// (Map_out * Channel * K * K) - TODO confirm against the expected layer sizes.
__constant__ half KERNEL_DEVICE_CST[3136];

// Element-wise FP32 -> FP16 conversion kernel.
// Each thread converts at most one element:
//   output[i] = __float2half(input[i])  for i in [0, numElements).
// Expects a 1-D launch; threads whose index falls past numElements exit
// without touching memory.
__global__ void convertFloatToHalf(half *output, const float *input, const int numElements) {
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    if (element >= numElements) {
        return;
    }
    output[element] = __float2half(input[element]);
}

// im2col-style unrolling of an FP16 input tensor.
//
// input  : [Batch][Channel][Height][Width], contiguous.
// output : (Channel*K*K) rows by (Batch*Height_out*Width_out) columns,
//          row-major, where Height_out = Height-K+1 and Width_out = Width-K+1.
//          Row index    = c*K*K + p*K + q            (channel, mask row, mask col)
//          Column index = b*Height_out*Width_out + h_out*Width_out + w_out
//
// Launch layout (3-D): x covers channels, y covers flattened output pixels
// (h_out*Width_out + w_out), z covers batch images. Each thread copies the
// K*K window of one (image, channel, output pixel) triple. Out-of-range
// threads exit immediately. No arithmetic is performed on the half values;
// they are only moved.
__global__ void matrix_unrolling_kernel_fp16(const half *input, half *output,
                                           const int Batch, const int Channel,
                                           const int Height, const int Width,
                                           const int K) {
    // A mask larger than the image leaves no valid output pixel. Bail out
    // before these differences are converted to size_t, where they would wrap
    // to huge values (or to zero, making the modulo below divide by zero).
    const int height_out_signed = Height - K + 1;
    const int width_out_signed = Width - K + 1;
    if (height_out_signed <= 0 || width_out_signed <= 0) {
        return;
    }

    const size_t Height_out = height_out_signed;
    const size_t Width_out = width_out_signed;
    const size_t W_unroll = Height_out * Width_out;  // output pixels per image

    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    // hw_pos < W_unroll is equivalent to h_out < Height_out because
    // w_out = hw_pos % Width_out is always below Width_out.
    if (c >= Channel || hw_pos >= W_unroll || batch_idx >= Batch) {
        return;
    }

    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    // Start of the (batch_idx, c) image plane inside the input tensor.
    const half *plane = input + (batch_idx * Channel + c) * ((size_t)Height * Width);

    const size_t row_stride = (size_t)Batch * W_unroll;     // elements per output row
    const size_t column = batch_idx * W_unroll + hw_pos;    // this thread's output column
    const size_t first_row = c * K * K;                     // first output row of channel c

    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            const size_t row = first_row + p * K + q;
            output[row * row_stride + column] = plane[(h_out + p) * Width + (w_out + q)];
        }
    }
}

// Shared-memory tiled GEMM with FP16 operands and FP32 result: C = A * B.
//   A : numARows x numAColumns, row-major, half
//   B : numBRows x numBColumns, row-major, half
//   C : numCRows x numCColumns, row-major, float
//
// Launch layout: TILE_WIDTH x TILE_WIDTH threads per block; block (bx, by)
// produces the C tile starting at row by*TILE_WIDTH, column bx*TILE_WIDTH.
// Static shared memory: two TILE_WIDTH x TILE_WIDTH half tiles.
//
// Operands are stored and staged as half, but every product is formed and
// accumulated in FP32. A half-precision running sum loses precision as the
// reduction dimension grows and overflows once it exceeds the largest finite
// half value (65504).
__global__ void matrixMultiplyShared_fp16(const half *A, const half *B, float *C,
                                         int numARows, int numAColumns,
                                         int numBRows, int numBColumns,
                                         int numCRows, int numCColumns) {
    __shared__ half tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ half tileB[TILE_WIDTH][TILE_WIDTH];

    const int ty = threadIdx.y;
    const int tx = threadIdx.x;
    const int row = blockIdx.y * TILE_WIDTH + ty;
    const int col = blockIdx.x * TILE_WIDTH + tx;
    const bool writesOutput = row < numCRows && col < numCColumns;

    const int numTiles = (numAColumns - 1) / TILE_WIDTH + 1;
    float acc = 0.0f;

    for (int tileId = 0; tileId < numTiles; tileId++) {
        const int aCol = tileId * TILE_WIDTH + tx;  // column of A staged by this thread
        const int bRow = tileId * TILE_WIDTH + ty;  // row of B staged by this thread

        // Elements outside A or B are staged as zero so that partial tiles
        // contribute nothing to the dot product.
        if (row < numARows && aCol < numAColumns) {
            tileA[ty][tx] = A[(size_t)row * numAColumns + aCol];
        } else {
            tileA[ty][tx] = __float2half(0.0f);
        }

        if (col < numBColumns && bRow < numBRows) {
            tileB[ty][tx] = B[(size_t)bRow * numBColumns + col];
        } else {
            tileB[ty][tx] = __float2half(0.0f);
        }

        // Every thread of the block must reach both barriers, so they stay
        // outside the writesOutput branch.
        __syncthreads();

        if (writesOutput) {
            for (int i = 0; i < TILE_WIDTH; i++) {
                acc += __half2float(tileA[ty][i]) * __half2float(tileB[i][tx]);
            }
        }
        __syncthreads();
    }

    if (writesOutput) {
        // size_t index: row * numCColumns can exceed INT_MAX for large outputs.
        C[(size_t)row * numCColumns + col] = acc;
    }
}

// Reorders the FP32 GEMM result from [Map_out][Batch][image_size] layout to
// [Batch][Map_out][image_size] layout.
//
// Launch layout: blockIdx.y selects the batch image; the x dimension covers
// the image_size pixels of one feature map. The kernel computes its pixel
// index with BLOCK_SIZE, so it must be launched with BLOCK_SIZE threads per
// block. Each in-range thread copies its pixel for all Map_out feature maps.
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                    int Batch, int image_size) {
    const size_t b = blockIdx.y;
    const size_t x = (size_t)blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < (size_t)image_size) {
        // All flat indices are built in size_t; the int products
        // b * Map_out * image_size and m * Batch * image_size can overflow
        // for large batches.
        const size_t pixels = image_size;
        for (int m = 0; m < Map_out; m++) {
            output[(b * Map_out + m) * pixels + x] =
                    input[((size_t)m * Batch + b) * pixels + x];
        }
    }
}

// Allocates device buffers and uploads the input and mask as FP16.
//
// On return:
//   *device_output_ptr : FP32 buffer of Batch*Map_out*Height_out*Width_out floats
//   *device_input_ptr  : FP16 input  (a half* stored in a float* slot)
//   *device_mask_ptr   : FP16 mask   (a half* stored in a float* slot)
// conv_forward_gpu reinterprets the last two back to half*, so they must not
// be dereferenced as float. host_output is not used here.
//
// The FP16 mask is also mirrored into KERNEL_DEVICE_CST when it fits.
// Any CUDA failure is reported on stderr and terminates the process.
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask, float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) {
    #define PROLOG_CUDA_CHECK(call)                                             \
        do {                                                                    \
            const cudaError_t status_ = (call);                                 \
            if (status_ != cudaSuccess) {                                       \
                std::cerr << "CUDA error in conv_forward_gpu_prolog (line "     \
                          << __LINE__ << "): " << cudaGetErrorString(status_)   \
                          << std::endl;                                         \
                std::exit(EXIT_FAILURE);                                        \
            }                                                                   \
        } while (0)

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // Element counts are formed in size_t so the products cannot overflow int.
    const size_t input_count = (size_t)Batch * Channel * Height * Width;
    const size_t mask_count = (size_t)Map_out * Channel * K * K;
    const size_t output_count = (size_t)Batch * Map_out * Height_out * Width_out;

    // Persistent buffers handed back to the caller.
    half *device_input_fp16;
    half *device_mask_fp16;
    PROLOG_CUDA_CHECK(cudaMalloc(&device_input_fp16, input_count * sizeof(half)));
    PROLOG_CUDA_CHECK(cudaMalloc(&device_mask_fp16, mask_count * sizeof(half)));
    PROLOG_CUDA_CHECK(cudaMalloc(device_output_ptr, output_count * sizeof(float)));

    // Temporary FP32 staging buffers; the conversion to FP16 runs on the device.
    float *device_input_fp32;
    float *device_mask_fp32;
    PROLOG_CUDA_CHECK(cudaMalloc(&device_input_fp32, input_count * sizeof(float)));
    PROLOG_CUDA_CHECK(cudaMalloc(&device_mask_fp32, mask_count * sizeof(float)));

    PROLOG_CUDA_CHECK(cudaMemcpy(device_input_fp32, host_input, input_count * sizeof(float), cudaMemcpyHostToDevice));
    PROLOG_CUDA_CHECK(cudaMemcpy(device_mask_fp32, host_mask, mask_count * sizeof(float), cudaMemcpyHostToDevice));

    // convertFloatToHalf takes its element count as int, so the counts are
    // assumed to fit in int. Empty tensors are skipped because a zero-sized
    // grid is an invalid launch configuration.
    const int blockSize = 256;
    if (input_count > 0) {
        const unsigned int blocks = static_cast<unsigned int>((input_count + blockSize - 1) / blockSize);
        convertFloatToHalf<<<blocks, blockSize>>>(device_input_fp16, device_input_fp32, static_cast<int>(input_count));
        PROLOG_CUDA_CHECK(cudaGetLastError());
    }
    if (mask_count > 0) {
        const unsigned int blocks = static_cast<unsigned int>((mask_count + blockSize - 1) / blockSize);
        convertFloatToHalf<<<blocks, blockSize>>>(device_mask_fp16, device_mask_fp32, static_cast<int>(mask_count));
        PROLOG_CUDA_CHECK(cudaGetLastError());
    }

    // Mirror the FP16 mask into constant memory. The source is a device
    // pointer, so the copy kind must be device-to-device (the default kind is
    // host-to-device). The copy is skipped when the mask does not fit; the
    // global-memory copy returned through device_mask_ptr is still valid.
    const size_t mask_bytes = mask_count * sizeof(half);
    if (mask_bytes <= sizeof(KERNEL_DEVICE_CST)) {
        PROLOG_CUDA_CHECK(cudaMemcpyToSymbol(KERNEL_DEVICE_CST, device_mask_fp16, mask_bytes, 0, cudaMemcpyDeviceToDevice));
    } else {
        std::cerr << "conv_forward_gpu_prolog: mask (" << mask_bytes
                  << " bytes) does not fit in KERNEL_DEVICE_CST ("
                  << sizeof(KERNEL_DEVICE_CST) << " bytes); constant copy skipped" << std::endl;
    }

    // Store FP16 pointers in the provided FP32 pointer slots.
    *device_input_ptr = reinterpret_cast<float*>(device_input_fp16);
    *device_mask_ptr = reinterpret_cast<float*>(device_mask_fp16);

    // Release the staging buffers. A failure of the asynchronous conversion
    // kernels is also reported by these calls.
    PROLOG_CUDA_CHECK(cudaFree(device_input_fp32));
    PROLOG_CUDA_CHECK(cudaFree(device_mask_fp32));

    #undef PROLOG_CUDA_CHECK
}

// Runs the convolution forward pass as unroll -> GEMM -> permute.
//
// device_input and device_mask carry half* values in float* slots (they are
// reinterpreted below); device_output receives FP32 results in
// [Batch][Map_out][Height_out][Width_out] order.
//
// Steps:
//   1. matrix_unrolling_kernel_fp16 builds the (Channel*K*K) x
//      (Batch*Height_out*Width_out) FP16 matrix.
//   2. matrixMultiplyShared_fp16 multiplies the Map_out x (Channel*K*K) mask
//      by that matrix into an FP32 temporary.
//   3. matrix_permute_kernel reorders the temporary into device_output.
// Any CUDA failure is reported on stderr and terminates the process.
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) {
    #define FORWARD_CUDA_CHECK(call)                                            \
        do {                                                                    \
            const cudaError_t status_ = (call);                                 \
            if (status_ != cudaSuccess) {                                       \
                std::cerr << "CUDA error in conv_forward_gpu (line "            \
                          << __LINE__ << "): " << cudaGetErrorString(status_)   \
                          << std::endl;                                         \
                std::exit(EXIT_FAILURE);                                        \
            }                                                                   \
        } while (0)

    // Cast back to FP16 pointers
    const half *input_fp16 = reinterpret_cast<const half*>(device_input);
    const half *mask_fp16 = reinterpret_cast<const half*>(device_mask);

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // Nothing to compute for an empty output; this also avoids launching
    // kernels with a zero-sized grid.
    if (Batch <= 0 || Map_out <= 0 || Height_out <= 0 || Width_out <= 0) {
        return;
    }

    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    // Byte counts are formed in size_t so the products cannot overflow int.
    const size_t unrolled_bytes = (size_t)Height_unrolled * Width_unrolled * sizeof(half);
    const size_t matmul_bytes = (size_t)Map_out * Width_unrolled * sizeof(float);

    half *unrolled_matrix;
    float *matmul_output;
    FORWARD_CUDA_CHECK(cudaMalloc(&unrolled_matrix, unrolled_bytes));
    FORWARD_CUDA_CHECK(cudaMalloc(&matmul_output, matmul_bytes));

    // Step 1: unrolling. x covers channels, y output pixels, z batch images.
    dim3 unroll_block(4, 256, 1);
    dim3 unroll_grid(
        (Channel + unroll_block.x - 1) / unroll_block.x,
        (Height_out * Width_out + unroll_block.y - 1) / unroll_block.y,
        (Batch + unroll_block.z - 1) / unroll_block.z
    );
    matrix_unrolling_kernel_fp16<<<unroll_grid, unroll_block>>>(
        input_fp16, unrolled_matrix,
        Batch, Channel, Height, Width, K
    );
    FORWARD_CUDA_CHECK(cudaGetLastError());

    // Step 2: GEMM, one TILE_WIDTH x TILE_WIDTH block per output tile.
    dim3 gemm_grid((Width_unrolled - 1) / TILE_WIDTH + 1, (Map_out - 1) / TILE_WIDTH + 1, 1);
    dim3 gemm_block(TILE_WIDTH, TILE_WIDTH, 1);
    matrixMultiplyShared_fp16<<<gemm_grid, gemm_block>>>(
        mask_fp16, unrolled_matrix, matmul_output,
        Map_out, Height_unrolled, Height_unrolled, Width_unrolled,
        Map_out, Width_unrolled
    );
    FORWARD_CUDA_CHECK(cudaGetLastError());

    // Step 3: permute [Map_out][Batch][pixels] -> [Batch][Map_out][pixels].
    const int out_image_size = Height_out * Width_out;
    dim3 permute_grid((out_image_size - 1) / BLOCK_SIZE + 1, Batch, 1);
    matrix_permute_kernel<<<permute_grid, BLOCK_SIZE>>>(
        matmul_output, device_output, Map_out, Batch, out_image_size
    );
    FORWARD_CUDA_CHECK(cudaGetLastError());

    // Release the temporaries. A failure of the asynchronous kernels above is
    // also reported by these calls.
    FORWARD_CUDA_CHECK(cudaFree(unrolled_matrix));
    FORWARD_CUDA_CHECK(cudaFree(matmul_output));

    #undef FORWARD_CUDA_CHECK
}

// Copies the FP32 result back to the host and releases the three device
// buffers passed in.
//
// host_output must have room for Batch*Map_out*Height_out*Width_out floats.
// device_input and device_mask are freed as opaque allocations, so it does
// not matter that they actually hold FP16 data.
// Any CUDA failure is reported on stderr and terminates the process.
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input, float *device_mask, const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K) {
    #define EPILOG_CUDA_CHECK(call)                                             \
        do {                                                                    \
            const cudaError_t status_ = (call);                                 \
            if (status_ != cudaSuccess) {                                       \
                std::cerr << "CUDA error in conv_forward_gpu_epilog (line "     \
                          << __LINE__ << "): " << cudaGetErrorString(status_)   \
                          << std::endl;                                         \
                std::exit(EXIT_FAILURE);                                        \
            }                                                                   \
        } while (0)

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // The byte count is kept in size_t; storing it in an int truncates once
    // the output exceeds 2 GiB.
    const size_t output_bytes = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    // The blocking copy also waits for previously launched kernels and
    // reports any failure they hit.
    EPILOG_CUDA_CHECK(cudaMemcpy(host_output, device_output, output_bytes, cudaMemcpyDeviceToHost));

    EPILOG_CUDA_CHECK(cudaFree(device_output));
    EPILOG_CUDA_CHECK(cudaFree(device_input));
    EPILOG_CUDA_CHECK(cudaFree(device_mask));

    #undef EPILOG_CUDA_CHECK
}


// Host function: Get Device Properties
// Prints name, compute capability, memory sizes, launch limits and warp size
// for every CUDA device to stdout. If a query fails, the error is written to
// stderr and that query is skipped; nothing is printed from uninitialized data.
__host__ void GPUInterface::get_device_properties()
{
    // Initialized so a failed query cannot leave an indeterminate loop bound.
    int deviceCount = 0;
    const cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
    if(countStatus != cudaSuccess)
    {
        std::cerr<<"cudaGetDeviceCount failed: "<<cudaGetErrorString(countStatus)<<std::endl;
        return;
    }

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        const cudaError_t propStatus = cudaGetDeviceProperties(&deviceProp, dev);
        if(propStatus != cudaSuccess)
        {
            std::cerr<<"cudaGetDeviceProperties failed for device "<<dev<<": "<<cudaGetErrorString(propStatus)<<std::endl;
            continue;
        }

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}