#include <cmath>
#include <iostream>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 256

__global__ void fused_unroll_matmul_kernel(const float *input, const float *mask, float *output,
                                           const int Batch, const int Map_out, const int Channel,
                                           const int Height, const int Width, const int K) {
    // Calculate output dimensions and the shape of the implicitly unrolled input matrix
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int H_unroll = Channel * K * K;
    const int W_unroll = Height_out * Width_out;
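    // For each batch image, this kernel computes the GEMM  output = mask * X_unroll,
    // where mask is Map_out x (Channel*K*K) and X_unroll is (Channel*K*K) x (Height_out*Width_out).
    // X_unroll is never materialized: its elements are gathered from `input` on the fly below.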
    // Shared memory for input and mask tiles
    __shared__ float shared_mask[TILE_WIDTH][TILE_WIDTH];
    __shared__ float shared_input[TILE_WIDTH][TILE_WIDTH];

    // Calculate thread and block indices
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE_WIDTH + ty;
    const int col = blockIdx.x * TILE_WIDTH + tx;
    const int batch_idx = blockIdx.z;

    // Each thread accumulates one element of the output
    float acc = 0.0f;

    // Calculate how many tiles we need along the shared (H_unroll) dimension
    const int numTiles = (H_unroll + TILE_WIDTH - 1) / TILE_WIDTH;

    // Loop over tiles
    for (int tile = 0; tile < numTiles; tile++) {
        // Load mask tile - each thread loads one element
        if (row < Map_out && (tile * TILE_WIDTH + tx) < H_unroll) {
            shared_mask[ty][tx] = mask[row * H_unroll + tile * TILE_WIDTH + tx];
        } else {
            shared_mask[ty][tx] = 0.0f;
        }

        // Load and transform input data directly into shared memory
        if (col < W_unroll && (tile * TILE_WIDTH + ty) < H_unroll) {
            // Calculate original input indices
            int unfoldedIdx = tile * TILE_WIDTH + ty;
            int c = unfoldedIdx / (K * K);
            int pixelOffset = unfoldedIdx % (K * K);
            int kh = pixelOffset / K;
            int kw = pixelOffset % K;
            int h_out = col / Width_out;
            int w_out = col % Width_out;
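            // The row index (unfoldedIdx) selects channel c and kernel offset (kh, kw);
            // the column index selects the output pixel (h_out, w_out), so the unrolled
            // element is input[batch_idx, c, h_out + kh, w_out + kw].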
            // Load from input with transformed indices
            shared_input[ty][tx] = input[
                batch_idx * (Channel * Height * Width) +
                c * (Height * Width) +
                (h_out + kh) * Width +
                (w_out + kw)
            ];
        } else {
            shared_input[ty][tx] = 0.0f;
        }

        __syncthreads();

        // Compute partial dot product for this tile
        if (row < Map_out && col < W_unroll) {
            for (int k = 0; k < TILE_WIDTH; k++) {
                acc += shared_mask[ty][k] * shared_input[k][tx];
            }
        }

        __syncthreads();
    }

    // Write output with transformed indices to match desired format
    if (row < Map_out && col < W_unroll) {
        int h_out = col / Width_out;
        int w_out = col % Width_out;
        output[
            batch_idx * (Map_out * Height_out * Width_out) +
            row * (Height_out * Width_out) +
            h_out * Width_out +
            w_out
        ] = acc;
    }
}
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
                                                    const float *host_mask, float **device_output_ptr,
                                                    float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    // Calculate buffer sizes (size_t avoids int overflow for large batches)
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t input_size = (size_t)Batch * Channel * Height * Width * sizeof(float);
    const size_t mask_size = (size_t)Map_out * Channel * K * K * sizeof(float);
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    cudaMalloc((void**)device_input_ptr, input_size);
    cudaMalloc((void**)device_mask_ptr, mask_size);
    cudaMalloc((void**)device_output_ptr, output_size);

    cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
    cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
}
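// Note: the CUDA API calls above and below are not error-checked. A minimal
// check macro (a sketch, not part of the original interface) could look like:
//
//   #define CUDA_CHECK(call)                                                  \
//       do {                                                                  \
//           cudaError_t err_ = (call);                                        \
//           if (err_ != cudaSuccess) {                                        \
//               std::cerr << "CUDA error: " << cudaGetErrorString(err_)       \
//                         << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
//               exit(EXIT_FAILURE);                                           \
//           }                                                                 \
//       } while (0)
//
// Usage: CUDA_CHECK(cudaMalloc((void**)device_input_ptr, input_size));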
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
                                             const float *device_mask, const int Batch, const int Map_out,
                                             const int Channel, const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int W_unroll = Height_out * Width_out;

    // Configure kernel launch parameters
    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 gridDim(
        (W_unroll + TILE_WIDTH - 1) / TILE_WIDTH,   // Width dimension
        (Map_out + TILE_WIDTH - 1) / TILE_WIDTH,    // Height dimension
        Batch                                       // Batch dimension
    );
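    // Example with hypothetical sizes: Height = Width = 86 and K = 7 give
    // Height_out = Width_out = 80, so W_unroll = 6400 and gridDim.x = ceil(6400 / 16) = 400;
    // with Map_out = 16, gridDim.y = ceil(16 / 16) = 1, and gridDim.z = Batch.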
    // Launch fused kernel
    fused_unroll_matmul_kernel<<<gridDim, blockDim>>>(
        device_input, device_mask, device_output,
        Batch, Map_out, Channel, Height, Width, K
    );
}
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
                                                    float *device_input, float *device_mask,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy result back to the host (this cudaMemcpy also synchronizes with the kernel launch)
    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);
}
// Host function: Retrieves and prints device properties
__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout << "Device " << dev << " name: " << deviceProp.name << std::endl;
        std::cout << "Computational capabilities: " << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Max Global memory size: " << deviceProp.totalGlobalMem << std::endl;
        std::cout << "Max Constant memory size: " << deviceProp.totalConstMem << std::endl;
        std::cout << "Max Shared memory size per block: " << deviceProp.sharedMemPerBlock << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Max block dimensions: " << deviceProp.maxThreadsDim[0] << " x, "
                  << deviceProp.maxThreadsDim[1] << " y, " << deviceProp.maxThreadsDim[2]
                  << " z" << std::endl;
        std::cout << "Max grid dimensions: " << deviceProp.maxGridSize[0] << " x, "
                  << deviceProp.maxGridSize[1] << " y, " << deviceProp.maxGridSize[2]
                  << " z" << std::endl;
        std::cout << "Warp Size: " << deviceProp.warpSize << std::endl;
    }
}