cublas_batch_optimization
phystota, Nov 28th, 2024
#include <cmath>
#include <iostream>
#include <cublas_v2.h>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512

__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K) {
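    // One thread per (channel, output pixel, batch) triple: each thread copies the
    // K*K input values that contribute to its output pixel into column
    // (h_out * Width_out + w_out) of that batch's unrolled matrix.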
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define unroll_3d(i2, i1, i0) output[(i2) * (Height_unroll * Width_unroll) + (i1) * Width_unroll + i0]

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unroll = Channel * K * K;
    const int Width_unroll = Height_out * Width_out;

    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const int batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    if (c >= Channel || hw_pos >= Height_out * Width_out || batch_idx >= Batch) {
        return;
    }

    const int h_out = hw_pos / Width_out;
    const int w_out = hw_pos % Width_out;
    const int w_base = c * K * K;

    // Unroll the input into a matrix suitable for batched matrix multiplication
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            unroll_3d(batch_idx, h_unroll, h_out * Width_out + w_out) =
                in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef unroll_3d
}

__global__ void matrix_permute_kernel(const float *input, float *output,
                                      const int Map_out, const int Batch,
                                      const int Height_out, const int Width_out) {
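    // The strided batched GEMM in conv_forward_gpu already produces each batch's result
    // as a row-major (Map_out x Height_out*Width_out) matrix, which is the final output
    // layout, so this kernel is effectively a straight copy into device_output.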
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int b = blockIdx.y;

    const int image_size = Height_out * Width_out;

    if (idx < image_size && b < Batch) {
        for (int m = 0; m < Map_out; m++) {
            output[b * Map_out * image_size + m * image_size + idx] =
                input[b * Map_out * image_size + m * image_size + idx];
        }
    }
}

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
    const float *host_mask, float **device_output_ptr, float **device_input_ptr,
    float **device_mask_ptr, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    // Use size_t so large batches do not overflow 32-bit byte counts
    const size_t input_size = (size_t)Batch * Channel * Height * Width * sizeof(float);
    const size_t mask_size = (size_t)Map_out * Channel * K * K * sizeof(float);
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Allocate device memory
    cudaMalloc((void**)device_input_ptr, input_size);
    cudaMalloc((void**)device_mask_ptr, mask_size);
    cudaMalloc((void**)device_output_ptr, output_size);

    // Copy the input and mask to the device; the output stays on the device until the epilog
    cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
    cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
}

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
    const float *device_mask, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    // Calculate dimensions
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unroll = Channel * K * K;
    const int Width_unroll = Height_out * Width_out;

    // Allocate memory for intermediate results
    float *unrolled_input = nullptr;
    float *matmul_output = nullptr;

    const size_t unrolled_size = (size_t)Batch * Height_unroll * Width_unroll * sizeof(float);
    const size_t matmul_size = (size_t)Batch * Map_out * Width_unroll * sizeof(float);

    cudaMalloc(&unrolled_input, unrolled_size);
    cudaMalloc(&matmul_output, matmul_size);

    // Configure and launch unrolling kernel
    dim3 blockDim(8, 16, 4);  // 8 * 16 * 4 = 512 threads per block
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        ((Height_out * Width_out) + blockDim.y - 1) / blockDim.y,
        (Batch + blockDim.z - 1) / blockDim.z
    );

    matrix_unrolling_kernel<<<gridDim, blockDim>>>(
        device_input, unrolled_input,
        Batch, Channel, Height, Width, K
    );

    // Create and configure cuBLAS handle
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set up parameters for strided batched GEMM
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Calculate strides for batched operation (64-bit arithmetic to avoid int overflow)
    const long long int strideA = 0;  // Mask is shared across batches
    const long long int strideB = (long long int)Height_unroll * Width_unroll;  // Stride between unrolled input matrices
    const long long int strideC = (long long int)Map_out * Width_unroll;        // Stride between output matrices

    // Perform batched matrix multiplication
    // Note: cuBLAS uses column-major order, so we transpose the operation
    // C[b] = A * B[b], where b is the batch index
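    // Because cuBLAS expects column-major storage while these buffers are row-major,
    // the call below computes C^T = B^T * A^T: the unrolled input goes in cuBLAS's
    // "A" slot and the mask in its "B" slot, so each per-batch result lands in memory
    // as a row-major (Map_out x Width_unroll) matrix with no explicit transpose needed.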
    cublasSgemmStridedBatched(handle,
        CUBLAS_OP_N,    // no transpose of the unrolled input (passed in cuBLAS's A slot)
        CUBLAS_OP_N,    // no transpose of the mask (passed in cuBLAS's B slot)
        Width_unroll,   // m: rows of C in column-major view (= Height_out * Width_out)
        Map_out,        // n: columns of C in column-major view (= output feature maps)
        Height_unroll,  // k: shared inner dimension (= Channel * K * K)
        &alpha,
        unrolled_input, Width_unroll, strideB,  // per-batch unrolled input matrices
        device_mask, Height_unroll, strideA,    // mask, stride 0: shared across the batch
        &beta,
        matmul_output, Width_unroll, strideC,   // per-batch output matrices
        Batch                                   // number of GEMMs in the batch
    );
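    // cublasSgemmStridedBatched returns a cublasStatus_t; comparing it against
    // CUBLAS_STATUS_SUCCESS here would surface dimension or handle errors at the call site.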

    // Configure and launch permute kernel
    dim3 permute_block(BLOCK_SIZE);
    dim3 permute_grid((Height_out * Width_out + BLOCK_SIZE - 1) / BLOCK_SIZE, Batch);

    matrix_permute_kernel<<<permute_grid, permute_block>>>(
        matmul_output, device_output,
        Map_out, Batch, Height_out, Width_out
    );

    // Cleanup
    cublasDestroy(handle);
    cudaFree(unrolled_input);
    cudaFree(matmul_output);
}

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
    float *device_input, float *device_mask, const int Batch, const int Map_out,
    const int Channel, const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy result back to host
    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}
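
// Illustrative sketch of how a host-side driver might chain the three entry points above.
// The run_forward name and the tensor sizes below are hypothetical (not part of the
// original file); they only show the prolog -> forward -> epilog calling order.
// Kept inside a block comment so it does not affect the compiled translation unit;
// it would additionally require #include <vector>.
/*
void run_forward() {
    const int Batch = 100, Map_out = 4, Channel = 1, Height = 86, Width = 86, K = 7;
    const int Height_out = Height - K + 1, Width_out = Width - K + 1;

    std::vector<float> host_input((size_t)Batch * Channel * Height * Width);
    std::vector<float> host_mask((size_t)Map_out * Channel * K * K);
    std::vector<float> host_output((size_t)Batch * Map_out * Height_out * Width_out);

    float *device_output = nullptr, *device_input = nullptr, *device_mask = nullptr;
    GPUInterface gpu;

    gpu.conv_forward_gpu_prolog(host_output.data(), host_input.data(), host_mask.data(),
                                &device_output, &device_input, &device_mask,
                                Batch, Map_out, Channel, Height, Width, K);
    gpu.conv_forward_gpu(device_output, device_input, device_mask,
                         Batch, Map_out, Channel, Height, Width, K);
    gpu.conv_forward_gpu_epilog(host_output.data(), device_output, device_input, device_mask,
                                Batch, Map_out, Channel, Height, Width, K);
}
*/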