cublas_1

#include <cmath>
#include <iostream>
#include <cublas_v2.h>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512

// Error checking macro for cuBLAS
#define CUBLAS_CHECK(call) \
    do { \
        cublasStatus_t status = call; \
        if (status != CUBLAS_STATUS_SUCCESS) { \
            std::cerr << "cuBLAS error at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while(0)

// The matrix unrolling kernel remains the same as it's already optimized
__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                      const int Batch, const int Channel,
                                      const int Height, const int Width,
                                      const int K) {
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_2d(i1, i0) output[(i0) * (Channel * K * K) + i1]  // Fixed column-major indexing

    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Width_unrolled = Batch * Height_out * Width_out;

    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const int batch_idx = blockIdx.z * blockDim.z + threadIdx.z;

    if (c >= Channel || hw_pos >= (Height_out * Width_out) || batch_idx >= Batch) return;

    const int h_out = hw_pos / Width_out;
    const int w_out = hw_pos % Width_out;
    const int w_unroll = batch_idx * Height_out * Width_out + h_out * Width_out + w_out;

    // Write in column-major format for cuBLAS
    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            const int h_unroll = c * K * K + p * K + q;
            out_2d(h_unroll, w_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_2d
}

// The permute kernel remains the same
__global__ void matrix_permute_kernel(const float *input, float *output,
                                    const int Map_out, const int Batch,
                                    const int Height_out, const int Width_out) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int total_elements = Batch * Height_out * Width_out;

    if (tid < total_elements) {
        const int b = tid / (Height_out * Width_out);
        const int h = (tid / Width_out) % Height_out;
        const int w = tid % Width_out;

        // Fixed indexing for correct output layout
        #define in_2d(i1, i0) input[(i0) * Map_out + i1]
        #define out_4d(n, c, h, w) output[(n) * (Map_out * Height_out * Width_out) + (c) * (Height_out * Width_out) + (h) * Width_out + w]

        for (int m = 0; m < Map_out; m++) {
            out_4d(b, m, h, w) = in_2d(m, tid);
        }

        #undef in_2d
        #undef out_4d
    }
}

class CuBLASHandler {
private:
    static cublasHandle_t handle;
    static bool initialized;

public:
    static cublasHandle_t& getHandle() {
        if (!initialized) {
            CUBLAS_CHECK(cublasCreate(&handle));
            initialized = true;
        }
        return handle;
    }

    static void destroy() {
        if (initialized) {
            CUBLAS_CHECK(cublasDestroy(handle));
            initialized = false;
        }
    }
};


cublasHandle_t CuBLASHandler::handle;
bool CuBLASHandler::initialized = false;

__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
    const float *host_mask, float **device_output_ptr, float **device_input_ptr,
    float **device_mask_ptr, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;

    cudaMalloc((void**)device_input_ptr, Batch * Channel * Height * Width * sizeof(float));
    cudaMalloc((void**)device_mask_ptr, Map_out * Channel * K * K * sizeof(float));
    cudaMalloc((void**)device_output_ptr, Batch * Map_out * Height_out * Width_out * sizeof(float));

    cudaMemcpy(*device_input_ptr, host_input, Batch * Channel * Height * Width * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(*device_mask_ptr, host_mask, Map_out * Channel * K * K * sizeof(float), cudaMemcpyHostToDevice);
}

__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
    const float *device_mask, const int Batch, const int Map_out, const int Channel,
    const int Height, const int Width, const int K)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = Batch * Height_out * Width_out;

    float *unrolled_matrix, *matmul_output;
    cudaMalloc((void**)&unrolled_matrix, (size_t)Height_unrolled * Width_unrolled * sizeof(float));
    cudaMalloc((void**)&matmul_output, (size_t)Map_out * Width_unrolled * sizeof(float));

    // Unroll input matrix
    dim3 blockDim(4, 256, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,
        (Batch + blockDim.z - 1) / blockDim.z
    );

    matrix_unrolling_kernel<<<gridDim, blockDim>>>(
        device_input, unrolled_matrix,
        Batch, Channel, Height, Width, K
    );

    // Perform matrix multiplication
    cublasHandle_t handle = CuBLASHandler::getHandle();
    float alpha = 1.0f, beta = 0.0f;

    // Fixed cuBLAS call with correct dimensions and leading dimensions
    CUBLAS_CHECK(cublasSgemm(
        handle,
        CUBLAS_OP_T,    // Transpose mask matrix
        CUBLAS_OP_N,    // No transpose for unrolled matrix
        Map_out,        // M: rows of op(A)
        Width_unrolled, // N: cols of op(B)
        Height_unrolled,// K: cols of op(A)
        &alpha,
        device_mask,    // A matrix
        Height_unrolled,// LDA: leading dimension of A
        unrolled_matrix,// B matrix
        Height_unrolled,// LDB: leading dimension of B
        &beta,
        matmul_output,  // C matrix
        Map_out         // LDC: leading dimension of C
    ));

    // Reshape output
    const int total_threads = Batch * Height_out * Width_out;
    const int num_blocks = (total_threads + BLOCK_SIZE - 1) / BLOCK_SIZE;

    matrix_permute_kernel<<<num_blocks, BLOCK_SIZE>>>(
        matmul_output, device_output,
        Map_out, Batch, Height_out, Width_out
    );

    cudaFree(matmul_output);
    cudaFree(unrolled_matrix);
}

__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
    float *device_input, float *device_mask, const int Batch, const int Map_out,
    const int Channel, const int Height, const int Width, const int K)
{
    // Calculate output size
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int output_size = Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy result back to host
    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);

    // Clean up cuBLAS
    CuBLASHandler::destroy();
}

__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for(int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout<<"Device "<<dev<<" name: "<<deviceProp.name<<std::endl;
        std::cout<<"Computational capabilities: "<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout<<"Max Global memory size: "<<deviceProp.totalGlobalMem<<std::endl;
        std::cout<<"Max Constant memory size: "<<deviceProp.totalConstMem<<std::endl;
        std::cout<<"Max Shared memory size per block: "<<deviceProp.sharedMemPerBlock<<std::endl;
        std::cout<<"Max threads per block: "<<deviceProp.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimensions: "<<deviceProp.maxThreadsDim[0]<<" x, "<<deviceProp.maxThreadsDim[1]<<" y, "<<deviceProp.maxThreadsDim[2]<<" z"<<std::endl;
        std::cout<<"Max grid dimensions: "<<deviceProp.maxGridSize[0]<<" x, "<<deviceProp.maxGridSize[1]<<" y, "<<deviceProp.maxGridSize[2]<<" z"<<std::endl;
        std::cout<<"Warp Size: "<<deviceProp.warpSize<<std::endl;
    }
}