#include <cmath>
#include <cstring>    // memcpy
#include <algorithm>  // std::min
#include <iostream>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 512
#define NUM_STREAMS 4 // Number of CUDA streams

// Global variables: pinned host staging buffers and the CUDA streams
float *host_input_pinned;
float *host_output_pinned;
cudaStream_t streams[NUM_STREAMS];
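// matrix_unrolling_kernel performs the im2col step: for each image in the current
// chunk it expands every (Channel x K x K) input patch into one column of the
// unrolled matrix, so the convolution can be computed as a single matrix multiply.
// One thread handles one (channel, output-pixel) pair and writes that patch's
// K*K values.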
__global__ void matrix_unrolling_kernel(const float *input, float *output,
                                        const int Batch, const int Channel,
                                        const int Height, const int Width,
                                        const int K, const int batch_start) {
    #define in_4d(i3, i2, i1, i0) input[(i3) * (Channel * Height * Width) + (i2) * (Height * Width) + (i1) * (Width) + i0]
    #define out_3d(i1, i0) output[(i1) * (Batch * W_unroll) + i0]

    const size_t Height_out = Height - K + 1;
    const size_t Width_out = Width - K + 1;
    const size_t W_unroll = Height_out * Width_out;
    const size_t H_unroll = Channel * K * K;

    const size_t c = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t hw_pos = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t batch_idx = batch_start + blockIdx.z;

    const size_t h_out = hw_pos / Width_out;
    const size_t w_out = hw_pos % Width_out;

    if (c >= Channel || h_out >= Height_out || w_out >= Width_out || batch_idx >= (batch_start + Batch)) {
        return;
    }

    const size_t w_unroll = h_out * Width_out + w_out;
    const size_t w_total_unroll = (batch_idx - batch_start) * W_unroll + w_unroll;
    const size_t w_base = c * K * K;

    for (int p = 0; p < K; p++) {
        for (int q = 0; q < K; q++) {
            int h_unroll = w_base + p * K + q;
            out_3d(h_unroll, w_total_unroll) = in_4d(batch_idx, c, h_out + p, w_out + q);
        }
    }

    #undef in_4d
    #undef out_3d
}
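// matrixMultiplyShared is a standard tiled matrix multiply (C = A * B) using
// TILE_WIDTH x TILE_WIDTH shared-memory tiles. Here A is the mask viewed as a
// (Map_out x Channel*K*K) matrix and B is the unrolled input, so C holds the
// chunk's convolution outputs in (Map_out x Batch*Height_out*Width_out) layout.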
__global__ void matrixMultiplyShared(const float *A, const float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns)
{
    __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

    int by = blockIdx.y, bx = blockIdx.x, ty = threadIdx.y, tx = threadIdx.x;
    int row = by * TILE_WIDTH + ty, col = bx * TILE_WIDTH + tx;
    float val = 0;

    for (int tileId = 0; tileId < (numAColumns - 1) / TILE_WIDTH + 1; tileId++) {
        // Load one tile of A and one tile of B, zero-padding out-of-range elements
        if (row < numARows && tileId * TILE_WIDTH + tx < numAColumns) {
            tileA[ty][tx] = A[(size_t) row * numAColumns + tileId * TILE_WIDTH + tx];
        } else {
            tileA[ty][tx] = 0;
        }
        if (col < numBColumns && tileId * TILE_WIDTH + ty < numBRows) {
            tileB[ty][tx] = B[((size_t) tileId * TILE_WIDTH + ty) * numBColumns + col];
        } else {
            tileB[ty][tx] = 0;
        }
        __syncthreads();

        if (row < numCRows && col < numCColumns) {
            for (int i = 0; i < TILE_WIDTH; i++) {
                val += tileA[ty][i] * tileB[i][tx];
            }
        }
        __syncthreads();
    }

    if (row < numCRows && col < numCColumns) {
        // size_t cast matches the loads above and avoids int overflow for large C
        C[(size_t) row * numCColumns + col] = val;
    }
}
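// matrix_permute_kernel rearranges the matmul result from the GEMM-friendly
// (Map_out, Batch * image_size) layout into the (Batch, Map_out, image_size)
// layout of the output tensor, writing each image of the chunk to its absolute
// position in the full output buffer.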
__global__ void matrix_permute_kernel(const float *input, float *output, int Map_out,
                                      int Batch, int image_size, int batch_start) {
    int b = batch_start + blockIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    if (x < image_size) {
        for (int m = 0; m < Map_out; m++) {
            output[b * Map_out * image_size + m * image_size + x] =
                input[m * Batch * image_size + (b - batch_start) * image_size + x];
        }
    }
}
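// conv_forward_gpu_part runs the full im2col -> GEMM -> permute pipeline for one
// chunk of the batch on the given stream: it unrolls the chunk's input, multiplies
// it by the mask, and permutes the result into the output tensor layout.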
__host__ void conv_forward_gpu_part(float *device_output, const float *device_input, const float *device_mask,
                                    const int batch_start, const int batch_size,
                                    const int Map_out, const int Channel, const int Height, const int Width, const int K,
                                    cudaStream_t stream)
{
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int Height_unrolled = Channel * K * K;
    const int Width_unrolled = batch_size * Height_out * Width_out;

    // Allocate temporary storage for the unrolled matrix and the matmul output.
    // Note: cudaMalloc/cudaFree are not stream-ordered and implicitly synchronize,
    // which limits how much the streams can actually overlap.
    float *unrolled_matrix;
    float *matmul_output;
    cudaMalloc((void**)&unrolled_matrix, (size_t) Height_unrolled * Width_unrolled * sizeof(float));
    cudaMalloc((void**)&matmul_output, (size_t) Map_out * Width_unrolled * sizeof(float));

    // Launch matrix unrolling kernel
    dim3 blockDim(16, 16, 1);
    dim3 gridDim(
        (Channel + blockDim.x - 1) / blockDim.x,
        (Height_out * Width_out + blockDim.y - 1) / blockDim.y,
        batch_size
    );
    matrix_unrolling_kernel<<<gridDim, blockDim, 0, stream>>>(
        device_input, unrolled_matrix,
        batch_size, Channel, Height, Width, K, batch_start
    );

    // Launch matrix multiplication kernel
    dim3 dimGrid((Width_unrolled - 1) / TILE_WIDTH + 1, (Map_out - 1) / TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    matrixMultiplyShared<<<dimGrid, dimBlock, 0, stream>>>(
        device_mask, unrolled_matrix, matmul_output,
        Map_out, Height_unrolled,
        Height_unrolled, Width_unrolled,
        Map_out, Width_unrolled
    );

    // Launch matrix permute kernel
    const int out_image_size = Height_out * Width_out;
    dim3 permute_grid_dim((out_image_size - 1) / BLOCK_SIZE + 1, batch_size, 1);
    matrix_permute_kernel<<<permute_grid_dim, BLOCK_SIZE, 0, stream>>>(
        matmul_output, device_output, Map_out, batch_size, out_image_size, batch_start
    );

    // Free temporary device memory (cudaFree blocks until the device work that
    // uses these buffers has completed)
    cudaFree(unrolled_matrix);
    cudaFree(matmul_output);
}
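// The prolog stages the input into pinned host memory, allocates the device
// buffers, and splits the batch across NUM_STREAMS streams so that the
// host-to-device copy, the per-chunk kernels, and the device-to-host copy of
// different chunks can overlap.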
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input, const float *host_mask,
                                                    float **device_output_ptr, float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel, const int Height, const int Width, const int K)
{
    // Calculate sizes (size_t avoids int overflow for large batches)
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t input_size = (size_t) Batch * Channel * Height * Width * sizeof(float);
    const size_t output_size = (size_t) Batch * Map_out * Height_out * Width_out * sizeof(float);
    const size_t mask_size = (size_t) Map_out * Channel * K * K * sizeof(float);

    // Allocate pinned host memory and stage the input so async copies can be used
    cudaMallocHost((void**)&host_input_pinned, input_size);
    cudaMallocHost((void**)&host_output_pinned, output_size);
    memcpy(host_input_pinned, host_input, input_size);

    // Allocate device memory
    cudaMalloc((void**)device_input_ptr, input_size);
    cudaMalloc((void**)device_output_ptr, output_size);
    cudaMalloc((void**)device_mask_ptr, mask_size);

    // Copy mask to device
    cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);

    // Create CUDA streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamCreate(&streams[i]);
    }

    // Divide the batch into chunks, one per stream
    int chunk_size = (Batch + NUM_STREAMS - 1) / NUM_STREAMS;
    for (int i = 0; i < NUM_STREAMS; i++) {
        int batch_start = i * chunk_size;
        int batch_end = std::min(batch_start + chunk_size, Batch);
        int batch_size = batch_end - batch_start;
        if (batch_size > 0) {
            size_t input_offset = (size_t) batch_start * Channel * Height * Width;
            size_t output_offset = (size_t) batch_start * Map_out * Height_out * Width_out;
            size_t input_chunk_size = (size_t) batch_size * Channel * Height * Width * sizeof(float);
            size_t output_chunk_size = (size_t) batch_size * Map_out * Height_out * Width_out * sizeof(float);

            // Asynchronously copy this chunk's input data to the device
            cudaMemcpyAsync(*device_input_ptr + input_offset, host_input_pinned + input_offset,
                            input_chunk_size, cudaMemcpyHostToDevice, streams[i]);

            // Launch the kernels for this chunk in the same stream
            conv_forward_gpu_part(*device_output_ptr, *device_input_ptr, *device_mask_ptr,
                                  batch_start, batch_size, Map_out, Channel, Height, Width, K, streams[i]);

            // Asynchronously copy this chunk's output back to pinned host memory
            cudaMemcpyAsync(host_output_pinned + output_offset, *device_output_ptr + output_offset,
                            output_chunk_size, cudaMemcpyDeviceToHost, streams[i]);
        }
    }

    // Synchronize all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamSynchronize(streams[i]);
    }

    // Copy the output from pinned memory to the original host_output buffer
    memcpy((void*)host_output, host_output_pinned, output_size);
}
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input, const float *device_mask,
                                             const int Batch, const int Map_out, const int Channel,
                                             const int Height, const int Width, const int K)
{
    // This function is now handled within conv_forward_gpu_prolog
}
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output, float *device_input,
                                                    float *device_mask, const int Batch, const int Map_out,
                                                    const int Channel, const int Height, const int Width, const int K)
{
    // Free device memory
    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);

    // Free pinned host memory
    cudaFreeHost(host_input_pinned);
    cudaFreeHost(host_output_pinned);

    // Destroy streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStreamDestroy(streams[i]);
    }
}
__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout << "Device " << dev << " name: " << deviceProp.name << std::endl;
        std::cout << "Computational capabilities: " << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Max Global memory size: " << deviceProp.totalGlobalMem << std::endl;
        std::cout << "Max Constant memory size: " << deviceProp.totalConstMem << std::endl;
        std::cout << "Max Shared memory size per block: " << deviceProp.sharedMemPerBlock << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Max block dimensions: " << deviceProp.maxThreadsDim[0] << " x, " << deviceProp.maxThreadsDim[1] << " y, " << deviceProp.maxThreadsDim[2] << " z" << std::endl;
        std::cout << "Max grid dimensions: " << deviceProp.maxGridSize[0] << " x, " << deviceProp.maxGridSize[1] << " y, " << deviceProp.maxGridSize[2] << " z" << std::endl;
        std::cout << "Warp Size: " << deviceProp.warpSize << std::endl;
    }
}
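// A minimal usage sketch (hypothetical; the real calling code lives in the
// surrounding framework, not in this file). It assumes host_out / host_in /
// host_mask are host buffers of the sizes implied by (Batch, Map_out, Channel,
// Height, Width, K) and simply chains the three GPUInterface entry points:
//
//   GPUInterface gpu;
//   float *d_out, *d_in, *d_mask;
//   gpu.conv_forward_gpu_prolog(host_out, host_in, host_mask,
//                               &d_out, &d_in, &d_mask,
//                               Batch, Map_out, Channel, Height, Width, K);
//   gpu.conv_forward_gpu(d_out, d_in, d_mask,
//                        Batch, Map_out, Channel, Height, Width, K);   // no-op here
//   gpu.conv_forward_gpu_epilog(host_out, d_out, d_in, d_mask,
//                               Batch, Map_out, Channel, Height, Width, K);
//
// With this streamed version, the convolution result is already in host_out
// after the prolog returns; the epilog only releases device and pinned memory.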