#include <cmath>
#include <iostream>
#include "gpu-new-forward.h"

#define TILE_WIDTH 16
#define BLOCK_SIZE 256

__global__ void fused_unroll_matmul_kernel(const float *input, const float *mask, float *output,
                                           const int Batch, const int Map_out, const int Channel,
                                           const int Height, const int Width, const int K) {
    // Calculate output dimensions and the shape of the implicitly unrolled input matrix
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int H_unroll = Channel * K * K;
    const int W_unroll = Height_out * Width_out;
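    // For each batch image, this kernel computes the GEMM  output = mask * X_unroll,
    // where mask is Map_out x (Channel*K*K) and X_unroll is (Channel*K*K) x (Height_out*Width_out).
    // X_unroll is never materialized: its elements are gathered from `input` on the fly below.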
    // Shared memory for input and mask tiles
    __shared__ float shared_mask[TILE_WIDTH][TILE_WIDTH];
    __shared__ float shared_input[TILE_WIDTH][TILE_WIDTH];

    // Calculate thread and block indices
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE_WIDTH + ty;
    const int col = blockIdx.x * TILE_WIDTH + tx;
    const int batch_idx = blockIdx.z;

    // Each thread accumulates one element of the output
    float acc = 0.0f;

    // Calculate how many tiles we need along the shared (H_unroll) dimension
    const int numTiles = (H_unroll + TILE_WIDTH - 1) / TILE_WIDTH;

    // Loop over tiles
    for (int tile = 0; tile < numTiles; tile++) {
        // Load mask tile - each thread loads one element
        if (row < Map_out && (tile * TILE_WIDTH + tx) < H_unroll) {
            shared_mask[ty][tx] = mask[row * H_unroll + tile * TILE_WIDTH + tx];
        } else {
            shared_mask[ty][tx] = 0.0f;
        }

        // Load and transform input data directly into shared memory
        if (col < W_unroll && (tile * TILE_WIDTH + ty) < H_unroll) {
            // Calculate original input indices
            int unfoldedIdx = tile * TILE_WIDTH + ty;
            int c = unfoldedIdx / (K * K);
            int pixelOffset = unfoldedIdx % (K * K);
            int kh = pixelOffset / K;
            int kw = pixelOffset % K;
            int h_out = col / Width_out;
            int w_out = col % Width_out;
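            // The row index (unfoldedIdx) selects channel c and kernel offset (kh, kw);
            // the column index selects the output pixel (h_out, w_out), so the unrolled
            // element is input[batch_idx, c, h_out + kh, w_out + kw].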
            // Load from input with transformed indices
            shared_input[ty][tx] = input[
                batch_idx * (Channel * Height * Width) +
                c * (Height * Width) +
                (h_out + kh) * Width +
                (w_out + kw)
            ];
        } else {
            shared_input[ty][tx] = 0.0f;
        }

        __syncthreads();

        // Compute partial dot product for this tile
        if (row < Map_out && col < W_unroll) {
            for (int k = 0; k < TILE_WIDTH; k++) {
                acc += shared_mask[ty][k] * shared_input[k][tx];
            }
        }

        __syncthreads();
    }

    // Write output with transformed indices to match desired format
    if (row < Map_out && col < W_unroll) {
        int h_out = col / Width_out;
        int w_out = col % Width_out;
        output[
            batch_idx * (Map_out * Height_out * Width_out) +
            row * (Height_out * Width_out) +
            h_out * Width_out +
            w_out
        ] = acc;
    }
}
__host__ void GPUInterface::conv_forward_gpu_prolog(const float *host_output, const float *host_input,
                                                    const float *host_mask, float **device_output_ptr,
                                                    float **device_input_ptr, float **device_mask_ptr,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    // Calculate buffer sizes (size_t avoids int overflow for large batches)
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t input_size = (size_t)Batch * Channel * Height * Width * sizeof(float);
    const size_t mask_size = (size_t)Map_out * Channel * K * K * sizeof(float);
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    cudaMalloc((void**)device_input_ptr, input_size);
    cudaMalloc((void**)device_mask_ptr, mask_size);
    cudaMalloc((void**)device_output_ptr, output_size);

    cudaMemcpy(*device_input_ptr, host_input, input_size, cudaMemcpyHostToDevice);
    cudaMemcpy(*device_mask_ptr, host_mask, mask_size, cudaMemcpyHostToDevice);
}
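// Note: the CUDA API calls above and below are not error-checked. A minimal
// check macro (a sketch, not part of the original interface) could look like:
//
//   #define CUDA_CHECK(call)                                                  \
//       do {                                                                  \
//           cudaError_t err_ = (call);                                        \
//           if (err_ != cudaSuccess) {                                        \
//               std::cerr << "CUDA error: " << cudaGetErrorString(err_)       \
//                         << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
//               exit(EXIT_FAILURE);                                           \
//           }                                                                 \
//       } while (0)
//
// Usage: CUDA_CHECK(cudaMalloc((void**)device_input_ptr, input_size));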
__host__ void GPUInterface::conv_forward_gpu(float *device_output, const float *device_input,
                                             const float *device_mask, const int Batch, const int Map_out,
                                             const int Channel, const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const int W_unroll = Height_out * Width_out;

    // Configure kernel launch parameters
    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 gridDim(
        (W_unroll + TILE_WIDTH - 1) / TILE_WIDTH,   // Width dimension
        (Map_out + TILE_WIDTH - 1) / TILE_WIDTH,    // Height dimension
        Batch                                       // Batch dimension
    );
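    // Example with hypothetical sizes: Height = Width = 86 and K = 7 give
    // Height_out = Width_out = 80, so W_unroll = 6400 and gridDim.x = ceil(6400 / 16) = 400;
    // with Map_out = 16, gridDim.y = ceil(16 / 16) = 1, and gridDim.z = Batch.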
    // Launch fused kernel
    fused_unroll_matmul_kernel<<<gridDim, blockDim>>>(
        device_input, device_mask, device_output,
        Batch, Map_out, Channel, Height, Width, K
    );
}
__host__ void GPUInterface::conv_forward_gpu_epilog(float *host_output, float *device_output,
                                                    float *device_input, float *device_mask,
                                                    const int Batch, const int Map_out, const int Channel,
                                                    const int Height, const int Width, const int K) {
    const int Height_out = Height - K + 1;
    const int Width_out = Width - K + 1;
    const size_t output_size = (size_t)Batch * Map_out * Height_out * Width_out * sizeof(float);

    // Copy result back to the host (this cudaMemcpy also synchronizes with the kernel launch)
    cudaMemcpy(host_output, device_output, output_size, cudaMemcpyDeviceToHost);

    cudaFree(device_output);
    cudaFree(device_input);
    cudaFree(device_mask);
}
// Host function: Retrieves and prints device properties
__host__ void GPUInterface::get_device_properties()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int dev = 0; dev < deviceCount; dev++)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        std::cout << "Device " << dev << " name: " << deviceProp.name << std::endl;
        std::cout << "Computational capabilities: " << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Max Global memory size: " << deviceProp.totalGlobalMem << std::endl;
        std::cout << "Max Constant memory size: " << deviceProp.totalConstMem << std::endl;
        std::cout << "Max Shared memory size per block: " << deviceProp.sharedMemPerBlock << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Max block dimensions: " << deviceProp.maxThreadsDim[0] << " x, "
                  << deviceProp.maxThreadsDim[1] << " y, " << deviceProp.maxThreadsDim[2]
                  << " z" << std::endl;
        std::cout << "Max grid dimensions: " << deviceProp.maxGridSize[0] << " x, "
                  << deviceProp.maxGridSize[1] << " y, " << deviceProp.maxGridSize[2]
                  << " z" << std::endl;
        std::cout << "Warp Size: " << deviceProp.warpSize << std::endl;
    }
}