temp

#include <cstdio>
#include <cstdlib>

#define NUM_STREAMS 4 // Number of CUDA streams

// Report a failed CUDA runtime call with its source location, then abort.
// Guarded with #ifndef so an already-defined project-wide CUDA_CHECK wins.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                         __LINE__, cudaGetErrorString(cuda_check_err_));      \
            std::abort();                                                     \
        }                                                                     \
    } while (0)
#endif

cudaStream_t streams[NUM_STREAMS];

// Create CUDA streams. A failed creation is fatal: every later call in this
// snippet would otherwise be issued on an invalid stream handle.
for (int i = 0; i < NUM_STREAMS; i++) {
    CUDA_CHECK(cudaStreamCreate(&streams[i]));
}

// Divide Batch into chunks and assign to streams.
// Each iteration enqueues copy-in -> compute -> copy-out on streams[i]; the
// three operations run in issue order within that stream.
for (int i = 0; i < NUM_STREAMS; i++) {
    // Calculate batch_start, batch_size, offsets, etc.
    // NOTE(review): input_offset / output_offset are added to the buffers via
    // pointer arithmetic (element units), while input_chunk_size /
    // output_chunk_size are passed as cudaMemcpyAsync byte counts. Confirm the
    // calculation above produces them in those units.

    // Asynchronously copy input data to device.
    // NOTE(review): the *_pinned names suggest page-locked host buffers, which
    // the copies need in order to be truly asynchronous — confirm at the
    // allocation site.
    CUDA_CHECK(cudaMemcpyAsync(*device_input_ptr + input_offset, host_input_pinned + input_offset,
                               input_chunk_size, cudaMemcpyHostToDevice, streams[i]));

    // Launch kernels in the stream
    conv_forward_gpu_part(*device_output_ptr, *device_input_ptr, *device_mask_ptr,
                          batch_start, batch_size, Map_out, Channel, Height, Width, K, streams[i]);
    // Kernel launches do not return a status; a bad launch configuration is
    // only visible through the runtime's last-error state, so poll it here.
    CUDA_CHECK(cudaGetLastError());

    // Asynchronously copy output data back to host
    CUDA_CHECK(cudaMemcpyAsync(host_output_pinned + output_offset, *device_output_ptr + output_offset,
                               output_chunk_size, cudaMemcpyDeviceToHost, streams[i]));
}

// Synchronize all streams. Errors raised while the queued work executed are
// reported by these calls, so their return codes must not be dropped.
for (int i = 0; i < NUM_STREAMS; i++) {
    CUDA_CHECK(cudaStreamSynchronize(streams[i]));
}
// NOTE(review): no cudaStreamDestroy is visible in this snippet; release the
// streams once nothing else needs them.