Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define NUM_STREAMS 4 // Number of CUDA streams; each loop below runs once per stream
- cudaStream_t streams[NUM_STREAMS]; // one stream handle per slot, filled in by the creation loop below
- // Create CUDA streams. NOTE(review): the cudaStreamCreate return code is ignored, and no cudaStreamDestroy is visible in this fragment -- confirm the streams are released elsewhere.
- for (int i = 0; i < NUM_STREAMS; i++) {
- cudaStreamCreate(&streams[i]);
- }
- // Divide Batch into chunks and assign to streams: each iteration queues a copy-in, the compute call, and a copy-out, all tagged with streams[i].
- for (int i = 0; i < NUM_STREAMS; i++) {
- // Calculate batch_start, batch_size, offsets, etc. (placeholder: batch_start, batch_size, input_offset, input_chunk_size, output_offset and output_chunk_size are used below but their computation is not shown here)
- // Asynchronously copy input data to device. The same input_offset is applied to both the host and the device pointer. NOTE(review): cudaMemcpyAsync takes its size in bytes while pointer offsets count elements -- confirm input_chunk_size / input_offset use the right units. Presumably host_input_pinned is page-locked memory; verify at its allocation site.
- cudaMemcpyAsync(*device_input_ptr + input_offset, host_input_pinned + input_offset,
- input_chunk_size, cudaMemcpyHostToDevice, streams[i]);
- // Launch kernels in the stream. conv_forward_gpu_part is defined outside this fragment; it receives the base device pointers plus batch_start/batch_size and streams[i] -- presumably it enqueues its kernels on that stream (TODO confirm). NOTE(review): nothing here uploads *device_mask_ptr; confirm the mask is on the device before this point.
- conv_forward_gpu_part(*device_output_ptr, *device_input_ptr, *device_mask_ptr,
- batch_start, batch_size, Map_out, Channel, Height, Width, K, streams[i]);
- // Asynchronously copy output data back to host. Issued on the same streams[i] as the copy-in above, so it runs after the work already queued on that stream; the same output_offset is applied to both pointers. NOTE(review): same bytes-vs-elements question as for the input copy.
- cudaMemcpyAsync(host_output_pinned + output_offset, *device_output_ptr + output_offset,
- output_chunk_size, cudaMemcpyDeviceToHost, streams[i]);
- }
- // Synchronize all streams: the host blocks on each stream in turn until its queued work has finished. NOTE(review): return codes are ignored, so a failure in any earlier asynchronous operation would go unreported here.
- for (int i = 0; i < NUM_STREAMS; i++) {
- cudaStreamSynchronize(streams[i]);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement