Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- __device__ float produce(int iteration, unsigned idx); // Declaration only (body not shown here). The kernel stores the returned value in slot idx of the shared buffer it fills for this iteration.
- __device__ void consume(int iteration, unsigned idx, float value); // Declaration only (body not shown here). The kernel calls this with the value it previously stored in slot idx for this iteration.
/*
 * Block-local producer/consumer pipeline over a double-buffered shared array.
 *
 * For every pass i in [0, iterations) the block calls produce(i, idx) for each
 * idx in [0, bufsize) and later hands the stored value to
 * consume(i, idx, value). While pass i is being produced into one half of the
 * buffer, pass i - 1 is consumed from the other half.
 *
 * Launch expectations:
 *  - Only threadIdx.x / blockDim.x are used, so a 1-D thread block is assumed.
 *  - Static shared memory: 2 * bufsize floats.
 *  - blockDim.x >  32: threads [0, 32) produce, the remaining threads consume.
 *  - blockDim.x <= 32: there are no threads to spare for a consumer group, so
 *    every thread both produces and consumes (previously no thread consumed
 *    and, for blockDim.x < 32, part of the buffer was never written).
 *  - iterations <= 0: returns without calling produce() or consume().
 */
__global__ void kernel(int iterations)
{
    /*
     * Tuning parameters: number of samples to buffer and the upper bound on
     * the number of producer threads.
     */
    constexpr unsigned bufsize = 128;
    constexpr unsigned max_producers = 32;

    /*
     * Nothing to do. The condition is uniform across the block, so returning
     * here cannot leave other threads waiting at a barrier.
     */
    if(iterations <= 0)
        return;

    /*
     * Clamp the producer group to the block size so the subtraction below
     * cannot wrap around for small blocks.
     */
    const unsigned producer_count =
        blockDim.x < max_producers ? blockDim.x : max_producers;
    const unsigned consumer_count = blockDim.x - producer_count;

    /*
     * Without dedicated consumers every thread takes both roles and the
     * consume loop strides over the whole block.
     */
    const bool shared_roles = (consumer_count == 0);
    const bool is_producer = threadIdx.x < producer_count;
    const bool is_consumer = shared_roles || !is_producer;
    const unsigned consume_first =
        shared_roles ? threadIdx.x : threadIdx.x - producer_count;
    const unsigned consume_stride = shared_roles ? blockDim.x : consumer_count;

    /*
     * Simple double-buffer setup to make a producer-consumer pipeline
     */
    __shared__ float buf[2][bufsize];

    /*
     * Fill first buffer. This can use all threads
     */
    for(unsigned i = threadIdx.x; i < bufsize; i += blockDim.x)
        buf[0][i] = produce(0, i);

    for(int i = 1; i < iterations; ++i) {
        /*
         * Makes the previous pass' writes visible and guarantees the buffer
         * about to be overwritten is no longer being read.
         */
        __syncthreads();
        const int cur_buf = i & 1;
        const int last_buf = cur_buf ^ 1;
        if(is_producer) {
            // fill next buffer
            for(unsigned j = threadIdx.x; j < bufsize; j += producer_count)
                buf[cur_buf][j] = produce(i, j);
        }
        if(is_consumer) {
            // consume last buffer
            for(unsigned j = consume_first; j < bufsize; j += consume_stride)
                consume(i - 1, j, buf[last_buf][j]);
        }
    }
    __syncthreads();

    /*
     * Consume last buffer. Can use all threads
     */
    const int final_buf = (iterations - 1) & 1;
    for(unsigned i = threadIdx.x; i < bufsize; i += blockDim.x)
        consume(iterations - 1, i, buf[final_buf][i]);
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement