Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <exception>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include <fcntl.h>
/* Compile this using Clang. GCC gets about 6.0 fps, Clang gets slightly more */

/**
 * Blends four equally sized frames into one, byte by byte.
 *
 * Each output byte is
 *     (frame1*23 + (frame2 + frame3)*54 + frame4*125) >> 8
 * i.e. a weighted average truncated towards zero. The weights approximate
 *     (a*0.3 + b*0.7)*0.3 + (c*0.3 + d*0.7)*0.7
 *   = a*0.09 + b*0.21 + c*0.21 + d*0.49
 * scaled by 256 (23 + 54 + 54 + 125 == 256), so the result always lies
 * between the smallest and the largest of the four input bytes.
 *
 * The function does not interpret the pixel layout; it treats all five
 * buffers as flat byte arrays of the same length.
 *
 * @param frame1    First input frame (weight 23/256).
 * @param frame2    Second input frame (weight 54/256).
 * @param frame3    Third input frame (weight 54/256).
 * @param frame4    Fourth input frame (weight 125/256).
 * @param num_bytes Number of bytes in each buffer.
 * @param target    Output buffer; must not overlap any input (__restrict__).
 */
void YUV444pBlend(const unsigned char* __restrict__ frame1,
                  const unsigned char* __restrict__ frame2,
                  const unsigned char* __restrict__ frame3,
                  const unsigned char* __restrict__ frame4,
                  std::size_t num_bytes,
                  unsigned char* __restrict__ target)
{
#if 0
    /* Disabled alternative: processes one machine word at a time and blends
     * every 4-bit nibble independently (fractions of a high nibble are not
     * carried into the low nibble, so it only approximates the byte blend).
     * It assumes all buffers are 32-byte aligned and ignores any tail bytes
     * that do not fill a whole word. */
    typedef unsigned long unit;
    const unit* source1 = (const unit*)__builtin_assume_aligned(frame1,32);
    const unit* source2 = (const unit*)__builtin_assume_aligned(frame2,32);
    const unit* source3 = (const unit*)__builtin_assume_aligned(frame3,32);
    const unit* source4 = (const unit*)__builtin_assume_aligned(frame4,32);
    unit* destination = (unit*)__builtin_assume_aligned(target,32);
    #pragma omp parallel for simd schedule(static) num_threads(4)
    for(std::size_t p=0; p<num_bytes/sizeof(unit); ++p)
    {
        unit word1 = ((const unit*)__builtin_assume_aligned(source1,32))[p];
        unit word2 = ((const unit*)__builtin_assume_aligned(source2,32))[p];
        unit word3 = ((const unit*)__builtin_assume_aligned(source3,32))[p];
        unit word4 = ((const unit*)__builtin_assume_aligned(source4,32))[p];
        const unit mask = (~(unit)(0)) / 0x1111; // 000F000F...
        unit lo1 = word1 & mask, mid1 = (word1>>4) & mask, next1 = (word1>>8) & mask, hi1 = (word1>>12) & mask;
        unit lo2 = word2 & mask, mid2 = (word2>>4) & mask, next2 = (word2>>8) & mask, hi2 = (word2>>12) & mask;
        unit lo3 = word3 & mask, mid3 = (word3>>4) & mask, next3 = (word3>>8) & mask, hi3 = (word3>>12) & mask;
        unit lo4 = word4 & mask, mid4 = (word4>>4) & mask, next4 = (word4>>8) & mask, hi4 = (word4>>12) & mask;
        unit losum   = lo1*23u   + (lo2  +lo3)*54u   + lo4*125u;   // 0#xx
        unit midsum  = mid1*23u  + (mid2 +mid3)*54u  + mid4*125u;  // 0#xx
        unit nextsum = next1*23u + (next2+next3)*54u + next4*125u; // 0#xx
        unit hisum   = hi1*23u   + (hi2  +hi3)*54u   + hi4*125u;   // 0#xx
        unit result = ((losum   & (mask<<8)) >> 8)
                    | ((midsum  & (mask<<8)) >> 4)
                    | ((nextsum & (mask<<8))     )
                    | ((hisum   & (mask<<8)) << 4);
        ((unit*)__builtin_assume_aligned(destination,32))[p] = result;
    }
#else
    #pragma omp simd //parallel for simd num_threads(2)
    for(std::size_t p=0; p<num_bytes; ++p)
    {
        // The maximum is 255*256 = 0xFF00, so the sum fits in 16 bits and the
        // top byte is the blended value. Nothing else may be added to it: an
        // earlier version added the low nibble of a separate high-nibble
        // blend, which could push the result past 255 and wrap around.
        const unsigned sum = frame1[p]*23u
                           + (frame2[p] + frame3[p])*54u
                           + frame4[p]*125u;
        target[p] = static_cast<unsigned char>(sum >> 8);
    }
#endif
}
// Size in bytes of one frame (3 bytes per pixel). Assigned once in main()
// before any worker thread is started; the threads only read it.
static std::size_t num_bytes;
/**
 * Tries to make the kernel buffer of the pipe behind `fp` as large as the
 * system permits, and reports the outcome on stderr.
 *
 * Power-of-two sizes are requested from 2^41 bytes downwards to 2^11 bytes,
 * stopping at the first one the kernel accepts. A failure message is printed
 * only when the error code differs from the previous failure, so a long run
 * of identical errors produces a single line. Finally the size actually in
 * effect is queried and printed.
 *
 * To lift the limits that apply to an unprivileged process on Linux:
 *   sudo sysctl fs.pipe-user-pages-soft=0
 *   sudo sysctl fs.pipe-max-size=$[1048576*512]
 *   sudo setcap 'CAP_SYS_RESOURCE=+ep' tblend2
 *   sudo setcap 'CAP_SYS_RESOURCE=+ep' tblend
 *
 * @param fp Stream whose underlying descriptor is expected to be a pipe.
 */
static void pipe_resize(FILE* fp)
{
    const int fd = fileno(fp);

    // Never shift by the full width of unsigned long (undefined behaviour
    // where that type has only 32 bits).
    const unsigned long_bits = static_cast<unsigned>(sizeof(unsigned long) * 8);
    const unsigned first_power = long_bits > 41 ? 41 : long_bits - 1;

    int prev_err = 0;
    for(unsigned power = first_power; power > 10; --power)
    {
        const unsigned long size = 1ul << power;

        int r = -1;
        int err = 0;
        // Each size is retried many times before moving on; presumably this is
        // meant to ride out transient failures (the intent is not documented).
        for(int tries = 0; tries < 4000; ++tries)
        {
            r = fcntl(fd, F_SETPIPE_SZ, size);
            if(r >= 0) break;
            err = errno; // capture now: later stdio calls may overwrite errno
        }

        if(r >= 0)
        {
            std::fprintf(stderr, "Pipe size successfully set to %lu (r=%d)\n", size, r);
            break;
        }
        if(err != prev_err)
        {
            std::fprintf(stderr, "Failed to set pipe size to %lu; fcntl: %s\n",
                         size, std::strerror(err));
            prev_err = err;
        }
    }

    const int s = fcntl(fd, F_GETPIPE_SZ);
    if(s > 0)
        std::fprintf(stderr, "Pipe size is %d bytes\n", s);
    else
        std::perror("fcntl");
}
- int main(int argc, char** argv)
- {
- std::size_t num_pixels = std::stoi(argv[1]) * std::stoi(argv[2]);
- num_bytes = num_pixels * 3;
- std::size_t interval = num_bytes*4;
- unsigned char* buffer = new unsigned char[interval];
- unsigned char* outbuf = new unsigned char[num_bytes];
- unsigned char* buffer1 = new unsigned char[interval];
- unsigned char* outbuf1 = new unsigned char[num_bytes];
- std::thread writer, processor;
- pipe_resize(stdin);
- pipe_resize(stdout);
- for(;;)
- {
- std::swap(buffer,buffer1);
- std::size_t p = 0;
- while(p < interval)
- {
- std::size_t eat = interval - p;
- int r = std::fread(buffer+p, 1, eat, stdin);
- if(r <= 0) break;
- p += r;
- }
- if(!p) break;
- if(p < interval)
- {
- std::memset(buffer+p, 0, interval-p);
- }
- if(processor.joinable()) processor.join();
- processor = std::thread([buffer,&outbuf,&outbuf1,&writer]()
- {
- std::swap(outbuf,outbuf1);
- YUV444pBlend(buffer ,
- buffer + num_bytes,
- buffer + num_bytes*2,
- buffer + num_bytes*3,
- num_bytes,
- outbuf);
- if(writer.joinable()) writer.join();
- writer = std::thread([outbuf]()
- {
- std::size_t p = 0;
- while(p < num_bytes)
- {
- std::size_t eat = num_bytes - p;
- int r = std::fwrite(outbuf + p, 1, eat, stdout);
- if(r <= 0) break;
- p += r;
- }
- });
- });
- }
- if(processor.joinable()) processor.join();
- if(writer.joinable()) writer.join();
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement