Bisqwit
math_simd.hh in prender2
Apr 6th, 2020
#include <algorithm>  // std::min, std::max (used by the clamp() methods)
#include <utility>    // std::forward (used by the vec<T,N> specializations)

#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#ifdef __SSE__
  //#define ENABLE_SSE_FLOAT_2
  //#define ENABLE_SSE_FLOAT_3
  //#define ENABLE_SSE_FLOAT_4
  //#define ENABLE_SSE_FLOAT_5
  //#define ENABLE_SSE_FLOAT_6
  //#define ENABLE_SSE_FLOAT_7
  //#define ENABLE_SSE_FLOAT_8
  //#define ENABLE_SSE_FLOAT_9
  //#define ENABLE_SSE_FLOAT_10
#endif
#ifdef __SSE2__
  //#define ENABLE_SSE2_DOUBLE_2
  #define ENABLE_SSE2_DOUBLE_4
  #define ENABLE_SSE2_DOUBLE_3
  #define ENABLE_SSE2_DOUBLE_6
#endif
#ifdef __AVX__
  //#define ENABLE_AVX_DOUBLE_3
  //#define ENABLE_AVX_DOUBLE_4
  //#define ENABLE_AVX_DOUBLE_8
#endif

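/* Note: this header is a fragment of prender2; the generic vec<T,N> template
 * and the VEC_GENERIC_METHODS macro used below are defined elsewhere.
 * Each ENABLE_* macro switches one vec<T,N> specialization at the bottom of
 * this file over to the corresponding SIMD implementation; anything left
 * commented out falls back to the generic scalar vec<T,N>. */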
  41. #ifdef __SSE__
  42. /* A single XMM register can fit (__m128):
  43.  *     Four floats
  44.  *     Two doubles
  45.  * If AVX is enabled (__m256),
  46.  *     Eight floats
  47.  *     Four doubles
  48.  */
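/* A sketch of the template arithmetic below, for illustration:
 *     N=3              -> TableSize = (3+3)/4 = 1 register,  Extra = 0
 *     N=6              -> TableSize = (6+3)/4 = 2 registers, Extra = 0
 *     N=9, TableSize=2 -> Extra = 9 - 2*4 = 1 scalar float in the tail
 */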
template<unsigned N,
         unsigned TableSize = (N+3)/4,
         unsigned Extra = (N > TableSize*4) ? N-TableSize*4 : 0
        >
struct float_SSE_vecFull
{
public:
    typedef float T;
    typedef float_SSE_vecFull<N,TableSize,Extra> me;
    __m128 d[TableSize] __attribute__((aligned(16)));
    T extra[Extra];
    static constexpr unsigned cap = TableSize*4;
public:
    float_SSE_vecFull(): d{} {}

    /*
    float_SSE_vecFull(T a,T b=0,T c=0,T e=0)
        : d{}  { d[0] = _mm_set_ps(e,c,b,a); }
    float_SSE_vecFull(T a,T b,T c,T e, T f,T g=0,T h=0,T i=0)
        : d{}  { d[0] = _mm_set_ps(e,c,b,a); d[1] = _mm_set_ps(i,h,g,f); }
    float_SSE_vecFull(T a,T b,T c,T e, T f,T g,T h,T i, T j,T k=0,T l=0,T m=0)
        : d{}  { d[0] = _mm_set_ps(e,c,b,a); d[1] = _mm_set_ps(i,h,g,f); d[2] = _mm_set_ps(m,l,k,j); }
    template<typename... U>
        float_SSE_vecFull(T,T,T,T, T,T,T,T, T,T,T,T, T,U&&...) = delete;
    */
public:
    VEC_GENERIC_METHODS(float_SSE_vecFull)
public:
    float_SSE_vecFull(const me& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    float_SSE_vecFull(me&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    float_SSE_vecFull(const vec<float,N>& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    float_SSE_vecFull(vec<float,N>&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    template<typename U>
    float_SSE_vecFull(const vec<U,N>& b)
    {
        // Guard the tail lanes: when cap > N (N not a multiple of four),
        // reading b[n*4+k] for n*4+k >= N would run past the source vector.
        auto at = [&b](unsigned i) -> T { return i < N ? T(b[i]) : T{}; };
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm_set_ps( at(n*4+3), at(n*4+2), at(n*4+1), at(n*4+0) );
        for(unsigned n=0; n<Extra; ++n)
            extra[n] = b[cap+n];
    }
    float_SSE_vecFull& operator=(const float_SSE_vecFull& b) = default;
    float_SSE_vecFull& operator=(float_SSE_vecFull&& b) = default;

    T operator[](unsigned n) const { return n<cap ? d[n/4][n%4] : extra[n-cap]; }
    inline void set(unsigned n, T b) { if(n < cap) d[n/4][n%4] = b; else extra[n-cap] = b; }
    T HorizontalSum() const
    {
        __m128 result = d[0];
        for(unsigned a=1; a<TableSize; ++a) result = _mm_add_ps(result, d[a]);
        T out {};
    #ifdef __SSE3__
        if(N-Extra > 2) result = _mm_hadd_ps(result, result);
        if(N-Extra > 1) result = _mm_hadd_ps(result, result);
        out = result[0];
    #else
        if(N-Extra == 3) out = result[0] + result[1] + result[2];
        else {
            // _mm_movehl_ps folds the high pair of lanes onto the low pair.
            if(N-Extra > 2) result = _mm_add_ps(result, _mm_movehl_ps(result, result));
            if(N-Extra > 1) out = result[0] + result[1];
            else out = result[0];
        }
    #endif
        for(unsigned n=0; n<Extra; ++n) out += extra[n];
        return out;
    }

    vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_ps(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b.extra[n];
                                                   return result; }
    vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_ps(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b.extra[n];
                                                   return result; }
    vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_ps(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b.extra[n];
                                                   return result; }
    vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_ps(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b.extra[n];
                                                   return result; }
    vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_ps(d[n], _mm_set1_ps(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b;
                                                   return result; }
    vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_ps(d[n], _mm_set1_ps(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b;
                                                   return result; }
    vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_ps(d[n], _mm_set1_ps(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b;
                                                   return result; }
    vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_ps(d[n], _mm_set1_ps(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b;
                                                   return result; }
    vec<T,N> operator- () const
    {
        const int f = 0x80000000u; __m128 mask = (__m128)_mm_set_epi32(f,f,f,f);
        vec<T,N> result;
        for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_xor_ps(d[n], mask);
        for(unsigned n=0; n<Extra; ++n) result.extra[n] = -extra[n];
        return result;
    }
    void clamp(float min, float max)
    {
        __m128 mi = _mm_set1_ps(min), ma = _mm_set1_ps(max);
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm_min_ps(_mm_max_ps(d[n], mi), ma);
        for(unsigned n=0; n<Extra; ++n)
            extra[n] = std::min(std::max(extra[n], min), max);
    }
};
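/* Unlike float_SSE_vecFull above, which keeps its data in __m128 registers
 * plus a scalar tail, float_SSE_vec below stores a plain float array and
 * loads/stores explicitly (Make/Extract) where a SIMD path is used; most of
 * its operators are plain scalar loops, left for the compiler to
 * auto-vectorize (the hand-written SIMD versions are kept in comments). */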
template<unsigned N,
         unsigned TableSize = (N+3)/4,
         unsigned Extra = (N > TableSize*4) ? N-TableSize*4 : 0
        >
struct float_SSE_vec
{
public:
    typedef float T;
    typedef float_SSE_vec<N,TableSize,Extra> me;
    static constexpr unsigned cap = TableSize*4;
    static constexpr unsigned TableExt = TableSize + !!Extra;
    // Sized in whole registers (TableExt*4) so that the full-register
    // loads/stores covering the final partial chunk stay within bounds.
    T d[TableExt*4] __attribute__((aligned(16)));
public:
    float_SSE_vec(): d{} {}
public:
    VEC_GENERIC_METHODS(float_SSE_vec)
public:
    float_SSE_vec(const me& b) noexcept = default;
    float_SSE_vec(me&& b) noexcept = default;
    float_SSE_vec(const vec<float,N>& b) noexcept
    {
        for(unsigned n=0; n<N; ++n) d[n] = b.d[n];
    }
    float_SSE_vec(vec<float,N>&& b) noexcept
    {
        for(unsigned n=0; n<N; ++n) d[n] = b.d[n];
    }

    template<typename U>
    float_SSE_vec(const vec<U,N>& b)
    {
        for(unsigned n=0; n<N; ++n) d[n] = b[n];
    }
    float_SSE_vec& operator=(const float_SSE_vec& b) noexcept = default;
    float_SSE_vec& operator=(float_SSE_vec&& b) noexcept = default;

private:
    inline __m128 Make(unsigned w) const { return _mm_load_ps(&d[w*4]); }
    void Extract(unsigned w, __m128 v) { _mm_store_ps(&d[w*4], v); }
    void ExtractLast(__m128 v) { for(unsigned n=0; n<Extra; ++n) d[TableSize*4+n] = v[n]; }
public:
    inline T operator[](unsigned n) const { return d[n]; }
    inline void set(unsigned n, T b) { d[n] = b; }

    T HorizontalSum() const
    {
        T out {};
        if(TableSize > 0)
        {
            __m128 result = Make(0);
            for(unsigned a=1; a<TableSize; ++a) result = _mm_add_ps(result, Make(a));
        #ifdef __SSE3__
            result = _mm_hadd_ps(result, result);
            result = _mm_hadd_ps(result, result);
            out = result[0];
        #else
            result = _mm_add_ps(result, _mm_movehl_ps(result, result));
            out = result[0] + result[1];
        #endif
        }
        for(unsigned n=0; n<Extra; ++n) out += d[TableSize*4 + n];
        return out;
    }

    vec<T,N> operator+ (const vec<T,N>& b) const
    {
        vec<T,N> result;
        /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_add_ps(Make(n), b.Make(n)));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] + b[TableSize*4]); }
        else if(Extra) { result.ExtractLast(_mm_add_ps(Make(TableSize), b.Make(TableSize))); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = d[n] + b.d[n];
        return result;
    }
    vec<T,N> operator- (const vec<T,N>& b) const
    {
        vec<T,N> result;
        /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_sub_ps(Make(n), b.Make(n)));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] - b[TableSize*4]); }
        else if(Extra) { result.ExtractLast(_mm_sub_ps(Make(TableSize), b.Make(TableSize))); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = d[n] - b.d[n];
        return result;
    }
    vec<T,N> operator* (const vec<T,N>& b) const
    {
        vec<T,N> result;
        /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_mul_ps(Make(n), b.Make(n)));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] * b[TableSize*4]); }
        else if(Extra) { result.ExtractLast(_mm_mul_ps(Make(TableSize), b.Make(TableSize))); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = d[n] * b.d[n];
        return result;
    }
    vec<T,N> operator/ (const vec<T,N>& b) const
    {
        vec<T,N> result;
        for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_div_ps(Make(n), b.Make(n)));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] / b[TableSize*4]); }
        else if(Extra) { result.ExtractLast(_mm_div_ps(Make(TableSize), b.Make(TableSize))); }
        return result;
    }
    vec<T,N> operator+ (T b) const
    {
        vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
        for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_add_ps(Make(n), bb));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] + b); }
        else if(Extra) { result.ExtractLast(_mm_add_ps(Make(TableSize), bb)); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = d[n] + b;
        return result;
    }
    vec<T,N> operator- (T b) const
    {
        vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
        for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_sub_ps(Make(n), bb));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] - b); }
        else if(Extra) { result.ExtractLast(_mm_sub_ps(Make(TableSize), bb)); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = d[n] - b;
        return result;
    }
    vec<T,N> operator* (T b) const
    {
        vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
        for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_mul_ps(Make(n), bb));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] * b); }
        else if(Extra) { result.ExtractLast(_mm_mul_ps(Make(TableSize), bb)); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = d[n] * b;
        return result;
    }
    vec<T,N> operator/ (T b) const
    {
        vec<T,N> result; __m128 bb = _mm_set1_ps(b);
        for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_div_ps(Make(n), bb));
        if(Extra==1)   { result.set(TableSize*4, d[TableSize*4] / b); }
        else if(Extra) { result.ExtractLast(_mm_div_ps(Make(TableSize), bb)); }
        return result;
    }
    vec<T,N> operator- () const
    {
        vec<T,N> result;
        /*const int f = 0x80000000u; __m128 mask = (__m128)_mm_set_epi32(f,f,f,f);
        for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_xor_ps(Make(n), mask));
        if(Extra==1)   { result.set(TableSize*4, -d[TableSize*4]); }
        else if(Extra) { result.ExtractLast(_mm_xor_ps(Make(TableSize), mask)); }*/
        for(unsigned n=0; n<N; ++n) result.d[n] = -d[n];
        return result;
    }
    void clamp(float min, float max)
    {
        __m128 mi = _mm_set1_ps(min), ma = _mm_set1_ps(max);
        for(unsigned n=0; n<TableSize; ++n) Extract(n, _mm_min_ps(_mm_max_ps(Make(n), mi), ma));
        if(Extra==1)   { d[TableSize*4] = std::min(std::max(d[TableSize*4], min), max); }
        else if(Extra) { ExtractLast(_mm_min_ps(_mm_max_ps(Make(TableSize), mi), ma)); }
    }
};
#endif
#ifdef __SSE2__
template<unsigned N,
         unsigned TableSize = (N+1)/2,
         unsigned Extra = (N > TableSize*2) ? N-TableSize*2 : 0
        >
struct double_SSE2_vec
{
public:
    typedef double T;
    typedef double_SSE2_vec<N,TableSize,Extra> me;
    __m128d d[TableSize] __attribute__((aligned(16)));
    T extra[Extra];
    static constexpr unsigned cap = TableSize*2;
public:
    double_SSE2_vec(): d{} {}
    /*
    double_SSE2_vec(T a,T b=0)
        : d{}  { d[0] = _mm_set_pd(b,a); }
    double_SSE2_vec(T a,T b, T c,T e=0)
        : d{}  { d[0] = _mm_set_pd(b,a); d[1] = _mm_set_pd(e,c); }
    double_SSE2_vec(T a,T b, T c,T e, T f,T g=0)
        : d{}  { d[0] = _mm_set_pd(b,a); d[1] = _mm_set_pd(e,c); d[2] = _mm_set_pd(g,f); }
    template<typename... U>
    double_SSE2_vec(T,T, T,T, T,T, T,U&&...) = delete;
    */
public:
    VEC_GENERIC_METHODS(double_SSE2_vec)
public:
    double_SSE2_vec(const me& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    double_SSE2_vec(me&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    double_SSE2_vec(const vec<double,N>& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    double_SSE2_vec(vec<double,N>&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
        for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
    }
    template<typename U>
    double_SSE2_vec(const vec<U,N>& b)
    {
        // Guard the high lane: for odd N the last register has only one
        // source element, so b[n*2+1] must not be read past the vector.
        auto at = [&b](unsigned i) -> T { return i < N ? T(b[i]) : T{}; };
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm_set_pd( at(n*2+1), at(n*2+0) );
        for(unsigned n=0; n<Extra; ++n)
            extra[n] = b[cap+n];
    }
    double_SSE2_vec& operator=(const double_SSE2_vec& b) = default;
    double_SSE2_vec& operator=(double_SSE2_vec&& b) = default;

    T operator[](unsigned n) const { return n<cap ? d[n/2][n%2] : extra[n-cap]; }
    inline void set(unsigned n, T b) { if(n < cap) d[n/2][n%2] = b; else extra[n-cap] = b; }
    T HorizontalSum() const
    {
        __m128d result = d[0];
        for(unsigned a=1; a<TableSize; ++a) result = _mm_add_pd(result, d[a]);
        T out {};
    #ifdef __SSE3__
        if(N-Extra >= 2) result = _mm_hadd_pd(result, result);
        out = result[0];
    #else
        if(N-Extra >= 2) out = result[0] + result[1];
        else out = result[0];
    #endif
        for(unsigned n=0; n<Extra; ++n) out += extra[n];
        return out;
    }

    vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_pd(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b.extra[n];
                                                   return result; }
    vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_pd(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b.extra[n];
                                                   return result; }
    vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_pd(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b.extra[n];
                                                   return result; }
    vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_pd(d[n], b.d[n]);
                                                                    for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b.extra[n];
                                                   return result; }
    vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_pd(d[n], _mm_set1_pd(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b;
                                                   return result; }
    vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_pd(d[n], _mm_set1_pd(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b;
                                                   return result; }
    vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_pd(d[n], _mm_set1_pd(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b;
                                                   return result; }
    vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_pd(d[n], _mm_set1_pd(b));
                                                      for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b;
                                                   return result; }
    vec<T,N> operator- () const
    {
        const int f = 0x80000000u; __m128d mask = (__m128d)_mm_set_epi32(f,0,f,0);
        vec<T,N> result;
        for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_xor_pd(d[n], mask);
        for(unsigned n=0; n<Extra; ++n) result.extra[n] = -extra[n];
        return result;
    }
    void clamp(double min, double max)
    {
        __m128d mi = _mm_set1_pd(min), ma = _mm_set1_pd(max);
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm_min_pd(_mm_max_pd(d[n], mi), ma);
        for(unsigned n=0; n<Extra; ++n)
            extra[n] = std::min(std::max(extra[n], min), max);
    }
};
#endif
#ifdef __AVX__
template<unsigned N,
         unsigned TableSize = (N+7)/8>
struct float_AVX_vec
{
public:
    typedef float T;
    typedef float_AVX_vec<N> me;
    __m256 d[TableSize] __attribute__((aligned(32)));
public:
    float_AVX_vec(): d{} {}

    float_AVX_vec(T a,T b=0,T c=0,T e=0,T f=0,T g=0,T h=0,T i=0)
        : d{}  { d[0] = _mm256_set_ps(i,h,g,f,e,c,b,a); }
    float_AVX_vec(T a,T b,T c,T e,T f,T g,T h,T i, T j,T k=0,T l=0,T m=0,T n=0,T o=0,T p=0,T q=0)
        : d{}  { d[0] = _mm256_set_ps(i,h,g,f,e,c,b,a); d[1] = _mm256_set_ps(q,p,o,n,m,l,k,j); }
    template<typename... U>
        float_AVX_vec(T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T, T,U&&...) = delete;
public:
    VEC_GENERIC_METHODS(float_AVX_vec)
public:
    float_AVX_vec(const me& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    float_AVX_vec(me&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    float_AVX_vec(const vec<float,N>& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    float_AVX_vec(vec<float,N>&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    template<typename U>
    float_AVX_vec(const vec<U,N>& b)
    {
        // U, not T: a parameter named T would shadow the member typedef.
        // Guard the tail lanes when N is not a multiple of eight.
        auto at = [&b](unsigned i) -> T { return i < N ? T(b[i]) : T{}; };
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm256_set_ps( at(n*8+7), at(n*8+6), at(n*8+5), at(n*8+4),
                                  at(n*8+3), at(n*8+2), at(n*8+1), at(n*8+0) );
    }
    float_AVX_vec& operator=(const float_AVX_vec& b) = default;
    float_AVX_vec& operator=(float_AVX_vec&& b) = default;

    T operator[](unsigned n) const { return d[n/8][n%8]; }
    inline void set(unsigned n, T b) { d[n/8][n%8] = b; }
    T HorizontalSum() const
    {
        __m256 result = d[0];
        for(unsigned a=1; a<TableSize; ++a) result = _mm256_add_ps(result, d[a]);
        // _mm256_hadd_ps only adds within each 128-bit lane, so fold the
        // high lane onto the low lane first, then reduce the low lane.
        if(N > 4) result = _mm256_add_ps(result, _mm256_permute2f128_ps(result, result, 1));
        if(N > 2) result = _mm256_hadd_ps(result, result);
        if(N > 1) result = _mm256_hadd_ps(result, result);
        return result[0];
    }

    vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_ps(d[n], b.d[n]); return result; }
    vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_ps(d[n], b.d[n]); return result; }
    vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_ps(d[n], b.d[n]); return result; }
    vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_ps(d[n], b.d[n]); return result; }
    vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_ps(d[n], _mm256_set1_ps(b)); return result; }
    vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_ps(d[n], _mm256_set1_ps(b)); return result; }
    vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_ps(d[n], _mm256_set1_ps(b)); return result; }
    vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_ps(d[n], _mm256_set1_ps(b)); return result; }
    vec<T,N> operator- () const
    {
        // The mask must be a __m256 (float vector) for _mm256_xor_ps.
        const int f = 0x80000000u; __m256 mask = (__m256)_mm256_set_epi32(f,f,f,f,f,f,f,f);
        vec<T,N> result;
        for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_xor_ps(d[n], mask);
        return result;
    }
    void clamp(float min, float max)
    {
        __m256 mi = _mm256_set1_ps(min), ma = _mm256_set1_ps(max);
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm256_min_ps(_mm256_max_ps(d[n], mi), ma);
    }
};
template<unsigned N,
         unsigned TableSize = (N+3)/4>
struct double_AVX_vec
{
public:
    typedef double T;
    typedef double_AVX_vec<N> me;
    __m256d d[TableSize] __attribute__((aligned(32)));
public:
    double_AVX_vec(): d{} {}

    double_AVX_vec(T a,T b=0,T c=0,T e=0)
        : d{}  { d[0] = _mm256_set_pd(e,c,b,a); }
    double_AVX_vec(T a,T b,T c,T e, T f,T g=0,T h=0,T i=0)
        : d{}  { d[0] = _mm256_set_pd(e,c,b,a); d[1] = _mm256_set_pd(i,h,g,f); }
    double_AVX_vec(T a,T b,T c,T e, T f,T g,T h,T i, T j,T k=0,T l=0,T m=0)
        : d{}  { d[0] = _mm256_set_pd(e,c,b,a); d[1] = _mm256_set_pd(i,h,g,f); d[2] = _mm256_set_pd(m,l,k,j); }
    template<typename... U>
        double_AVX_vec(T,T,T,T, T,T,T,T, T,T,T,T, T,U&&...) = delete;
public:
    VEC_GENERIC_METHODS(double_AVX_vec)
public:
    double_AVX_vec(const me& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    double_AVX_vec(me&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    double_AVX_vec(const vec<double,N>& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    double_AVX_vec(vec<double,N>&& b)
    {
        for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
    }
    template<typename U>
    double_AVX_vec(const vec<U,N>& b)
    {
        // Guard the tail lanes when N is not a multiple of four.
        auto at = [&b](unsigned i) -> T { return i < N ? T(b[i]) : T{}; };
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm256_set_pd( at(n*4+3), at(n*4+2), at(n*4+1), at(n*4+0) );
    }
    double_AVX_vec& operator=(const double_AVX_vec& b) = default;
    double_AVX_vec& operator=(double_AVX_vec&& b) = default;

    T operator[](unsigned n) const { return d[n/4][n%4]; }
    inline void set(unsigned n, T b) { d[n/4][n%4] = b; }
    T HorizontalSum() const
    {
        __m256d result = d[0];
        for(unsigned a=1; a<TableSize; ++a) result = _mm256_add_pd(result, d[a]);
        // _mm256_hadd_pd only adds within each 128-bit lane, so fold the
        // high lane onto the low lane first.
        if(N > 2) result = _mm256_add_pd(result, _mm256_permute2f128_pd(result, result, 1));
        if(N > 1) result = _mm256_hadd_pd(result, result);
        return result[0];
    }

    vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_pd(d[n], b.d[n]); return result; }
    vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_pd(d[n], b.d[n]); return result; }
    vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_pd(d[n], b.d[n]); return result; }
    vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_pd(d[n], b.d[n]); return result; }
    vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_pd(d[n], _mm256_set1_pd(b)); return result; }
    vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_pd(d[n], _mm256_set1_pd(b)); return result; }
    vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_pd(d[n], _mm256_set1_pd(b)); return result; }
    vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_pd(d[n], _mm256_set1_pd(b)); return result; }
    vec<T,N> operator- () const
    {
        const int f = 0x80000000u; __m256d mask = (__m256d)_mm256_set_epi32(f,0,f,0,f,0,f,0);
        vec<T,N> result;
        for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_xor_pd(d[n], mask);
        return result;
    }
    void clamp(double min, double max)
    {
        __m256d mi = _mm256_set1_pd(min), ma = _mm256_set1_pd(max);
        for(unsigned n=0; n<TableSize; ++n)
            d[n] = _mm256_min_pd(_mm256_max_pd(d[n], mi), ma);
    }
};
#endif

#ifdef ENABLE_SSE_FLOAT_2
template<> struct vec<float,2>: public float_SSE_vec<2>
{
public:
    template<typename...T>
    vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {}

    float CrossProduct(const vec<float,2>& b) const
    {
      // 3210 * xx01
      __m128 d0 = _mm_set_ps(0,0,d[1],d[0]), d1 = _mm_set_ps(0,0,b[0],b[1]);
      __m128 tmp = _mm_mul_ps(d0, d1);
      // xxx0 - xxx1
      return tmp[0] - tmp[1];
    }
};
#else
// Both of these are unnecessary for Clang
/*float CrossProduct(const vec<float,2>& a, const vec<float,2>& b)
{
    // 3210 * xx01
    __m128 d0 = _mm_set_ps(0,0,a[1],a[0]), d1 = _mm_set_ps(0,0,b[0],b[1]);
    __m128 tmp = _mm_mul_ps(d0, d1);
    // xxx0 - xxx1
    return tmp[0] - tmp[1];
}*/
/*vec<float,2> operator*(const vec<float,2>& a, const vec<float,2>& b)
{
    __m128 d0 = _mm_loadu_ps((const float*)&a);
    __m128 d1 = _mm_loadu_ps((const float*)&b);
    __m128 result = _mm_mul_ps(d0, d1);
    return {result[0], result[1]};
}*/
#endif


#ifdef ENABLE_SSE2_DOUBLE_2
template<> struct vec<double,2>: public double_SSE2_vec<2>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}

    double CrossProduct(const vec<double,2>& b) const
    {
        // 3210 * xx01
        __m128d tmp = _mm_mul_pd(d[0], _mm_shuffle_pd(b.d[0], b.d[0], 1));
        // xxx0 - xxx1
    #ifdef __SSE3__
        return _mm_hsub_pd(tmp, tmp)[0];
    #else
        return tmp[0] - tmp[1];
    #endif
    }
};
#endif
#ifdef ENABLE_AVX_DOUBLE_4
template<> struct vec<double,4>: public double_AVX_vec<4>
{
public:
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
};
#elif defined(ENABLE_SSE2_DOUBLE_4)
template<> struct vec<double,4>: public double_SSE2_vec<4>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_SSE2_DOUBLE_6
template<> struct vec<double,6>: public double_SSE2_vec<6>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_AVX_DOUBLE_8
template<> struct vec<double,8>: public double_AVX_vec<8>
{
public:
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_AVX_DOUBLE_3
template<> struct vec<double,3>: public double_AVX_vec<3>
{
public:
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}

    vec<double,3> CrossProduct(const vec<double,3>& b) const;
};
#elif defined(ENABLE_SSE2_DOUBLE_3)
template<> struct vec<double,3>: public double_SSE2_vec<3,1>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}

    vec<double,3> CrossProduct(const vec<double,3>& b) const;
};
#endif
#if defined(ENABLE_AVX_DOUBLE_3) || defined(ENABLE_SSE2_DOUBLE_3)
vec<double,3> vec<double,3>::CrossProduct(const vec<double,3>& b) const
{
    #define x d[0][0]
    #define y d[0][1]
    #ifdef ENABLE_AVX_DOUBLE_3
     #define z d[0][2]   /* third lane of the single __m256d */
    #else
     #define z extra[0]  /* with double_SSE2_vec<3,1>, the third component lives in the scalar tail */
    #endif
    #ifdef ENABLE_AVX_DOUBLE_8
        vec<double,8> data1{(double)  y, (double)  z, (double)  x, 0, (double)  z, (double)  x, (double)  y, 0};
        vec<double,8> data2{(double)b.z, (double)b.x, (double)b.y, 0, (double)b.y, (double)b.z, (double)b.x, 0};
        auto res = data1 * data2;
        return res.template Limit<4,0>() - res.template Limit<4,4>();
    #else
        vec<double,6> data1{(double)  y, (double)  z, (double)  x,  (double)  z, (double)  x, (double)  y};
        vec<double,6> data2{(double)b.z, (double)b.x, (double)b.y,  (double)b.y, (double)b.z, (double)b.x};
        auto res = data1 * data2;
        return res.template Limit<3,0>() - res.template Limit<3,3>();
    #endif
    #undef x
    #undef y
    #undef z
}
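
/* For reference, the packed computation above is the standard identity
 *     cross(a,b) = (a.y*b.z - a.z*b.y,  a.z*b.x - a.x*b.z,  a.x*b.y - a.y*b.x):
 * all six products are formed by one packed multiply, and, assuming Limit<C,O>
 * (from VEC_GENERIC_METHODS, defined elsewhere in prender2) takes C components
 * starting at offset O, the two halves are the minuend and subtrahend triples. */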

#endif

#ifdef ENABLE_SSE_FLOAT_3
// Uses float_SSE_vecFull so that d[0] below is an __m128 register.
template<> struct vec<float,3>: public float_SSE_vecFull<3>
{
public:
    template<typename...T>
    vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {}

    vec<float,3> CrossProduct(const vec<float,3>& b) const
    {
        vec<float,3> result;
        result.d[0] = _mm_sub_ps(
                      _mm_mul_ps(_mm_shuffle_ps(d[0],d[0], _MM_SHUFFLE(0,0,2,1)), _mm_shuffle_ps(b.d[0],b.d[0], _MM_SHUFFLE(0,1,0,2))),
                      _mm_mul_ps(_mm_shuffle_ps(d[0],d[0], _MM_SHUFFLE(0,1,0,2)), _mm_shuffle_ps(b.d[0],b.d[0], _MM_SHUFFLE(0,0,2,1)))
                    );
        return result;
    }
};
#else
// This does help Clang a bit.
/*#ifdef H1
vec<float,3> CrossProduct(const vec<float,3>& a, const vec<float,3>& b)
{
    __m128 av = _mm_loadu_ps(&a[0]), bv = _mm_loadu_ps(&b[0]);
    __m128 d0 = _mm_shuffle_ps(av,av, _MM_SHUFFLE(0,0,2,1)), d1 = _mm_shuffle_ps(bv,bv, _MM_SHUFFLE(0,1,0,2));
    __m128 d2 = _mm_shuffle_ps(av,av, _MM_SHUFFLE(0,1,0,2)), d3 = _mm_shuffle_ps(bv,bv, _MM_SHUFFLE(0,0,2,1));
    auto res = _mm_sub_ps(_mm_mul_ps(d0,d1), _mm_mul_ps(d2,d3));
    return vec<float,3>{(float)res[0],(float)res[1],(float)res[2]};
}
#endif*/
#endif

#ifdef ENABLE_SSE_FLOAT_4
template<> struct vec<float,4>: public float_SSE_vecFull<4> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#elif defined(ENABLE_AVX_FLOAT_4)
template<> struct vec<float,4>: public float_AVX_vec<4> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_5
template<> struct vec<float,5>: public float_AVX_vec<5> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_5)
template<> struct vec<float,5>: public float_SSE_vec<5,1> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_6
template<> struct vec<float,6>: public float_AVX_vec<6> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_6)
template<> struct vec<float,6>: public float_SSE_vec<6> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_7
template<> struct vec<float,7>: public float_AVX_vec<7> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_7)
template<> struct vec<float,7>: public float_SSE_vec<7> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_8
template<> struct vec<float,8>: public float_AVX_vec<8> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_8)
template<> struct vec<float,8>: public float_SSE_vecFull<8> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_9
template<> struct vec<float,9>: public float_AVX_vec<9> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_9)
template<> struct vec<float,9>: public float_SSE_vec<9,2> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_10
template<> struct vec<float,10>: public float_AVX_vec<10> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_10)
template<> struct vec<float,10>: public float_SSE_vec<10> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_11
template<> struct vec<float,11>: public float_AVX_vec<11> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_11)
template<> struct vec<float,11>: public float_SSE_vec<11> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_12
template<> struct vec<float,12>: public float_AVX_vec<12> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_12)
template<> struct vec<float,12>: public float_SSE_vecFull<12> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#endif
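
/* Illustrative usage sketch (not part of the header proper). It assumes the
 * generic vec<T,N> template and the construction helpers supplied by
 * VEC_GENERIC_METHODS elsewhere in prender2:
 *
 *     vec<double,3> a{1,2,3}, b{4,5,6};
 *     vec<double,3> c = a.CrossProduct(b);   // SSE2 path (ENABLE_SSE2_DOUBLE_3)
 *     double dot = (a * b).HorizontalSum();  // dot product: lane-wise mul, then reduce
 *     a.clamp(0.0, 1.0);                     // per-component clamp
 */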