Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
- #ifdef __SSE__
- //#define ENABLE_SSE_FLOAT_2
- //#define ENABLE_SSE_FLOAT_3
- //#define ENABLE_SSE_FLOAT_4
- //#define ENABLE_SSE_FLOAT_5
- //#define ENABLE_SSE_FLOAT_6
- //#define ENABLE_SSE_FLOAT_7
- //#define ENABLE_SSE_FLOAT_8
- //#define ENABLE_SSE_FLOAT_9
- //#define ENABLE_SSE_FLOAT_10
- #endif
- #ifdef __SSE2__
- //#define ENABLE_SSE2_DOUBLE_2
- #define ENABLE_SSE2_DOUBLE_4
- #define ENABLE_SSE2_DOUBLE_3
- #define ENABLE_SSE2_DOUBLE_6
- #endif
- #ifdef __AVX__
- //#define ENABLE_AVX_DOUBLE_3
- //#define ENABLE_AVX_DOUBLE_4
- //#define ENABLE_AVX_DOUBLE_8
- #endif
- #ifdef __SSE__
- /* A single XMM register can fit (__m128):
- * Four floats
- * Two doubles
- * If AVX is enabled (__m256),
- * Eight floats
- * Four doubles
- */
- template<unsigned N,
- unsigned TableSize = (N+3)/4,
- unsigned Extra = (N > TableSize*4) ? N-TableSize*4 : 0
- >
- struct float_SSE_vecFull
- {
- public:
- typedef float T;
- typedef float_SSE_vecFull<N,TableSize,Extra> me;
- __m128 d[TableSize] __attribute__((aligned(16)));
- T extra[Extra];
- static constexpr unsigned cap = TableSize*4;
- public:
- float_SSE_vecFull(): d{} {}
- /*
- float_SSE_vecFull(T a,T b=0,T c=0,T e=0)
- : d{} { d[0] = _mm_set_ps(e,c,b,a); }
- float_SSE_vecFull(T a,T b,T c,T e, T f,T g=0,T h=0,T i=0)
- : d{} { d[0] = _mm_set_ps(e,c,b,a); d[1] = _mm_set_ps(i,h,g,f); }
- float_SSE_vecFull(T a,T b,T c,T e, T f,T g,T h,T i, T j,T k=0,T l=0,T m=0)
- : d{} { d[0] = _mm_set_ps(e,c,b,a); d[1] = _mm_set_ps(i,h,g,f); d[2] = _mm_set_ps(m,l,k,j); }
- template<typename... U>
- float_SSE_vecFull(T,T,T,T, T,T,T,T, T,T,T,T, T,U&&...) = delete;
- */
- public:
- VEC_GENERIC_METHODS(float_SSE_vecFull)
- public:
- float_SSE_vecFull(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- float_SSE_vecFull(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- float_SSE_vecFull(const vec<float,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- float_SSE_vecFull(vec<float,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- template<typename U>
- float_SSE_vecFull(const vec<U,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_set_ps( b[n*4+3], b[n*4+2], b[n*4+1], b[n*4+0] );
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = b[cap+n];
- }
- float_SSE_vecFull& operator=(const float_SSE_vecFull& b) = default;
- float_SSE_vecFull& operator=(float_SSE_vecFull&& b) = default;
- T operator[](unsigned n) const { return n<cap ? d[n/4][n%4] : extra[n-cap]; }
- inline void set(unsigned n, T b) { if(n < cap) d[n/4][n%4] = b; else extra[n-cap] = b; }
- T HorizontalSum() const
- {
- __m128 result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm_add_ps(result, d[a]);
- T out {};
- #ifdef __SSE3__
- if(N-Extra > 2) result = _mm_hadd_ps(result, result);
- if(N-Extra > 1) result = _mm_hadd_ps(result, result);
- out = result[0];
- #else
- if(N-Extra == 3) out = result[0] + result[1] + result[2];
- else {
- if(N-Extra > 2) result = _mm_add_ps(result, _mm_movehl_ps(result, result));
- if(N-Extra > 1) out = result[0] + result[1];
- out = result[0];
- }
- #endif
- for(unsigned n=0; n<Extra; ++n) out += extra[n];
- return out;
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b.extra[n];
- return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b.extra[n];
- return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b.extra[n];
- return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b.extra[n];
- return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b;
- return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b;
- return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b;
- return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b;
- return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m128 mask = (__m128)_mm_set_epi32(f,f,f,f);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_xor_ps(d[n], mask);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = -extra[n];
- return result;
- }
- void clamp(float min, float max)
- {
- __m128 mi = _mm_set1_ps(min), ma = _mm_set1_ps(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_min_ps(_mm_max_ps(d[n], mi), ma);
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = std::min(std::max(extra[n], min), max);
- }
- };
- template<unsigned N,
- unsigned TableSize = (N+3)/4,
- unsigned Extra = (N > TableSize*4) ? N-TableSize*4 : 0
- >
- struct float_SSE_vec
- {
- public:
- typedef float T;
- typedef float_SSE_vec<N,TableSize,Extra> me;
- T d[N+Extra] __attribute__((aligned(16)));
- static constexpr unsigned cap = TableSize*4;
- static constexpr unsigned TableExt = TableSize + !!Extra;
- public:
- float_SSE_vec(): d{} {}
- public:
- VEC_GENERIC_METHODS(float_SSE_vec)
- public:
- float_SSE_vec(const me& b) noexcept = default;
- float_SSE_vec(me&& b) noexcept = default;
- float_SSE_vec(const vec<float,N>& b) noexcept
- {
- for(unsigned n=0; n<N; ++n) d[n] = b.d[n];
- }
- float_SSE_vec(vec<float,N>&& b) noexcept
- {
- for(unsigned n=0; n<N; ++n) d[n] = b.d[n];
- }
- template<typename U>
- float_SSE_vec(const vec<U,N>& b)
- {
- for(unsigned n=0; n<N; ++n) d[n] = b[n];
- }
- float_SSE_vec& operator=(const float_SSE_vec& b) noexcept = default;
- float_SSE_vec& operator=(float_SSE_vec&& b) noexcept = default;
- private:
- inline __m128 Make(unsigned w) const { return _mm_load_ps(&d[w*4]); }
- void Extract(unsigned w, __m128 v) { _mm_store_ps(&d[w*4], v); }
- void ExtractLast(__m128 v) { for(unsigned n=0; n<Extra; ++n) d[TableSize*4+n] = v[n]; }
- public:
- inline T operator[](unsigned n) const { return d[n]; }
- inline void set(unsigned n, T b) { d[n] = b; }
- T HorizontalSum() const
- {
- T out {};
- if(TableSize > 0)
- {
- __m128 result = Make(0);
- for(unsigned a=1; a<TableSize; ++a) result = _mm_add_ps(result, Make(a));
- #ifdef __SSE3__
- result = _mm_hadd_ps(result, result);
- result = _mm_hadd_ps(result, result);
- out = result[0];
- #else
- result = _mm_add_ps(result, _mm_movehl_ps(result, result));
- out = result[0] + result[1];
- #endif
- }
- for(unsigned n=0; n<Extra; ++n) out += d[TableSize*4 + n];
- return out;
- }
- vec<T,N> operator+ (const vec<T,N>& b) const
- {
- vec<T,N> result;
- /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_add_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] + b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_add_ps(Make(TableSize), b.Make(TableSize))); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] + b.d[n];
- return result;
- }
- vec<T,N> operator- (const vec<T,N>& b) const
- {
- vec<T,N> result;
- /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_sub_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] - b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_sub_ps(Make(TableSize), b.Make(TableSize))); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] - b.d[n];
- return result;
- }
- vec<T,N> operator* (const vec<T,N>& b) const
- {
- vec<T,N> result;
- /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_mul_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] * b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_mul_ps(Make(TableSize), b.Make(TableSize))); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] * b.d[n];
- return result;
- }
- vec<T,N> operator/ (const vec<T,N>& b) const
- {
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_div_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] / b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_div_ps(Make(TableSize), b.Make(TableSize))); }
- return result;
- }
- vec<T,N> operator+ (T b) const
- {
- vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_add_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] + b); }
- else if(Extra) { result.ExtractLast(_mm_add_ps(Make(TableSize), bb)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] + b;
- return result;
- }
- vec<T,N> operator- (T b) const
- {
- vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_sub_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] - b); }
- else if(Extra) { result.ExtractLast(_mm_sub_ps(Make(TableSize), bb)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] - b;
- return result;
- }
- vec<T,N> operator* (T b) const
- {
- vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_mul_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] * b); }
- else if(Extra) { result.ExtractLast(_mm_mul_ps(Make(TableSize), bb)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] * b;
- return result;
- }
- vec<T,N> operator/ (T b) const
- {
- vec<T,N> result; __m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_div_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] / b); }
- else if(Extra) { result.ExtractLast(_mm_div_ps(Make(TableSize), bb)); }
- return result;
- }
- vec<T,N> operator- () const
- {
- vec<T,N> result;
- /*const int f = 0x80000000u; __m128 mask = (__m128)_mm_set_epi32(f,f,f,f);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_xor_ps(Make(n), mask));
- if(Extra==1) { result.set(TableSize*4, -d[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_xor_ps(Make(TableSize), mask)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = -d[n];
- return result;
- }
- void clamp(float min, float max)
- {
- __m128 mi = _mm_set1_ps(min), ma = _mm_set1_ps(max);
- for(unsigned n=0; n<TableSize; ++n) Extract(n, _mm_min_ps(_mm_max_ps(Make(n), mi), ma));
- if(Extra==1) { d[TableSize*4] = std::min(std::max(d[TableSize*4], min), max); }
- else if(Extra) { ExtractLast(_mm_min_ps(_mm_max_ps(Make(TableSize), mi), ma)); }
- }
- };
- #endif
- #ifdef __SSE2__
- template<unsigned N,
- unsigned TableSize = (N+1)/2,
- unsigned Extra = (N > TableSize*2) ? N-TableSize*2 : 0
- >
- struct double_SSE2_vec
- {
- public:
- typedef double T;
- typedef double_SSE2_vec<N,TableSize,Extra> me;
- __m128d d[TableSize] __attribute__((aligned(16)));
- T extra[Extra];
- static constexpr unsigned cap = TableSize*2;
- public:
- double_SSE2_vec(): d{} {}
- /*
- double_SSE2_vec(T a,T b=0)
- : d{} { d[0] = _mm_set_pd(b,a); }
- double_SSE2_vec(T a,T b, T c,T e=0)
- : d{} { d[0] = _mm_set_pd(b,a); d[1] = _mm_set_pd(e,c); }
- double_SSE2_vec(T a,T b, T c,T e, T f,T g=0)
- : d{} { d[0] = _mm_set_pd(b,a); d[1] = _mm_set_pd(e,c); d[2] = _mm_set_pd(g,f); }
- template<typename... U>
- double_SSE2_vec(T,T, T,T, T,T, T,U&&...) = delete;
- */
- public:
- VEC_GENERIC_METHODS(double_SSE2_vec)
- public:
- double_SSE2_vec(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- double_SSE2_vec(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- double_SSE2_vec(const vec<double,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- double_SSE2_vec(vec<double,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- template<typename U>
- double_SSE2_vec(const vec<U,N>& b)
- {
- // FIXME: Array overflow access
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_set_pd( b[n*2+1], b[n*2+0] );
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = b[cap+n];
- }
- double_SSE2_vec& operator=(const double_SSE2_vec& b) = default;
- double_SSE2_vec& operator=(double_SSE2_vec&& b) = default;
- T operator[](unsigned n) const { return n<cap ? d[n/2][n%2] : extra[n-cap]; }
- inline void set(unsigned n, T b) { if(n < cap) d[n/2][n%2] = b; else extra[n-cap] = b; }
- T HorizontalSum() const
- {
- __m128d result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm_add_pd(result, d[a]);
- T out {};
- #ifdef __SSE3__
- if(N-Extra >= 2) result = _mm_hadd_pd(result, result);
- out = result[0];
- #else
- if(N-Extra >= 2) out = result[0] + result[1];
- else out = result[0];
- #endif
- for(unsigned n=0; n<Extra; ++n) out += extra[n];
- return out;
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b.extra[n];
- return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b.extra[n];
- return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b.extra[n];
- return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b.extra[n];
- return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b;
- return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b;
- return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b;
- return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b;
- return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m128d mask = (__m128d)_mm_set_epi32(f,0,f,0);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_xor_pd(d[n], mask);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = -extra[n];
- return result;
- }
- void clamp(double min, double max)
- {
- __m128d mi = _mm_set1_pd(min), ma = _mm_set1_pd(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_min_pd(_mm_max_pd(d[n], mi), ma);
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = std::min(std::max(extra[n], min), max);
- }
- };
- #endif
- #ifdef __AVX__
- template<unsigned N,
- unsigned TableSize = (N+7)/8>
- struct float_AVX_vec
- {
- public:
- typedef float T;
- typedef float_AVX_vec<N> me;
- __m256 d[TableSize] __attribute__((aligned(16)));
- public:
- float_AVX_vec(): d{} {}
- float_AVX_vec(T a,T b=0,T c=0,T e=0,T f=0,T g=0,T h=0,T i=0)
- : d{} { d[0] = _mm256_set_ps(i,h,g,f,e,c,b,a); }
- float_AVX_vec(T a,T b,T c,T e,T f,T g,T h,T i, T j,T k=0,T l=0,T m=0,T n=0,T o=0,T p=0,T q=0)
- : d{} { d[0] = _mm256_set_ps(i,h,g,f,e,c,b,a); d[1] = _mm256_set_ps(q,p,o,n,m,l,k,j); }
- template<typename... U>
- float_AVX_vec(T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T, T,U&&...) = delete;
- public:
- VEC_GENERIC_METHODS(float_AVX_vec)
- public:
- float_AVX_vec(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- float_AVX_vec(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- float_AVX_vec(const vec<float,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- float_AVX_vec(vec<float,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- template<typename T>
- float_AVX_vec(const vec<T,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_set_ps( b[n*8+7], b[n*8+6], b[n*8+5], b[n*8+4],
- b[n*8+3], b[n*8+2], b[n*8+1], b[n*8+0] );
- }
- float_AVX_vec& operator=(const float_AVX_vec& b) = default;
- float_AVX_vec& operator=(float_AVX_vec&& b) = default;
- T operator[](unsigned n) const { return d[n/8][n%8]; }
- inline void set(unsigned n, T b) { d[n/8][n%8] = b; }
- T HorizontalSum() const
- {
- __m256 result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm256_add_ps(result, d[a]);
- // FIXME: verify semantics
- if(N > 4) result = _mm256_hadd_ps(result, result);
- if(N > 2) result = _mm256_hadd_ps(result, result);
- if(N > 1) result = _mm256_hadd_ps(result, result);
- return result[0];
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m256d mask = (__m256d)_mm256_set_epi32(f,f,f,f,f,f,f,f);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_xor_ps(d[n], mask);
- return result;
- }
- void clamp(float min, float max)
- {
- __m256 mi = _mm256_set1_ps(min), ma = _mm256_set1_ps(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_min_ps(_mm256_max_ps(d[n], mi), ma);
- }
- };
- template<unsigned N,
- unsigned TableSize = (N+3)/4>
- struct double_AVX_vec
- {
- public:
- typedef float T;
- typedef double_AVX_vec<N> me;
- __m256d d[TableSize] __attribute__((aligned(16)));
- public:
- double_AVX_vec(): d{} {}
- double_AVX_vec(T a,T b=0,T c=0,T e=0)
- : d{} { d[0] = _mm256_set_pd(e,c,b,a); }
- double_AVX_vec(T a,T b,T c,T e, T f,T g=0,T h=0,T i=0)
- : d{} { d[0] = _mm256_set_pd(e,c,b,a); d[1] = _mm256_set_pd(i,h,g,f); }
- double_AVX_vec(T a,T b,T c,T e, T f,T g,T h,T i, T j,T k=0,T l=0,T m=0)
- : d{} { d[0] = _mm256_set_pd(e,c,b,a); d[1] = _mm256_set_pd(i,h,g,f); d[2] = _mm256_set_pd(m,l,k,j); }
- template<typename... U>
- double_AVX_vec(T,T,T,T, T,T,T,T, T,T,T,T, T,U&&...) = delete;
- public:
- VEC_GENERIC_METHODS(double_AVX_vec)
- public:
- double_AVX_vec(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- double_AVX_vec(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- double_AVX_vec(const vec<float,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- double_AVX_vec(vec<float,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- template<typename T>
- double_AVX_vec(const vec<T,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_set_pd( b[n*4+3], b[n*4+2], b[n*4+1], b[n*4+0] );
- }
- double_AVX_vec& operator=(const double_AVX_vec& b) = default;
- double_AVX_vec& operator=(double_AVX_vec&& b) = default;
- T operator[](unsigned n) const { return d[n/4][n%4]; }
- inline void set(unsigned n, T b) { d[n/4][n%4] = b; }
- T HorizontalSum() const
- {
- __m256d result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm256_add_pd(result, d[a]);
- if(N > 2) result = _mm256_hadd_pd(result, result);
- if(N > 1) result = _mm256_hadd_pd(result, result);
- return result[0];
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m256d mask = (__m256d)_mm256_set_epi32(f,0,f,0,f,0,f,0);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_xor_pd(d[n], mask);
- return result;
- }
- void clamp(float min, float max)
- {
- __m256 mi = _mm256_set1_pd(min), ma = _mm256_set1_pd(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_min_pd(_mm256_max_pd(d[n], mi), ma);
- }
- };
- #endif
#ifdef ENABLE_SSE_FLOAT_2
// vec<float,2> specialization with a 2D cross product (scalar result).
template<> struct vec<float,2>: public float_SSE_vec<2>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {}
    // 2D cross product: x*b.y - y*b.x.
    float CrossProduct(const vec<float,2>& b) const
    {
        // Multiply this by the swapped operand: tmp = { x*b.y, y*b.x, 0, 0 }
        __m128 d0 = _mm_set_ps(0,0,d[1],d[0]), d1 = _mm_set_ps(0,0,b[0],b[1]);
        __m128 tmp = _mm_mul_ps(d0, d1);
        // Difference of the two low lanes gives the determinant.
        return tmp[0] - tmp[1];
    }
};
#else
// Both of these are unnecessary for Clang
/*float CrossProduct(const vec<float,2>& a, const vec<float,2>& b)
{
    // 3210 * xx01
    __m128 d0 = _mm_set_ps(0,0,a[1],a[0]), d1 = _mm_set_ps(0,0,b[0],b[1]);
    __m128 tmp = _mm_mul_ps(d0, d1);
    // xxx0 - xxx1
    return tmp[0] - tmp[1];
}*/
/*vec<float,2> operator*(const vec<float,2>& a, const vec<float,2>& b)
{
    __m128 d0 = _mm_loadu_ps((const float*)&a);
    __m128 d1 = _mm_loadu_ps((const float*)&b);
    __m128 result = _mm_mul_ps(d0, d1);
    return {result[0], result[1]};
}*/
#endif
#ifdef ENABLE_SSE2_DOUBLE_2
// vec<double,2> specialization: exactly one SSE2 register, plus a 2D cross
// product (scalar result).
template<> struct vec<double,2>: public double_SSE2_vec<2>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
    // 2D cross product: x*b.y - y*b.x.
    double CrossProduct(const vec<double,2>& b) const
    {
        // Multiply by the lane-swapped operand: tmp = { x*b.y, y*b.x }
        __m128d tmp = _mm_mul_pd(d[0], _mm_shuffle_pd(b.d[0], b.d[0], 1));
        // Difference of the two lanes gives the determinant.
#ifdef __SSE3__
        return _mm_hsub_pd(tmp, tmp)[0];
#else
        return tmp[0] - tmp[1];
#endif
    }
};
#endif
#ifdef ENABLE_AVX_DOUBLE_4
// vec<double,4>: one AVX register.
template<> struct vec<double,4>: public double_AVX_vec<4>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
};
#elif defined(ENABLE_SSE2_DOUBLE_4)
// vec<double,4>: two SSE2 registers.
template<> struct vec<double,4>: public double_SSE2_vec<4>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_SSE2_DOUBLE_6
// vec<double,6>: three SSE2 registers (also used by the 3D cross product).
template<> struct vec<double,6>: public double_SSE2_vec<6>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_AVX_DOUBLE_8
// vec<double,8>: two AVX registers.
template<> struct vec<double,8>: public double_AVX_vec<8>
{
public:
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_AVX_DOUBLE_3
// vec<double,3>: one AVX register (fourth lane is padding).
template<> struct vec<double,3>: public double_AVX_vec<3>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
    vec<double,3> CrossProduct(const vec<double,3>& b) const;
};
#elif defined(ENABLE_SSE2_DOUBLE_3)
// vec<double,3>: one SSE2 register + one scalar tail element.
template<> struct vec<double,3>: public double_SSE2_vec<3,1>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
    vec<double,3> CrossProduct(const vec<double,3>& b) const;
};
#endif
#if defined(ENABLE_AVX_DOUBLE_3) || defined(ENABLE_SSE2_DOUBLE_3)
// 3D cross product as two permuted vector multiplies and one subtraction:
//   result = (y,z,x)*(b.z,b.x,b.y) - (z,x,y)*(b.y,b.z,b.x)
// Elements are read through operator[] so the code is valid for every
// storage layout.  The old x/y/z macros expanded to d[0][0]/d[0][1]/d[1][0],
// which indexed past the single-register d[] of double_AVX_vec<3> and of
// double_SSE2_vec<3,1> (whose third element lives in extra[0]).
// `inline` keeps this out-of-class definition ODR-safe in a header.
inline vec<double,3> vec<double,3>::CrossProduct(const vec<double,3>& b) const
{
    const double x = (*this)[0], y = (*this)[1], z = (*this)[2];
#ifdef ENABLE_AVX_DOUBLE_8
    vec<double,8> data1{   y,    z,    x, 0.0,    z,    x,    y, 0.0};
    vec<double,8> data2{b[2], b[0], b[1], 0.0, b[1], b[2], b[0], 0.0};
    auto res = data1 * data2;
    return res.template Limit<4,0>() - res.template Limit<4,4>();
#else
    vec<double,6> data1{   y,    z,    x,    z,    x,    y};
    vec<double,6> data2{b[2], b[0], b[1], b[1], b[2], b[0]};
    auto res = data1 * data2;
    return res.template Limit<3,0>() - res.template Limit<3,3>();
#endif
}
#endif
#ifdef ENABLE_SSE_FLOAT_3
// vec<float,3> specialization with a shuffle-based 3D cross product:
//   result = (a.yzx * b.zxy) - (a.zxy * b.yzx)
// NOTE(review): this body treats d[0] as an __m128 register, which matches
// float_SSE_vecFull's storage, not the scalar float array of the
// float_SSE_vec base it actually derives from — the option is disabled by
// default (see the commented-out ENABLE_SSE_FLOAT_3 above) and looks like
// it would not compile as-is; confirm before enabling.
template<> struct vec<float,3>: public float_SSE_vec<3>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {}
    vec<float,3> CrossProduct(const vec<float,3>& b) const
    {
        vec<float,3> result;
        result.d[0] = _mm_sub_ps(
            _mm_mul_ps(_mm_shuffle_ps(d[0],d[0], _MM_SHUFFLE(0,0,2,1)), _mm_shuffle_ps(b.d[0],b.d[0], _MM_SHUFFLE(0,1,0,2))),
            _mm_mul_ps(_mm_shuffle_ps(d[0],d[0], _MM_SHUFFLE(0,1,0,2)), _mm_shuffle_ps(b.d[0],b.d[0], _MM_SHUFFLE(0,0,2,1)))
        );
        return result;
    }
};
#else
// This does help Clang a bit.
/*#ifdef H1
vec<float,3> CrossProduct(const vec<float,3>& a, const vec<float,3>& b)
{
    __m128 av = _mm_loadu_ps(&a[0]), bv = _mm_loadu_ps(&b[0]);
    __m128 d0 = _mm_shuffle_ps(av,av, _MM_SHUFFLE(0,0,2,1)), d1 = _mm_shuffle_ps(bv,bv, _MM_SHUFFLE(0,1,0,2));
    __m128 d2 = _mm_shuffle_ps(av,av, _MM_SHUFFLE(0,1,0,2)), d3 = _mm_shuffle_ps(bv,bv, _MM_SHUFFLE(0,0,2,1));
    auto res = _mm_sub_ps(_mm_mul_ps(d0,d1), _mm_mul_ps(d2,d3));
    return vec<float,3>{(float)res[0],(float)res[1],(float)res[2]};
}
#endif*/
#endif
// --- vec<float,N> specialization wrappers ------------------------------
// Each group prefers the AVX implementation when its ENABLE_* switch is
// defined, falling back to the SSE one.  float_SSE_vecFull (register
// storage) is chosen when N is a multiple of 4; float_SSE_vec (scalar
// storage) otherwise, with an explicit TableSize/Extra split for 5 and 9.
#ifdef ENABLE_SSE_FLOAT_4
template<> struct vec<float,4>: public float_SSE_vecFull<4> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#elif defined(ENABLE_AVX_FLOAT_4)
template<> struct vec<float,4>: public float_AVX_vec<4> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_5
template<> struct vec<float,5>: public float_AVX_vec<5> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_5)
template<> struct vec<float,5>: public float_SSE_vec<5,1> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_6
template<> struct vec<float,6>: public float_AVX_vec<6> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_6)
template<> struct vec<float,6>: public float_SSE_vec<6> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_7
template<> struct vec<float,7>: public float_AVX_vec<7> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_7)
template<> struct vec<float,7>: public float_SSE_vec<7> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_8
template<> struct vec<float,8>: public float_AVX_vec<8> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_8)
template<> struct vec<float,8>: public float_SSE_vecFull<8> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_9
template<> struct vec<float,9>: public float_AVX_vec<9> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_9)
template<> struct vec<float,9>: public float_SSE_vec<9,2> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_10
template<> struct vec<float,10>: public float_AVX_vec<10> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_10)
template<> struct vec<float,10>: public float_SSE_vec<10> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_11
template<> struct vec<float,11>: public float_AVX_vec<11> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_11)
template<> struct vec<float,11>: public float_SSE_vec<11> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_12
template<> struct vec<float,12>: public float_AVX_vec<12> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_12)
template<> struct vec<float,12>: public float_SSE_vecFull<12> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement