Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
- #ifdef __SSE__
- //#define ENABLE_SSE_FLOAT_2
- //#define ENABLE_SSE_FLOAT_3
- //#define ENABLE_SSE_FLOAT_4
- //#define ENABLE_SSE_FLOAT_5
- //#define ENABLE_SSE_FLOAT_6
- //#define ENABLE_SSE_FLOAT_7
- //#define ENABLE_SSE_FLOAT_8
- //#define ENABLE_SSE_FLOAT_9
- //#define ENABLE_SSE_FLOAT_10
- #endif
- #ifdef __SSE2__
- //#define ENABLE_SSE2_DOUBLE_2
- #define ENABLE_SSE2_DOUBLE_4
- #define ENABLE_SSE2_DOUBLE_3
- #define ENABLE_SSE2_DOUBLE_6
- #endif
- #ifdef __AVX__
- //#define ENABLE_AVX_DOUBLE_3
- //#define ENABLE_AVX_DOUBLE_4
- //#define ENABLE_AVX_DOUBLE_8
- #endif
- #ifdef __SSE__
- /* A single XMM register can fit (__m128):
- * Four floats
- * Two doubles
- * If AVX is enabled (__m256),
- * Eight floats
- * Four doubles
- */
- template<unsigned N,
- unsigned TableSize = (N+3)/4,
- unsigned Extra = (N > TableSize*4) ? N-TableSize*4 : 0
- >
- struct float_SSE_vecFull
- {
- public:
- typedef float T;
- typedef float_SSE_vecFull<N,TableSize,Extra> me;
- __m128 d[TableSize] __attribute__((aligned(16)));
- T extra[Extra];
- static constexpr unsigned cap = TableSize*4;
- public:
- float_SSE_vecFull(): d{} {}
- /*
- float_SSE_vecFull(T a,T b=0,T c=0,T e=0)
- : d{} { d[0] = _mm_set_ps(e,c,b,a); }
- float_SSE_vecFull(T a,T b,T c,T e, T f,T g=0,T h=0,T i=0)
- : d{} { d[0] = _mm_set_ps(e,c,b,a); d[1] = _mm_set_ps(i,h,g,f); }
- float_SSE_vecFull(T a,T b,T c,T e, T f,T g,T h,T i, T j,T k=0,T l=0,T m=0)
- : d{} { d[0] = _mm_set_ps(e,c,b,a); d[1] = _mm_set_ps(i,h,g,f); d[2] = _mm_set_ps(m,l,k,j); }
- template<typename... U>
- float_SSE_vecFull(T,T,T,T, T,T,T,T, T,T,T,T, T,U&&...) = delete;
- */
- public:
- VEC_GENERIC_METHODS(float_SSE_vecFull)
- public:
- float_SSE_vecFull(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- float_SSE_vecFull(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- float_SSE_vecFull(const vec<float,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- float_SSE_vecFull(vec<float,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- template<typename U>
- float_SSE_vecFull(const vec<U,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_set_ps( b[n*4+3], b[n*4+2], b[n*4+1], b[n*4+0] );
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = b[cap+n];
- }
- float_SSE_vecFull& operator=(const float_SSE_vecFull& b) = default;
- float_SSE_vecFull& operator=(float_SSE_vecFull&& b) = default;
- T operator[](unsigned n) const { return n<cap ? d[n/4][n%4] : extra[n-cap]; }
- inline void set(unsigned n, T b) { if(n < cap) d[n/4][n%4] = b; else extra[n-cap] = b; }
- T HorizontalSum() const
- {
- __m128 result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm_add_ps(result, d[a]);
- T out {};
- #ifdef __SSE3__
- if(N-Extra > 2) result = _mm_hadd_ps(result, result);
- if(N-Extra > 1) result = _mm_hadd_ps(result, result);
- out = result[0];
- #else
- if(N-Extra == 3) out = result[0] + result[1] + result[2];
- else {
- if(N-Extra > 2) result = _mm_add_ps(result, _mm_movehl_ps(result, result));
- if(N-Extra > 1) out = result[0] + result[1];
- out = result[0];
- }
- #endif
- for(unsigned n=0; n<Extra; ++n) out += extra[n];
- return out;
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b.extra[n];
- return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b.extra[n];
- return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b.extra[n];
- return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_ps(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b.extra[n];
- return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b;
- return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b;
- return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b;
- return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_ps(d[n], _mm_set1_ps(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b;
- return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m128 mask = (__m128)_mm_set_epi32(f,f,f,f);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_xor_ps(d[n], mask);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = -extra[n];
- return result;
- }
- void clamp(float min, float max)
- {
- __m128 mi = _mm_set1_ps(min), ma = _mm_set1_ps(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_min_ps(_mm_max_ps(d[n], mi), ma);
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = std::min(std::max(extra[n], min), max);
- }
- };
- template<unsigned N,
- unsigned TableSize = (N+3)/4,
- unsigned Extra = (N > TableSize*4) ? N-TableSize*4 : 0
- >
- struct float_SSE_vec
- {
- public:
- typedef float T;
- typedef float_SSE_vec<N,TableSize,Extra> me;
- T d[N+Extra] __attribute__((aligned(16)));
- static constexpr unsigned cap = TableSize*4;
- static constexpr unsigned TableExt = TableSize + !!Extra;
- public:
- float_SSE_vec(): d{} {}
- public:
- VEC_GENERIC_METHODS(float_SSE_vec)
- public:
- float_SSE_vec(const me& b) noexcept = default;
- float_SSE_vec(me&& b) noexcept = default;
- float_SSE_vec(const vec<float,N>& b) noexcept
- {
- for(unsigned n=0; n<N; ++n) d[n] = b.d[n];
- }
- float_SSE_vec(vec<float,N>&& b) noexcept
- {
- for(unsigned n=0; n<N; ++n) d[n] = b.d[n];
- }
- template<typename U>
- float_SSE_vec(const vec<U,N>& b)
- {
- for(unsigned n=0; n<N; ++n) d[n] = b[n];
- }
- float_SSE_vec& operator=(const float_SSE_vec& b) noexcept = default;
- float_SSE_vec& operator=(float_SSE_vec&& b) noexcept = default;
- private:
- inline __m128 Make(unsigned w) const { return _mm_load_ps(&d[w*4]); }
- void Extract(unsigned w, __m128 v) { _mm_store_ps(&d[w*4], v); }
- void ExtractLast(__m128 v) { for(unsigned n=0; n<Extra; ++n) d[TableSize*4+n] = v[n]; }
- public:
- inline T operator[](unsigned n) const { return d[n]; }
- inline void set(unsigned n, T b) { d[n] = b; }
- T HorizontalSum() const
- {
- T out {};
- if(TableSize > 0)
- {
- __m128 result = Make(0);
- for(unsigned a=1; a<TableSize; ++a) result = _mm_add_ps(result, Make(a));
- #ifdef __SSE3__
- result = _mm_hadd_ps(result, result);
- result = _mm_hadd_ps(result, result);
- out = result[0];
- #else
- result = _mm_add_ps(result, _mm_movehl_ps(result, result));
- out = result[0] + result[1];
- #endif
- }
- for(unsigned n=0; n<Extra; ++n) out += d[TableSize*4 + n];
- return out;
- }
- vec<T,N> operator+ (const vec<T,N>& b) const
- {
- vec<T,N> result;
- /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_add_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] + b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_add_ps(Make(TableSize), b.Make(TableSize))); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] + b.d[n];
- return result;
- }
- vec<T,N> operator- (const vec<T,N>& b) const
- {
- vec<T,N> result;
- /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_sub_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] - b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_sub_ps(Make(TableSize), b.Make(TableSize))); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] - b.d[n];
- return result;
- }
- vec<T,N> operator* (const vec<T,N>& b) const
- {
- vec<T,N> result;
- /*for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_mul_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] * b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_mul_ps(Make(TableSize), b.Make(TableSize))); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] * b.d[n];
- return result;
- }
- vec<T,N> operator/ (const vec<T,N>& b) const
- {
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_div_ps(Make(n), b.Make(n)));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] / b[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_div_ps(Make(TableSize), b.Make(TableSize))); }
- return result;
- }
- vec<T,N> operator+ (T b) const
- {
- vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_add_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] + b); }
- else if(Extra) { result.ExtractLast(_mm_add_ps(Make(TableSize), bb)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] + b;
- return result;
- }
- vec<T,N> operator- (T b) const
- {
- vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_sub_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] - b); }
- else if(Extra) { result.ExtractLast(_mm_sub_ps(Make(TableSize), bb)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] - b;
- return result;
- }
- vec<T,N> operator* (T b) const
- {
- vec<T,N> result; /*__m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_mul_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] * b); }
- else if(Extra) { result.ExtractLast(_mm_mul_ps(Make(TableSize), bb)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = d[n] * b;
- return result;
- }
- vec<T,N> operator/ (T b) const
- {
- vec<T,N> result; __m128 bb = _mm_set1_ps(b);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_div_ps(Make(n), bb));
- if(Extra==1) { result.set(TableSize*4, d[TableSize*4] / b); }
- else if(Extra) { result.ExtractLast(_mm_div_ps(Make(TableSize), bb)); }
- return result;
- }
- vec<T,N> operator- () const
- {
- vec<T,N> result;
- /*const int f = 0x80000000u; __m128 mask = (__m128)_mm_set_epi32(f,f,f,f);
- for(unsigned n=0; n<TableSize; ++n) result.Extract(n, _mm_xor_ps(Make(n), mask));
- if(Extra==1) { result.set(TableSize*4, -d[TableSize*4]); }
- else if(Extra) { result.ExtractLast(_mm_xor_ps(Make(TableSize), mask)); }*/
- for(unsigned n=0; n<N; ++n) result.d[n] = -d[n];
- return result;
- }
- void clamp(float min, float max)
- {
- __m128 mi = _mm_set1_ps(min), ma = _mm_set1_ps(max);
- for(unsigned n=0; n<TableSize; ++n) Extract(n, _mm_min_ps(_mm_max_ps(Make(n), mi), ma));
- if(Extra==1) { d[TableSize*4] = std::min(std::max(d[TableSize*4], min), max); }
- else if(Extra) { ExtractLast(_mm_min_ps(_mm_max_ps(Make(TableSize), mi), ma)); }
- }
- };
- #endif
- #ifdef __SSE2__
- template<unsigned N,
- unsigned TableSize = (N+1)/2,
- unsigned Extra = (N > TableSize*2) ? N-TableSize*2 : 0
- >
- struct double_SSE2_vec
- {
- public:
- typedef double T;
- typedef double_SSE2_vec<N,TableSize,Extra> me;
- __m128d d[TableSize] __attribute__((aligned(16)));
- T extra[Extra];
- static constexpr unsigned cap = TableSize*2;
- public:
- double_SSE2_vec(): d{} {}
- /*
- double_SSE2_vec(T a,T b=0)
- : d{} { d[0] = _mm_set_pd(b,a); }
- double_SSE2_vec(T a,T b, T c,T e=0)
- : d{} { d[0] = _mm_set_pd(b,a); d[1] = _mm_set_pd(e,c); }
- double_SSE2_vec(T a,T b, T c,T e, T f,T g=0)
- : d{} { d[0] = _mm_set_pd(b,a); d[1] = _mm_set_pd(e,c); d[2] = _mm_set_pd(g,f); }
- template<typename... U>
- double_SSE2_vec(T,T, T,T, T,T, T,U&&...) = delete;
- */
- public:
- VEC_GENERIC_METHODS(double_SSE2_vec)
- public:
- double_SSE2_vec(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- double_SSE2_vec(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- double_SSE2_vec(const vec<double,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- double_SSE2_vec(vec<double,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- for(unsigned n=0; n<Extra; ++n) extra[n] = b.extra[n];
- }
- template<typename U>
- double_SSE2_vec(const vec<U,N>& b)
- {
- // FIXME: Array overflow access
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_set_pd( b[n*2+1], b[n*2+0] );
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = b[cap+n];
- }
- double_SSE2_vec& operator=(const double_SSE2_vec& b) = default;
- double_SSE2_vec& operator=(double_SSE2_vec&& b) = default;
- T operator[](unsigned n) const { return n<cap ? d[n/2][n%2] : extra[n-cap]; }
- inline void set(unsigned n, T b) { if(n < cap) d[n/2][n%2] = b; else extra[n-cap] = b; }
- T HorizontalSum() const
- {
- __m128d result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm_add_pd(result, d[a]);
- T out {};
- #ifdef __SSE3__
- if(N-Extra >= 2) result = _mm_hadd_pd(result, result);
- out = result[0];
- #else
- if(N-Extra >= 2) out = result[0] + result[1];
- else out = result[0];
- #endif
- for(unsigned n=0; n<Extra; ++n) out += extra[n];
- return out;
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b.extra[n];
- return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b.extra[n];
- return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b.extra[n];
- return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_pd(d[n], b.d[n]);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b.extra[n];
- return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_add_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] + b;
- return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_sub_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] - b;
- return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_mul_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] * b;
- return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_div_pd(d[n], _mm_set1_pd(b));
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = extra[n] / b;
- return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m128d mask = (__m128d)_mm_set_epi32(f,0,f,0);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm_xor_pd(d[n], mask);
- for(unsigned n=0; n<Extra; ++n) result.extra[n] = -extra[n];
- return result;
- }
- void clamp(double min, double max)
- {
- __m128d mi = _mm_set1_pd(min), ma = _mm_set1_pd(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm_min_pd(_mm_max_pd(d[n], mi), ma);
- for(unsigned n=0; n<Extra; ++n)
- extra[n] = std::min(std::max(extra[n], min), max);
- }
- };
- #endif
- #ifdef __AVX__
- template<unsigned N,
- unsigned TableSize = (N+7)/8>
- struct float_AVX_vec
- {
- public:
- typedef float T;
- typedef float_AVX_vec<N> me;
- __m256 d[TableSize] __attribute__((aligned(16)));
- public:
- float_AVX_vec(): d{} {}
- float_AVX_vec(T a,T b=0,T c=0,T e=0,T f=0,T g=0,T h=0,T i=0)
- : d{} { d[0] = _mm256_set_ps(i,h,g,f,e,c,b,a); }
- float_AVX_vec(T a,T b,T c,T e,T f,T g,T h,T i, T j,T k=0,T l=0,T m=0,T n=0,T o=0,T p=0,T q=0)
- : d{} { d[0] = _mm256_set_ps(i,h,g,f,e,c,b,a); d[1] = _mm256_set_ps(q,p,o,n,m,l,k,j); }
- template<typename... U>
- float_AVX_vec(T,T,T,T,T,T,T,T, T,T,T,T,T,T,T,T, T,U&&...) = delete;
- public:
- VEC_GENERIC_METHODS(float_AVX_vec)
- public:
- float_AVX_vec(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- float_AVX_vec(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- float_AVX_vec(const vec<float,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- float_AVX_vec(vec<float,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- template<typename T>
- float_AVX_vec(const vec<T,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_set_ps( b[n*8+7], b[n*8+6], b[n*8+5], b[n*8+4],
- b[n*8+3], b[n*8+2], b[n*8+1], b[n*8+0] );
- }
- float_AVX_vec& operator=(const float_AVX_vec& b) = default;
- float_AVX_vec& operator=(float_AVX_vec&& b) = default;
- T operator[](unsigned n) const { return d[n/8][n%8]; }
- inline void set(unsigned n, T b) { d[n/8][n%8] = b; }
- T HorizontalSum() const
- {
- __m256 result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm256_add_ps(result, d[a]);
- // FIXME: verify semantics
- if(N > 4) result = _mm256_hadd_ps(result, result);
- if(N > 2) result = _mm256_hadd_ps(result, result);
- if(N > 1) result = _mm256_hadd_ps(result, result);
- return result[0];
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_ps(d[n], b.d[n]); return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_ps(d[n], _mm256_set1_ps(b)); return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m256d mask = (__m256d)_mm256_set_epi32(f,f,f,f,f,f,f,f);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_xor_ps(d[n], mask);
- return result;
- }
- void clamp(float min, float max)
- {
- __m256 mi = _mm256_set1_ps(min), ma = _mm256_set1_ps(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_min_ps(_mm256_max_ps(d[n], mi), ma);
- }
- };
- template<unsigned N,
- unsigned TableSize = (N+3)/4>
- struct double_AVX_vec
- {
- public:
- typedef float T;
- typedef double_AVX_vec<N> me;
- __m256d d[TableSize] __attribute__((aligned(16)));
- public:
- double_AVX_vec(): d{} {}
- double_AVX_vec(T a,T b=0,T c=0,T e=0)
- : d{} { d[0] = _mm256_set_pd(e,c,b,a); }
- double_AVX_vec(T a,T b,T c,T e, T f,T g=0,T h=0,T i=0)
- : d{} { d[0] = _mm256_set_pd(e,c,b,a); d[1] = _mm256_set_pd(i,h,g,f); }
- double_AVX_vec(T a,T b,T c,T e, T f,T g,T h,T i, T j,T k=0,T l=0,T m=0)
- : d{} { d[0] = _mm256_set_pd(e,c,b,a); d[1] = _mm256_set_pd(i,h,g,f); d[2] = _mm256_set_pd(m,l,k,j); }
- template<typename... U>
- double_AVX_vec(T,T,T,T, T,T,T,T, T,T,T,T, T,U&&...) = delete;
- public:
- VEC_GENERIC_METHODS(double_AVX_vec)
- public:
- double_AVX_vec(const me& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- double_AVX_vec(me&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- double_AVX_vec(const vec<float,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- double_AVX_vec(vec<float,N>&& b)
- {
- for(unsigned n=0; n<TableSize; ++n) d[n] = b.d[n];
- }
- template<typename T>
- double_AVX_vec(const vec<T,N>& b)
- {
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_set_pd( b[n*4+3], b[n*4+2], b[n*4+1], b[n*4+0] );
- }
- double_AVX_vec& operator=(const double_AVX_vec& b) = default;
- double_AVX_vec& operator=(double_AVX_vec&& b) = default;
- T operator[](unsigned n) const { return d[n/4][n%4]; }
- inline void set(unsigned n, T b) { d[n/4][n%4] = b; }
- T HorizontalSum() const
- {
- __m256d result = d[0];
- for(unsigned a=1; a<TableSize; ++a) result = _mm256_add_pd(result, d[a]);
- if(N > 2) result = _mm256_hadd_pd(result, result);
- if(N > 1) result = _mm256_hadd_pd(result, result);
- return result[0];
- }
- vec<T,N> operator+ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator- (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator* (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator/ (const vec<T,N>& b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_pd(d[n], b.d[n]); return result; }
- vec<T,N> operator+ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_add_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator- (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_sub_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator* (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_mul_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator/ (T b) const { vec<T,N> result; for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_div_pd(d[n], _mm256_set1_pd(b)); return result; }
- vec<T,N> operator- () const
- {
- const int f = 0x80000000u; __m256d mask = (__m256d)_mm256_set_epi32(f,0,f,0,f,0,f,0);
- vec<T,N> result;
- for(unsigned n=0; n<TableSize; ++n) result.d[n] = _mm256_xor_pd(d[n], mask);
- return result;
- }
- void clamp(float min, float max)
- {
- __m256 mi = _mm256_set1_pd(min), ma = _mm256_set1_pd(max);
- for(unsigned n=0; n<TableSize; ++n)
- d[n] = _mm256_min_pd(_mm256_max_pd(d[n], mi), ma);
- }
- };
- #endif
#ifdef ENABLE_SSE_FLOAT_2
// vec<float,2> specialization with a 2D cross product (scalar result).
template<> struct vec<float,2>: public float_SSE_vec<2>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {}
    // 2D cross product: x*b.y - y*b.x.
    float CrossProduct(const vec<float,2>& b) const
    {
        // Multiply this by the swapped operand: tmp = { x*b.y, y*b.x, 0, 0 }
        __m128 d0 = _mm_set_ps(0,0,d[1],d[0]), d1 = _mm_set_ps(0,0,b[0],b[1]);
        __m128 tmp = _mm_mul_ps(d0, d1);
        // Difference of the two low lanes gives the determinant.
        return tmp[0] - tmp[1];
    }
};
#else
// Both of these are unnecessary for Clang
/*float CrossProduct(const vec<float,2>& a, const vec<float,2>& b)
{
    // 3210 * xx01
    __m128 d0 = _mm_set_ps(0,0,a[1],a[0]), d1 = _mm_set_ps(0,0,b[0],b[1]);
    __m128 tmp = _mm_mul_ps(d0, d1);
    // xxx0 - xxx1
    return tmp[0] - tmp[1];
}*/
/*vec<float,2> operator*(const vec<float,2>& a, const vec<float,2>& b)
{
    __m128 d0 = _mm_loadu_ps((const float*)&a);
    __m128 d1 = _mm_loadu_ps((const float*)&b);
    __m128 result = _mm_mul_ps(d0, d1);
    return {result[0], result[1]};
}*/
#endif
#ifdef ENABLE_SSE2_DOUBLE_2
// vec<double,2> specialization: exactly one SSE2 register, plus a 2D cross
// product (scalar result).
template<> struct vec<double,2>: public double_SSE2_vec<2>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
    // 2D cross product: x*b.y - y*b.x.
    double CrossProduct(const vec<double,2>& b) const
    {
        // Multiply by the lane-swapped operand: tmp = { x*b.y, y*b.x }
        __m128d tmp = _mm_mul_pd(d[0], _mm_shuffle_pd(b.d[0], b.d[0], 1));
        // Difference of the two lanes gives the determinant.
#ifdef __SSE3__
        return _mm_hsub_pd(tmp, tmp)[0];
#else
        return tmp[0] - tmp[1];
#endif
    }
};
#endif
#ifdef ENABLE_AVX_DOUBLE_4
// vec<double,4>: one AVX register.
template<> struct vec<double,4>: public double_AVX_vec<4>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
};
#elif defined(ENABLE_SSE2_DOUBLE_4)
// vec<double,4>: two SSE2 registers.
template<> struct vec<double,4>: public double_SSE2_vec<4>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_SSE2_DOUBLE_6
// vec<double,6>: three SSE2 registers (also used by the 3D cross product).
template<> struct vec<double,6>: public double_SSE2_vec<6>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_AVX_DOUBLE_8
// vec<double,8>: two AVX registers.
template<> struct vec<double,8>: public double_AVX_vec<8>
{
public:
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
};
#endif
#ifdef ENABLE_AVX_DOUBLE_3
// vec<double,3>: one AVX register (fourth lane is padding).
template<> struct vec<double,3>: public double_AVX_vec<3>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : double_AVX_vec(std::forward<T>(args)...) {}
    vec<double,3> CrossProduct(const vec<double,3>& b) const;
};
#elif defined(ENABLE_SSE2_DOUBLE_3)
// vec<double,3>: one SSE2 register + one scalar tail element.
template<> struct vec<double,3>: public double_SSE2_vec<3,1>
{
public:
    template<typename...T>
    vec(T&&... args) : double_SSE2_vec(std::forward<T>(args)...) {}
    vec<double,3> CrossProduct(const vec<double,3>& b) const;
};
#endif
#if defined(ENABLE_AVX_DOUBLE_3) || defined(ENABLE_SSE2_DOUBLE_3)
// 3D cross product as two permuted vector multiplies and one subtraction:
//   result = (y,z,x)*(b.z,b.x,b.y) - (z,x,y)*(b.y,b.z,b.x)
// Elements are read through operator[] so the code is valid for every
// storage layout.  The old x/y/z macros expanded to d[0][0]/d[0][1]/d[1][0],
// which indexed past the single-register d[] of double_AVX_vec<3> and of
// double_SSE2_vec<3,1> (whose third element lives in extra[0]).
// `inline` keeps this out-of-class definition ODR-safe in a header.
inline vec<double,3> vec<double,3>::CrossProduct(const vec<double,3>& b) const
{
    const double x = (*this)[0], y = (*this)[1], z = (*this)[2];
#ifdef ENABLE_AVX_DOUBLE_8
    vec<double,8> data1{   y,    z,    x, 0.0,    z,    x,    y, 0.0};
    vec<double,8> data2{b[2], b[0], b[1], 0.0, b[1], b[2], b[0], 0.0};
    auto res = data1 * data2;
    return res.template Limit<4,0>() - res.template Limit<4,4>();
#else
    vec<double,6> data1{   y,    z,    x,    z,    x,    y};
    vec<double,6> data2{b[2], b[0], b[1], b[1], b[2], b[0]};
    auto res = data1 * data2;
    return res.template Limit<3,0>() - res.template Limit<3,3>();
#endif
}
#endif
#ifdef ENABLE_SSE_FLOAT_3
// vec<float,3> specialization with a shuffle-based 3D cross product:
//   result = (a.yzx * b.zxy) - (a.zxy * b.yzx)
// NOTE(review): this body treats d[0] as an __m128 register, which matches
// float_SSE_vecFull's storage, not the scalar float array of the
// float_SSE_vec base it actually derives from — the option is disabled by
// default (see the commented-out ENABLE_SSE_FLOAT_3 above) and looks like
// it would not compile as-is; confirm before enabling.
template<> struct vec<float,3>: public float_SSE_vec<3>
{
public:
    // Forward every constructor form to the SIMD base class.
    template<typename...T>
    vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {}
    vec<float,3> CrossProduct(const vec<float,3>& b) const
    {
        vec<float,3> result;
        result.d[0] = _mm_sub_ps(
            _mm_mul_ps(_mm_shuffle_ps(d[0],d[0], _MM_SHUFFLE(0,0,2,1)), _mm_shuffle_ps(b.d[0],b.d[0], _MM_SHUFFLE(0,1,0,2))),
            _mm_mul_ps(_mm_shuffle_ps(d[0],d[0], _MM_SHUFFLE(0,1,0,2)), _mm_shuffle_ps(b.d[0],b.d[0], _MM_SHUFFLE(0,0,2,1)))
        );
        return result;
    }
};
#else
// This does help Clang a bit.
/*#ifdef H1
vec<float,3> CrossProduct(const vec<float,3>& a, const vec<float,3>& b)
{
    __m128 av = _mm_loadu_ps(&a[0]), bv = _mm_loadu_ps(&b[0]);
    __m128 d0 = _mm_shuffle_ps(av,av, _MM_SHUFFLE(0,0,2,1)), d1 = _mm_shuffle_ps(bv,bv, _MM_SHUFFLE(0,1,0,2));
    __m128 d2 = _mm_shuffle_ps(av,av, _MM_SHUFFLE(0,1,0,2)), d3 = _mm_shuffle_ps(bv,bv, _MM_SHUFFLE(0,0,2,1));
    auto res = _mm_sub_ps(_mm_mul_ps(d0,d1), _mm_mul_ps(d2,d3));
    return vec<float,3>{(float)res[0],(float)res[1],(float)res[2]};
}
#endif*/
#endif
// --- vec<float,N> specialization wrappers ------------------------------
// Each group prefers the AVX implementation when its ENABLE_* switch is
// defined, falling back to the SSE one.  float_SSE_vecFull (register
// storage) is chosen when N is a multiple of 4; float_SSE_vec (scalar
// storage) otherwise, with an explicit TableSize/Extra split for 5 and 9.
#ifdef ENABLE_SSE_FLOAT_4
template<> struct vec<float,4>: public float_SSE_vecFull<4> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#elif defined(ENABLE_AVX_FLOAT_4)
template<> struct vec<float,4>: public float_AVX_vec<4> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_5
template<> struct vec<float,5>: public float_AVX_vec<5> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_5)
template<> struct vec<float,5>: public float_SSE_vec<5,1> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_6
template<> struct vec<float,6>: public float_AVX_vec<6> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_6)
template<> struct vec<float,6>: public float_SSE_vec<6> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_7
template<> struct vec<float,7>: public float_AVX_vec<7> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_7)
template<> struct vec<float,7>: public float_SSE_vec<7> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_8
template<> struct vec<float,8>: public float_AVX_vec<8> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_8)
template<> struct vec<float,8>: public float_SSE_vecFull<8> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_9
template<> struct vec<float,9>: public float_AVX_vec<9> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_9)
template<> struct vec<float,9>: public float_SSE_vec<9,2> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_10
template<> struct vec<float,10>: public float_AVX_vec<10> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_10)
template<> struct vec<float,10>: public float_SSE_vec<10> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_11
template<> struct vec<float,11>: public float_AVX_vec<11> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_11)
template<> struct vec<float,11>: public float_SSE_vec<11> { public: template<typename...T> vec(T&&... args) : float_SSE_vec(std::forward<T>(args)...) {} };
#endif
#ifdef ENABLE_AVX_FLOAT_12
template<> struct vec<float,12>: public float_AVX_vec<12> { public: template<typename...T> vec(T&&... args) : float_AVX_vec(std::forward<T>(args)...) {} };
#elif defined(ENABLE_SSE_FLOAT_12)
template<> struct vec<float,12>: public float_SSE_vecFull<12> { public: template<typename...T> vec(T&&... args) : float_SSE_vecFull(std::forward<T>(args)...) {} };
#endif
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement