Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ----BY EXTENSION----
- AVX: {
- arithmetic: {
- __m256 _mm256_add_ps (__m256 a, __m256 b): add packed f32
- __m256 _mm256_div_ps (__m256 a, __m256 b): divide packed f32
- __m256 _mm256_mul_ps (__m256 a, __m256 b): multiply packed f32
- __m256 _mm256_sub_ps (__m256 a, __m256 b): subtract packed f32
- __m256 _mm256_hadd_ps (__m256 a, __m256 b): horiz. add packed f32
- }
- convert: {
- __m256i _mm256_cvtps_epi32 (__m256 a): packed f32 to i32 (w/ rounding)
- __m256i _mm256_cvttps_epi32 (__m256 a): packed f32 to i32 (w/ truncation)
- __m256 _mm256_cvtepi32_ps (__m256i a): packed i32 to f32 (w/ rounding)
- }
- load: {
- __m256 _mm256_load_ps (float const * mem_addr): f32 (32B aligned for dst.mem_addr)
- __m256 _mm256_loadu_ps (float const * mem_addr): f32 (no alignment for dst.mem_addr)
- __m256i _mm256_load_si256 (__m256i const * mem_addr): int (32B aligned for dst.mem_addr)
- __m256i _mm256_loadu_si256 (__m256i const * mem_addr): int (no alignment for dst.mem_addr)
- }
- store: {
- void _mm256_store_ps (float * mem_addr, __m256 a): f32 (32B aligned for mem_addr)
- void _mm256_storeu_ps (float * mem_addr, __m256 a): f32 (no alignment for mem_addr)
- void _mm256_store_si256 (__m256i * mem_addr, __m256i a): int (32B aligned for mem_addr)
- void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a): int (no alignment for mem_addr)
- }
- set: {
- __m256i _mm256_set1_epi8 (char a): set all elements of dst to i8 value
- __m256i _mm256_set1_epi16 (short a): set all elements of dst to i16 value
- __m256i _mm256_set1_epi32 (int a): set all elements of dst to i32 value
- __m256 _mm256_set1_ps (float a): set all elements of dst to f32 value
- }
- special math: {
- __m256 _mm256_max_ps (__m256 a, __m256 b): max() on packed f32
- __m256 _mm256_min_ps (__m256 a, __m256 b): min() on packed f32
- __m256 _mm256_round_ps (__m256 a, int rounding): round packed f32
- }
- swizzle: {
- __m256 _mm256_unpackhi_ps (__m256 a, __m256 b): unpack & interleave f32 from high 128b of a&b
- __m256 _mm256_unpacklo_ps (__m256 a, __m256 b): unpack & interleave f32 from low 128b of a&b
- }
- }
- AVX2: {
- arithmetic: {
- __m256i _mm256_add_epi8 (__m256i a, __m256i b): add packed i8
- __m256i _mm256_add_epi16 (__m256i a, __m256i b): add packed i16
- __m256i _mm256_add_epi32 (__m256i a, __m256i b): add packed i32
- __m256i _mm256_adds_epi8 (__m256i a, __m256i b): add packed i8 w/ saturation
- __m256i _mm256_adds_epi16 (__m256i a, __m256i b): add packed i16 w/ saturation
- __m256i _mm256_adds_epu8 (__m256i a, __m256i b): add packed u8 w/ saturation
- __m256i _mm256_adds_epu16 (__m256i a, __m256i b): add packed u16 w/ saturation
- __m256i _mm256_sub_epi8 (__m256i a, __m256i b): subtract packed i8
- __m256i _mm256_sub_epi16 (__m256i a, __m256i b): subtract packed i16
- __m256i _mm256_sub_epi32 (__m256i a, __m256i b): subtract packed i32
- __m256i _mm256_subs_epi8 (__m256i a, __m256i b): subtract packed i8 w/ saturation
- __m256i _mm256_subs_epi16 (__m256i a, __m256i b): subtract packed i16 w/ saturation
- __m256i _mm256_subs_epu8 (__m256i a, __m256i b): subtract packed u8 w/ saturation
- __m256i _mm256_subs_epu16 (__m256i a, __m256i b): subtract packed u16 w/ saturation
- __m256i _mm256_hadd_epi16 (__m256i a, __m256i b): horiz. add packed i16 pairs
- __m256i _mm256_hadds_epi16 (__m256i a, __m256i b): horiz. add packed i16 pairs w/ saturation
- __m256i _mm256_hadd_epi32 (__m256i a, __m256i b): horiz. add packed i32 pairs
- }
- convert: {
- __m256i _mm256_cvtepi8_epi16 (__m128i a): sign-extend packed i8 to i16
- __m256i _mm256_cvtepi16_epi32 (__m128i a): sign-extend packed i16 to i32
- __m256i _mm256_cvtepi8_epi32 (__m128i a): sign-extend packed i8 to i32
- __m256i _mm256_cvtepu8_epi16 (__m128i a): zero-extend packed u8 to i16
- __m256i _mm256_cvtepu8_epi32 (__m128i a): zero-extend packed u8 to i32
- __m256i _mm256_cvtepu16_epi32 (__m128i a): zero-extend packed u16 to i32
- }
- prob/stat: {
- __m256i _mm256_avg_epu8 (__m256i a, __m256i b): average packed u8
- __m256i _mm256_avg_epu16 (__m256i a, __m256i b): average packed u16
- }
- special math: {
- __m256i _mm256_max_epi8 (__m256i a, __m256i b): max() on packed i8
- __m256i _mm256_max_epi16 (__m256i a, __m256i b): max() on packed i16
- __m256i _mm256_max_epi32 (__m256i a, __m256i b): max() on packed i32
- __m256i _mm256_max_epu8 (__m256i a, __m256i b): max() on packed u8
- __m256i _mm256_max_epu16 (__m256i a, __m256i b): max() on packed u16
- __m256i _mm256_max_epu32 (__m256i a, __m256i b): max() on packed u32
- __m256i _mm256_min_epi8 (__m256i a, __m256i b): min() on packed i8
- __m256i _mm256_min_epi16 (__m256i a, __m256i b): min() on packed i16
- __m256i _mm256_min_epi32 (__m256i a, __m256i b): min() on packed i32
- __m256i _mm256_min_epu8 (__m256i a, __m256i b): min() on packed u8
- __m256i _mm256_min_epu16 (__m256i a, __m256i b): min() on packed u16
- __m256i _mm256_min_epu32 (__m256i a, __m256i b): min() on packed u32
- }
- misc: {
- __m256i _mm256_packs_epi32 (__m256i a, __m256i b): packed i32 to i16 (w/ signed saturation)
- __m256i _mm256_packs_epi16 (__m256i a, __m256i b): packed i16 to i8 (w/ signed saturation)
- __m256i _mm256_packus_epi32 (__m256i a, __m256i b): packed i32 to i16 (w/ unsigned saturation)
- __m256i _mm256_packus_epi16 (__m256i a, __m256i b): packed i16 to i8 (w/ unsigned saturation)
- }
- swizzle: {
- __m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b): unpack and interleave i8 from high 128b of a&b
- __m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b): unpack and interleave i16 from high 128b of a&b
- __m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b): unpack and interleave i32 from high 128b of a&b
- __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b): unpack and interleave i8 from low 128b of a&b
- __m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b): unpack and interleave i16 from low 128b of a&b
- __m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b): unpack and interleave i32 from low 128b of a&b
- }
- }
- SSE: {
- arithmetic: {
- __m128 _mm_add_ps (__m128 a, __m128 b): add packed f32
- __m128 _mm_div_ps (__m128 a, __m128 b): divide packed f32
- __m128 _mm_mul_ps (__m128 a, __m128 b): multiply packed f32
- __m128 _mm_sub_ps (__m128 a, __m128 b): subtract packed f32
- }
- convert: {
- __m128 _mm_cvtpi16_ps (__m64 a): packed i16 to f32
- __m128 _mm_cvtpu16_ps (__m64 a): packed u16 to f32
- __m64 _mm_cvttps_pi32 (__m128 a): packed f32 to i32 (w/ truncation)
- __m64 _mm_cvtt_ps2pi (__m128 a): alias of _mm_cvttps_pi32
- }
- load: {
- __m128 _mm_load_ps1 (float const* mem_addr): basically an f32 set1 but from memory
- __m128 _mm_load1_ps (float const* mem_addr): alias of _mm_load_ps1
- __m128 _mm_load_ps (float const* mem_addr): f32 (16B alignment for dst.mem_addr)
- __m128 _mm_loadu_ps (float const* mem_addr): f32 (no alignment for dst.mem_addr)
- }
- store: {
- void _mm_store_ps (float* mem_addr, __m128 a): f32 (16B alignment for mem_addr)
- void _mm_storeu_ps (float* mem_addr, __m128 a): f32 (no alignment for mem_addr)
- }
- move: {
- __m128 _mm_movehl_ps (__m128 a, __m128 b): f32; b's hi 2 to low dst, a's hi 2 to high dst
- }
- prob/stat: {
- __m64 _mm_avg_pu8 (__m64 a, __m64 b): average packed u8
- __m64 _mm_avg_pu16 (__m64 a, __m64 b): average packed u16
- __m64 _m_pavgb (__m64 a, __m64 b): average packed u8 (alias of _mm_avg_pu8)
- __m64 _m_pavgw (__m64 a, __m64 b): average packed u16 (alias of _mm_avg_pu16)
- }
- set: {
- __m128 _mm_set1_ps (float a): set all elements of dst to f32
- }
- special math: {
- __m64 _mm_max_pu8 (__m64 a, __m64 b): max() packed u8
- __m64 _mm_max_pi16 (__m64 a, __m64 b): max() packed i16
- __m128 _mm_max_ps (__m128 a, __m128 b): max() packed f32
- __m64 _mm_min_pu8 (__m64 a, __m64 b): min() packed u8
- __m64 _mm_min_pi16 (__m64 a, __m64 b): min() packed i16
- __m128 _mm_min_ps (__m128 a, __m128 b): min() packed f32
- __m64 _m_pmaxub (__m64 a, __m64 b): max() packed u8 (alias of _mm_max_pu8)
- __m64 _m_pmaxsw (__m64 a, __m64 b): max() packed i16 (alias of _mm_max_pi16)
- __m64 _m_pminub (__m64 a, __m64 b): min() packed u8 (alias of _mm_min_pu8)
- __m64 _m_pminsw (__m64 a, __m64 b): min() packed i16 (alias of _mm_min_pi16)
- }
- swizzle: {
- __m128 _mm_unpackhi_ps (__m128 a, __m128 b): unpack & interleave f32 from high 64b of a&b
- __m128 _mm_unpacklo_ps (__m128 a, __m128 b): unpack & interleave f32 from low 64b of a&b
- __m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8): shuffle f32 using control
- }
- }
- SSE2: {
- arithmetic: {
- __m128i _mm_add_epi8 (__m128i a, __m128i b): add packed i8
- __m128i _mm_add_epi16 (__m128i a, __m128i b): add packed i16
- __m128i _mm_add_epi32 (__m128i a, __m128i b): add packed i32
- __m128i _mm_adds_epi8 (__m128i a, __m128i b): add packed i8 (w/ saturation)
- __m128i _mm_adds_epi16 (__m128i a, __m128i b): add packed i16 (w/ saturation)
- __m128i _mm_adds_epu8 (__m128i a, __m128i b): add packed u8 (w/ saturation)
- __m128i _mm_adds_epu16 (__m128i a, __m128i b): add packed u16 (w/ saturation)
- __m128i _mm_sub_epi8 (__m128i a, __m128i b): subtract packed i8
- __m128i _mm_sub_epi16 (__m128i a, __m128i b): subtract packed i16
- __m128i _mm_sub_epi32 (__m128i a, __m128i b): subtract packed i32
- __m128i _mm_subs_epi8 (__m128i a, __m128i b): subtract packed i8 (w/ saturation)
- __m128i _mm_subs_epi16 (__m128i a, __m128i b): subtract packed i16 (w/ saturation)
- __m128i _mm_subs_epu8 (__m128i a, __m128i b): subtract packed u8 (w/ saturation)
- __m128i _mm_subs_epu16 (__m128i a, __m128i b): subtract packed u16 (w/ saturation)
- }
- convert: {
- __m128 _mm_cvtepi32_ps (__m128i a): packed i32 to f32
- __m128i _mm_cvtps_epi32 (__m128 a): packed f32 to i32 (w/ rounding)
- __m128i _mm_cvttps_epi32 (__m128 a): packed f32 to i32 (w/ truncation)
- }
- load: {
- __m128i _mm_load_si128 (__m128i const* mem_addr): 128-bits int (16B aligned for dst.mem_addr)
- __m128i _mm_loadu_si128 (__m128i const* mem_addr): 128-bits int (no alignment for dst.mem_addr)
- }
- store: {
- void _mm_store_si128 (__m128i* mem_addr, __m128i a): 128-bits int (16B aligned for mem_addr)
- void _mm_storeu_si128 (__m128i* mem_addr, __m128i a): 128-bits int (no alignment for mem_addr)
- }
- prob/stat: {
- __m128i _mm_avg_epu8 (__m128i a, __m128i b): average packed u8
- __m128i _mm_avg_epu16 (__m128i a, __m128i b): average packed u16
- }
- set: {
- __m128i _mm_set1_epi8 (char a): set all elements of dst to i8
- __m128i _mm_set1_epi16 (short a): set all elements of dst to i16
- __m128i _mm_set1_epi32 (int a): set all elements of dst to i32
- }
- special math: {
- __m128i _mm_max_epu8 (__m128i a, __m128i b): max() packed u8
- __m128i _mm_max_epi16 (__m128i a, __m128i b): max() packed i16
- __m128i _mm_min_epu8 (__m128i a, __m128i b): min() packed u8
- __m128i _mm_min_epi16 (__m128i a, __m128i b): min() packed i16
- }
- swizzle: {
- __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b): unpack & interleave i8 from high 64b of a&b
- __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b): unpack & interleave i16 from high 64b of a&b
- __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b): unpack & interleave i32 from high 64b of a&b
- __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b): unpack & interleave i8 from low 64b of a&b
- __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b): unpack & interleave i16 from low 64b of a&b
- __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b): unpack & interleave i32 from low 64b of a&b
- }
- }
- SSE3: {
- arithmetic: {
- __m128 _mm_hadd_ps (__m128 a, __m128 b): horiz. add packed f32 pairs
- }
- }
- SSE41: {
- convert: {
- __m128i _mm_cvtepi8_epi16 (__m128i a): sign-extend packed i8 to i16
- __m128i _mm_cvtepi8_epi32 (__m128i a): sign-extend packed i8 to i32
- __m128i _mm_cvtepi16_epi32 (__m128i a): sign-extend packed i16 to i32
- __m128i _mm_cvtepu8_epi16 (__m128i a): zero-extend packed u8 to i16
- __m128i _mm_cvtepu8_epi32 (__m128i a): zero-extend packed u8 to i32
- __m128i _mm_cvtepu16_epi32 (__m128i a): zero-extend packed u16 to i32
- }
- special math: {
- __m128i _mm_max_epi8 (__m128i a, __m128i b): max() packed i8
- __m128i _mm_max_epi32 (__m128i a, __m128i b): max() packed i32
- __m128i _mm_max_epu16 (__m128i a, __m128i b): max() packed u16
- __m128i _mm_max_epu32 (__m128i a, __m128i b): max() packed u32
- __m128i _mm_min_epi8 (__m128i a, __m128i b): min() packed i8
- __m128i _mm_min_epi32 (__m128i a, __m128i b): min() packed i32
- __m128i _mm_min_epu16 (__m128i a, __m128i b): min() packed u16
- __m128i _mm_min_epu32 (__m128i a, __m128i b): min() packed u32
- __m128 _mm_round_ps (__m128 a, int rounding): round packed f32
- }
- }
- ----BY OPERATION----
- add: {
- SSE, __m128 ->__m128 : f32 (_mm_add_ps)
- SSE2, __m128i->__m128i: i8 (_mm_add_epi8)
- SSE2, __m128i->__m128i: i16 (_mm_add_epi16)
- SSE2, __m128i->__m128i: i32 (_mm_add_epi32)
- AVX, __m256 ->__m256 : f32 (_mm256_add_ps)
- AVX2, __m256i->__m256i: i8 (_mm256_add_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_add_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_add_epi32)
- }
- divide: {
- SSE, __m128 ->__m128 : f32 (_mm_div_ps)
- AVX, __m256->__m256: f32 (_mm256_div_ps)
- }
- multiply: {
- SSE, __m128 ->__m128 : f32 (_mm_mul_ps)
- AVX, __m256->__m256: f32 (_mm256_mul_ps)
- }
- subtract: {
- SSE, __m128 ->__m128 : f32 (_mm_sub_ps)
- SSE2, __m128i->__m128i: i8 (_mm_sub_epi8)
- SSE2, __m128i->__m128i: i16 (_mm_sub_epi16)
- SSE2, __m128i->__m128i: i32 (_mm_sub_epi32)
- AVX, __m256 ->__m256 : f32 (_mm256_sub_ps)
- AVX2, __m256i->__m256i: i8 (_mm256_sub_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_sub_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_sub_epi32)
- }
- add w/ saturation: {
- SSE2, __m128i->__m128i: i8 (_mm_adds_epi8)
- SSE2, __m128i->__m128i: i16 (_mm_adds_epi16)
- SSE2, __m128i->__m128i: u8 (_mm_adds_epu8)
- SSE2, __m128i->__m128i: u16 (_mm_adds_epu16)
- AVX2, __m256i->__m256i: i8 (_mm256_adds_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_adds_epi16)
- AVX2, __m256i->__m256i: u8 (_mm256_adds_epu8)
- AVX2, __m256i->__m256i: u16 (_mm256_adds_epu16)
- }
- subtract w/ saturation: {
- SSE2, __m128i->__m128i: i8 (_mm_subs_epi8)
- SSE2, __m128i->__m128i: i16 (_mm_subs_epi16)
- SSE2, __m128i->__m128i: u8 (_mm_subs_epu8)
- SSE2, __m128i->__m128i: u16 (_mm_subs_epu16)
- AVX2, __m256i->__m256i: i8 (_mm256_subs_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_subs_epi16)
- AVX2, __m256i->__m256i: u8 (_mm256_subs_epu8)
- AVX2, __m256i->__m256i: u16 (_mm256_subs_epu16)
- }
- horiz. add: {
- SSE3, __m128 ->__m128 : f32 (_mm_hadd_ps)
- AVX, __m256 ->__m256 : f32 (_mm256_hadd_ps)
- AVX2, __m256i->__m256i: i16 (_mm256_hadd_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_hadd_epi32)
- }
- horiz. add w/ saturation: {
- AVX2, __m256i->__m256i: i16 (_mm256_hadds_epi16)
- }
- convert: {
- rounding or n/a: {
- SSE, __m64 ->__m128 : i16 to f32 (_mm_cvtpi16_ps)
- SSE, __m64 ->__m128 : u16 to f32 (_mm_cvtpu16_ps)
- SSE2, __m128i->__m128 : i32 to f32 (_mm_cvtepi32_ps)
- SSE2, __m128 ->__m128i: f32 to i32 (_mm_cvtps_epi32)
- AVX, __m256 ->__m256i: f32 to i32 (_mm256_cvtps_epi32)
- AVX, __m256i->__m256 : i32 to f32 (_mm256_cvtepi32_ps)
- }
- truncation: {
- SSE, __m128->__m64 : f32 to i32 (_mm_cvttps_pi32)
- SSE2, __m128->__m128i: f32 to i32 (_mm_cvttps_epi32)
- AVX, __m256->__m256i: f32 to i32 (_mm256_cvttps_epi32)
- }
- signed saturation: {
- (narrowing conversions live under "pack w/ signed saturation" below, e.g. _mm256_packs_epi32, _mm256_packs_epi16)
- }
- }
- load (alignment is 16B for m128, 32B for m256): {
- SSE, float*->__m128 : f32 (_mm_load_ps)
- SSE2, __m128i*->__m128i: int (_mm_load_si128)
- AVX, float*->__m256 : f32 (_mm256_load_ps)
- AVX, __m256i*->__m256i: int (_mm256_load_si256)
- }
- load unaligned: {
- SSE, float*->__m128 : f32 (_mm_load1_ps) (acts like set1)
- SSE, float*->__m128 : f32 (_mm_loadu_ps)
- SSE2, __m128i*->__m128i: int (_mm_loadu_si128)
- AVX, float*->__m256 : f32 (_mm256_loadu_ps)
- AVX, __m256i*->__m256i: int (_mm256_loadu_si256)
- }
- store (alignment is 16B for m128, 32B for m256): {
- SSE, __m128 ->float* : f32 (_mm_store_ps)
- SSE2, __m128i->__m128i*: int (_mm_store_si128)
- AVX, __m256 ->float* : f32 (_mm256_store_ps)
- AVX, __m256i->__m256i*: int (_mm256_store_si256)
- }
- store unaligned: {
- SSE, __m128 ->float* : f32 (_mm_storeu_ps)
- SSE2, __m128i->__m128i*: int (_mm_storeu_si128)
- AVX, __m256 ->float* : f32 (_mm256_storeu_ps)
- AVX, __m256i->__m256i*: int (_mm256_storeu_si256)
- }
- movehl: {
- SSE, __m128->__m128: f32 (_mm_movehl_ps)
- }
- shuffle: {
- SSE, __m128->__m128: f32 using control (_mm_shuffle_ps)
- }
- set1: {
- SSE, float->__m128 : f32 (_mm_set1_ps)
- SSE2, char ->__m128i: i8 (_mm_set1_epi8)
- SSE2, short->__m128i: i16 (_mm_set1_epi16)
- SSE2, int ->__m128i: i32 (_mm_set1_epi32)
- AVX, char ->__m256i: i8 (_mm256_set1_epi8)
- AVX, short->__m256i: i16 (_mm256_set1_epi16)
- AVX, int ->__m256i: i32 (_mm256_set1_epi32)
- AVX, float->__m256 : f32 (_mm256_set1_ps)
- }
- unpack high: {
- SSE, __m128 ->__m128 : f32 (_mm_unpackhi_ps)
- SSE2, __m128i->__m128i: i8 (_mm_unpackhi_epi8)
- SSE2, __m128i->__m128i: i16 (_mm_unpackhi_epi16)
- SSE2, __m128i->__m128i: i32 (_mm_unpackhi_epi32)
- AVX, __m256 ->__m256 : f32 (_mm256_unpackhi_ps)
- AVX2, __m256i->__m256i: i8 (_mm256_unpackhi_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_unpackhi_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_unpackhi_epi32)
- }
- unpack low: {
- SSE, __m128 ->__m128 : f32 (_mm_unpacklo_ps)
- SSE2, __m128i->__m128i: i8 (_mm_unpacklo_epi8)
- SSE2, __m128i->__m128i: i16 (_mm_unpacklo_epi16)
- SSE2, __m128i->__m128i: i32 (_mm_unpacklo_epi32)
- AVX, __m256 ->__m256 : f32 (_mm256_unpacklo_ps)
- AVX2, __m256i->__m256i: i8 (_mm256_unpacklo_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_unpacklo_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_unpacklo_epi32)
- }
- pack w/ signed saturation: {
- AVX2, __m256i->__m256i: i32 to i16 (_mm256_packs_epi32)
- AVX2, __m256i->__m256i: i16 to i8 (_mm256_packs_epi16)
- }
- pack w/ unsigned saturation: {
- AVX2, __m256i->__m256i: i32 to i16 (_mm256_packus_epi32)
- AVX2, __m256i->__m256i: i16 to i8 (_mm256_packus_epi16)
- }
- sign-extend: {
- SSE41, __m128i->__m128i: i8 to i16 (_mm_cvtepi8_epi16)
- SSE41, __m128i->__m128i: i8 to i32 (_mm_cvtepi8_epi32)
- SSE41, __m128i->__m128i: i16 to i32 (_mm_cvtepi16_epi32)
- AVX2, __m128i->__m256i: i8 to i16 (_mm256_cvtepi8_epi16)
- AVX2, __m128i->__m256i: i8 to i32 (_mm256_cvtepi8_epi32)
- AVX2, __m128i->__m256i: i16 to i32 (_mm256_cvtepi16_epi32)
- }
- zero-extend: {
- SSE41, __m128i->__m128i: u8 to i16 (_mm_cvtepu8_epi16)
- SSE41, __m128i->__m128i: u8 to i32 (_mm_cvtepu8_epi32)
- SSE41, __m128i->__m128i: u16 to i32 (_mm_cvtepu16_epi32)
- AVX2, __m128i->__m256i: u8 to i16 (_mm256_cvtepu8_epi16)
- AVX2, __m128i->__m256i: u8 to i32 (_mm256_cvtepu8_epi32)
- AVX2, __m128i->__m256i: u16 to i32 (_mm256_cvtepu16_epi32)
- }
- average: {
- SSE, __m64 ->__m64 : u8 (_mm_avg_pu8)
- SSE, __m64 ->__m64 : u16 (_mm_avg_pu16)
- SSE2, __m128i->__m128i: u8 (_mm_avg_epu8)
- SSE2, __m128i->__m128i: u16 (_mm_avg_epu16)
- AVX2, __m256i->__m256i: u8 (_mm256_avg_epu8)
- AVX2, __m256i->__m256i: u16 (_mm256_avg_epu16)
- }
- max: {
- SSE, __m64 ->__m64 : u8 (_mm_max_pu8)
- SSE, __m64 ->__m64 : i16 (_mm_max_pi16)
- SSE, __m128 ->__m128 : f32 (_mm_max_ps)
- SSE2, __m128i->__m128i: u8 (_mm_max_epu8)
- SSE2, __m128i->__m128i: i16 (_mm_max_epi16)
- SSE41, __m128i->__m128i: i8 (_mm_max_epi8)
- SSE41, __m128i->__m128i: i32 (_mm_max_epi32)
- SSE41, __m128i->__m128i: u16 (_mm_max_epu16)
- SSE41, __m128i->__m128i: u32 (_mm_max_epu32)
- AVX, __m256 ->__m256 : f32 (_mm256_max_ps)
- AVX2, __m256i->__m256i: i8 (_mm256_max_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_max_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_max_epi32)
- AVX2, __m256i->__m256i: u8 (_mm256_max_epu8)
- AVX2, __m256i->__m256i: u16 (_mm256_max_epu16)
- AVX2, __m256i->__m256i: u32 (_mm256_max_epu32)
- }
- min: {
- SSE, __m64 ->__m64 : u8 (_mm_min_pu8)
- SSE, __m64 ->__m64 : i16 (_mm_min_pi16)
- SSE, __m128 ->__m128 : f32 (_mm_min_ps)
- SSE2, __m128i->__m128i: u8 (_mm_min_epu8)
- SSE2, __m128i->__m128i: i16 (_mm_min_epi16)
- SSE41, __m128i->__m128i: i8 (_mm_min_epi8)
- SSE41, __m128i->__m128i: i32 (_mm_min_epi32)
- SSE41, __m128i->__m128i: u16 (_mm_min_epu16)
- SSE41, __m128i->__m128i: u32 (_mm_min_epu32)
- AVX, __m256 ->__m256 : f32 (_mm256_min_ps)
- AVX2, __m256i->__m256i: i8 (_mm256_min_epi8)
- AVX2, __m256i->__m256i: i16 (_mm256_min_epi16)
- AVX2, __m256i->__m256i: i32 (_mm256_min_epi32)
- AVX2, __m256i->__m256i: u8 (_mm256_min_epu8)
- AVX2, __m256i->__m256i: u16 (_mm256_min_epu16)
- AVX2, __m256i->__m256i: u32 (_mm256_min_epu32)
- }
- round: {
- SSE41, __m128->__m128: f32 (_mm_round_ps)
- AVX, __m256->__m256: f32 (_mm256_round_ps)
- }
- (reminder: allow mono streams to have stereo volume applied to them)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement