Advertisement
Zgragselus

Heresy on Bagel

Aug 16th, 2023
703
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.19 KB | None | 0 0
  1. inline mat4 inverse(const mat4& m)
  2. {
  3.     __m128 f1 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0xAA),
  4.         _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xFF), _mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)),
  5.         _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xAA), _mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80),
  6.             _mm_shuffle_ps(m.m3, m.m2, 0xFF)));
  7.  
  8.     __m128 f2 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x55),
  9.         _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xFF), _mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)),
  10.         _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x55), _mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80),
  11.             _mm_shuffle_ps(m.m3, m.m2, 0xFF)));
  12.  
  13.     __m128 f3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x55),
  14.         _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xAA), _mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80)),
  15.         _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x55), _mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80),
  16.             _mm_shuffle_ps(m.m3, m.m2, 0xAA)));
  17.  
  18.     __m128 f4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00),
  19.         _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xFF), _mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)),
  20.         _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x00), _mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80),
  21.             _mm_shuffle_ps(m.m3, m.m2, 0xFF)));
  22.  
  23.     __m128 f5 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00),
  24.         _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0xAA), _mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80)),
  25.         _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x00), _mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80),
  26.             _mm_shuffle_ps(m.m3, m.m2, 0xAA)));
  27.  
  28.     __m128 f6 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00),
  29.         _mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x55), _mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80)),
  30.         _mm_mul_ps(_mm_shuffle_ps(_mm_shuffle_ps(m.m4, m.m3, 0x00), _mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80),
  31.             _mm_shuffle_ps(m.m3, m.m2, 0x55)));
  32.  
  33.     __m128 v1 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0x00), _mm_shuffle_ps(m.m2, m.m1, 0x00), 0xA8);
  34.     __m128 v2 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0x55), _mm_shuffle_ps(m.m2, m.m1, 0x55), 0xA8);
  35.     __m128 v3 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0xAA), _mm_shuffle_ps(m.m2, m.m1, 0xAA), 0xA8);
  36.     __m128 v4 = _mm_shuffle_ps(_mm_shuffle_ps(m.m2, m.m1, 0xFF), _mm_shuffle_ps(m.m2, m.m1, 0xFF), 0xA8);
  37.     __m128 s1 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
  38.     __m128 s2 = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
  39.     __m128 i1 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v2, f1),
  40.         _mm_mul_ps(v3, f2)),
  41.         _mm_mul_ps(v4, f3)));
  42.     __m128 i2 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f1),
  43.         _mm_mul_ps(v3, f4)),
  44.         _mm_mul_ps(v4, f5)));
  45.     __m128 i3 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f2),
  46.         _mm_mul_ps(v2, f4)),
  47.         _mm_mul_ps(v4, f6)));
  48.     __m128 i4 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f3),
  49.         _mm_mul_ps(v2, f5)),
  50.         _mm_mul_ps(v3, f6)));
  51.     __m128 d = _mm_mul_ps(m.m1, _mm_movelh_ps(_mm_unpacklo_ps(i1, i2), _mm_unpacklo_ps(i3, i4)));
  52.     d = _mm_add_ps(d, _mm_shuffle_ps(d, d, 0x4E));
  53.     d = _mm_add_ps(d, _mm_shuffle_ps(d, d, 0x11));
  54.     d = _mm_div_ps(_mm_set1_ps(1.0f), d);
  55.     return mat4(float4(_mm_mul_ps(i1, d)),
  56.         float4(_mm_mul_ps(i2, d)),
  57.         float4(_mm_mul_ps(i3, d)),
  58.         float4(_mm_mul_ps(i4, d)));
  59. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement