/**
 * @file llmatrix4a.h
 * @brief LLMatrix4a class header file - memory aligned and vectorized 4x4 matrix
 *
 * $LicenseInfo:firstyear=2007&license=viewergpl$
 *
 * Copyright (C) 2010, Linden Research, Inc.
 *
 * Second Life Viewer Source Code
 * The source code in this file ("Source Code") is provided by Linden Lab
 * to you under the terms of the GNU General Public License, version 2.0
 * ("GPL"), unless you have obtained a separate licensing agreement
 * ("Other License"), formally executed by you and Linden Lab. Terms of
 * the GPL can be found in doc/GPL-license.txt in this distribution, or
 * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
 *
 * There are special exceptions to the terms and conditions of the GPL as
 * it is applied to this Source Code. View the full text of the exception
 * in the file doc/FLOSS-exception.txt in this software distribution, or
 * online at
 * http://secondlifegrid.net/programs/open_source/licensing/flossexception
 *
 * By copying, modifying or distributing this software, you acknowledge
 * that you have read and understood your obligations described above,
 * and agree to abide by those obligations.
 *
 * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
 * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
 * COMPLETENESS OR PERFORMANCE.
 * $/LicenseInfo$
 */
#ifndef LL_LLMATRIX4A_H
#define LL_LLMATRIX4A_H

#include "llmath.h"
#include "llmatrix4.h"
  36. class alignas(16) LLMatrix4a
  37. {
  38. public:
  39. LL_ALIGNED16_NEW_DELETE
  40. enum
  41. {
  42. ROW_FWD = 0,
  43. ROW_LEFT,
  44. ROW_UP,
  45. ROW_TRANS
  46. };
  47. LL_INLINE LLMatrix4a()
  48. {
  49. }
  50. LL_INLINE LLMatrix4a(const LLQuad& q1, const LLQuad& q2,
  51. const LLQuad& q3, const LLQuad& q4)
  52. {
  53. mMatrix[0] = q1;
  54. mMatrix[1] = q2;
  55. mMatrix[2] = q3;
  56. mMatrix[3] = q4;
  57. }
  58. LL_INLINE LLMatrix4a(const LLQuaternion2& quat)
  59. {
  60. const LLVector4a& xyzw = quat.getVector4a();
  61. LLVector4a nyxwz = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(2, 3, 0, 1));
  62. nyxwz.negate();
  63. const LLVector4a xnyynx = _mm_unpacklo_ps(xyzw, nyxwz);
  64. const LLVector4a znwwnz = _mm_unpackhi_ps(xyzw, nyxwz);
  65. LLMatrix4a mata;
  66. mata.setRow<0>(_mm_shuffle_ps(xyzw, xnyynx, _MM_SHUFFLE(0, 1, 2, 3)));
  67. mata.setRow<1>(_mm_shuffle_ps(znwwnz, xyzw, _MM_SHUFFLE(1, 0, 2, 3)));
  68. mata.setRow<2>(_mm_shuffle_ps(xnyynx, xyzw, _MM_SHUFFLE(2, 3, 3, 2)));
  69. mata.setRow<3>(_mm_shuffle_ps(xnyynx, znwwnz, _MM_SHUFFLE(2, 3, 1, 3)));
  70. LLMatrix4a matb;
  71. matb.setRow<0>(_mm_shuffle_ps(xyzw, xnyynx, _MM_SHUFFLE(3, 1, 2, 3)));
  72. matb.setRow<1>(_mm_shuffle_ps(znwwnz, xnyynx, _MM_SHUFFLE(1, 0, 2, 3)));
  73. matb.setRow<2>(_mm_shuffle_ps(xnyynx, znwwnz, _MM_SHUFFLE(3, 2, 3, 2)));
  74. matb.setRow<3>(xyzw);
  75. setMul(matb, mata);
  76. }
  77. LL_INLINE explicit LLMatrix4a(const LLMatrix4& val)
  78. {
  79. loadu(val);
  80. }
  81. LL_INLINE explicit LLMatrix4a(const F32* val)
  82. {
  83. loadu(val);
  84. }
  85. LL_INLINE F32* getF32ptr()
  86. {
  87. return mMatrix[0].getF32ptr();
  88. }
  89. LL_INLINE const F32* getF32ptr() const
  90. {
  91. return mMatrix[0].getF32ptr();
  92. }
  93. LL_INLINE LLMatrix4& asMatrix4()
  94. {
  95. return *(LLMatrix4*)this;
  96. }
  97. LL_INLINE const LLMatrix4& asMatrix4() const
  98. {
  99. return *(LLMatrix4*)this;
  100. }
  101. LL_INLINE bool operator==(const LLMatrix4a& rhs) const
  102. {
  103. return mMatrix[0] == rhs.mMatrix[0] && mMatrix[1] == rhs.mMatrix[1] &&
  104. mMatrix[2] == rhs.mMatrix[2] && mMatrix[3] == rhs.mMatrix[3];
  105. }
  106. LL_INLINE bool operator!=(const LLMatrix4a& rhs) const
  107. {
  108. return mMatrix[0] != rhs.mMatrix[0] || mMatrix[1] != rhs.mMatrix[1] ||
  109. mMatrix[2] != rhs.mMatrix[2] || mMatrix[3] != rhs.mMatrix[3];
  110. }
  111. LL_INLINE void clear()
  112. {
  113. mMatrix[0].clear();
  114. mMatrix[1].clear();
  115. mMatrix[2].clear();
  116. mMatrix[3].clear();
  117. }
  118. LL_INLINE void setIdentity()
  119. {
  120. thread_local __m128 ones = _mm_set_ps(1.f, 0.f, 0.f, 1.f);
  121. thread_local __m128 zeroes = _mm_setzero_ps();
  122. mMatrix[0] = _mm_movelh_ps(ones, zeroes);
  123. mMatrix[1] = _mm_movehl_ps(zeroes, ones);
  124. mMatrix[2] = _mm_movelh_ps(zeroes, ones);
  125. mMatrix[3] = _mm_movehl_ps(ones, zeroes);
  126. }
  127. LL_INLINE void loadu(const LLMatrix4& src)
  128. {
  129. mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]);
  130. mMatrix[1] = _mm_loadu_ps(src.mMatrix[1]);
  131. mMatrix[2] = _mm_loadu_ps(src.mMatrix[2]);
  132. mMatrix[3] = _mm_loadu_ps(src.mMatrix[3]);
  133. }
  134. LL_INLINE void loadu(const LLMatrix3& src)
  135. {
  136. mMatrix[0].load3(src.mMatrix[0]);
  137. mMatrix[1].load3(src.mMatrix[1]);
  138. mMatrix[2].load3(src.mMatrix[2]);
  139. mMatrix[3].set(0.f, 0.f, 0.f, 1.f);
  140. }
  141. LL_INLINE void loadu(const F32* src)
  142. {
  143. mMatrix[0] = _mm_loadu_ps(src);
  144. mMatrix[1] = _mm_loadu_ps(src + 4);
  145. mMatrix[2] = _mm_loadu_ps(src + 8);
  146. mMatrix[3] = _mm_loadu_ps(src + 12);
  147. }
  148. LL_INLINE void store4a(F32* dst) const
  149. {
  150. mMatrix[0].store4a(dst);
  151. mMatrix[1].store4a(dst + 4);
  152. mMatrix[2].store4a(dst + 8);
  153. mMatrix[3].store4a(dst + 12);
  154. }
  155. LL_INLINE void add(const LLMatrix4a& rhs)
  156. {
  157. mMatrix[0].add(rhs.mMatrix[0]);
  158. mMatrix[1].add(rhs.mMatrix[1]);
  159. mMatrix[2].add(rhs.mMatrix[2]);
  160. mMatrix[3].add(rhs.mMatrix[3]);
  161. }
  162. LL_INLINE void mul(const LLMatrix4a& rhs)
  163. {
  164. // Not using rotate4 to avoid extra copy of *this.
  165. LLVector4a x0, x1, x2, x3, y0, y1, y2, y3, z0, z1, z2, z3, w0, w1, w2,
  166. w3;
  167. // 16 shuffles
  168. x0.splat<0>(rhs.mMatrix[0]);
  169. x1.splat<0>(rhs.mMatrix[1]);
  170. x2.splat<0>(rhs.mMatrix[2]);
  171. x3.splat<0>(rhs.mMatrix[3]);
  172. y0.splat<1>(rhs.mMatrix[0]);
  173. y1.splat<1>(rhs.mMatrix[1]);
  174. y2.splat<1>(rhs.mMatrix[2]);
  175. y3.splat<1>(rhs.mMatrix[3]);
  176. z0.splat<2>(rhs.mMatrix[0]);
  177. z1.splat<2>(rhs.mMatrix[1]);
  178. z2.splat<2>(rhs.mMatrix[2]);
  179. z3.splat<2>(rhs.mMatrix[3]);
  180. w0.splat<3>(rhs.mMatrix[0]);
  181. w1.splat<3>(rhs.mMatrix[1]);
  182. w2.splat<3>(rhs.mMatrix[2]);
  183. w3.splat<3>(rhs.mMatrix[3]);
  184. // 16 muls
  185. x0.mul(mMatrix[0]);
  186. x1.mul(mMatrix[0]);
  187. x2.mul(mMatrix[0]);
  188. x3.mul(mMatrix[0]);
  189. y0.mul(mMatrix[1]);
  190. y1.mul(mMatrix[1]);
  191. y2.mul(mMatrix[1]);
  192. y3.mul(mMatrix[1]);
  193. z0.mul(mMatrix[2]);
  194. z1.mul(mMatrix[2]);
  195. z2.mul(mMatrix[2]);
  196. z3.mul(mMatrix[2]);
  197. w0.mul(mMatrix[3]);
  198. w1.mul(mMatrix[3]);
  199. w2.mul(mMatrix[3]);
  200. w3.mul(mMatrix[3]);
  201. // 12 adds
  202. x0.add(y0);
  203. z0.add(w0);
  204. x1.add(y1);
  205. z1.add(w1);
  206. x2.add(y2);
  207. z2.add(w2);
  208. x3.add(y3);
  209. z3.add(w3);
  210. mMatrix[0].setAdd(x0, z0);
  211. mMatrix[1].setAdd(x1, z1);
  212. mMatrix[2].setAdd(x2, z2);
  213. mMatrix[3].setAdd(x3, z3);
  214. }
  215. LL_INLINE void setRows(const LLVector4a& r0, const LLVector4a& r1,
  216. const LLVector4a& r2)
  217. {
  218. mMatrix[0] = r0;
  219. mMatrix[1] = r1;
  220. mMatrix[2] = r2;
  221. }
  222. template<int N>
  223. LL_INLINE void setRow(const LLVector4a& row)
  224. {
  225. mMatrix[N] = row;
  226. }
  227. template<int N>
  228. LL_INLINE const LLVector4a& getRow() const
  229. {
  230. return mMatrix[N];
  231. }
  232. template<int N>
  233. LL_INLINE LLVector4a& getRow()
  234. {
  235. return mMatrix[N];
  236. }
  237. template<int N>
  238. LL_INLINE void setColumn(const LLVector4a& col)
  239. {
  240. mMatrix[0].copyComponent<N>(col.getScalarAt<0>());
  241. mMatrix[1].copyComponent<N>(col.getScalarAt<1>());
  242. mMatrix[2].copyComponent<N>(col.getScalarAt<2>());
  243. mMatrix[3].copyComponent<N>(col.getScalarAt<3>());
  244. }
  245. template<int N>
  246. LL_INLINE LLVector4a getColumn()
  247. {
  248. LLVector4a v;
  249. v.clear();
  250. v.copyComponent<0>(mMatrix[0].getScalarAt<N>());
  251. v.copyComponent<1>(mMatrix[1].getScalarAt<N>());
  252. v.copyComponent<2>(mMatrix[2].getScalarAt<N>());
  253. v.copyComponent<3>(mMatrix[3].getScalarAt<N>());
  254. return v;
  255. }
  256. LL_INLINE void setMul(const LLMatrix4a& m, F32 s)
  257. {
  258. mMatrix[0].setMul(m.mMatrix[0], s);
  259. mMatrix[1].setMul(m.mMatrix[1], s);
  260. mMatrix[2].setMul(m.mMatrix[2], s);
  261. mMatrix[3].setMul(m.mMatrix[3], s);
  262. }
  263. LL_INLINE void setMul(const LLMatrix4a& m0, const LLMatrix4a& m1)
  264. {
  265. m0.rotate4(m1.mMatrix[0], mMatrix[0]);
  266. m0.rotate4(m1.mMatrix[1], mMatrix[1]);
  267. m0.rotate4(m1.mMatrix[2], mMatrix[2]);
  268. m0.rotate4(m1.mMatrix[3], mMatrix[3]);
  269. }
  270. LL_INLINE void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w)
  271. {
  272. LLVector4a d0, d1, d2, d3;
  273. d0.setSub(b.mMatrix[0], a.mMatrix[0]);
  274. d1.setSub(b.mMatrix[1], a.mMatrix[1]);
  275. d2.setSub(b.mMatrix[2], a.mMatrix[2]);
  276. d3.setSub(b.mMatrix[3], a.mMatrix[3]);
  277. // this = a + d*w
  278. d0.mul(w);
  279. d1.mul(w);
  280. d2.mul(w);
  281. d3.mul(w);
  282. mMatrix[0].setAdd(a.mMatrix[0], d0);
  283. mMatrix[1].setAdd(a.mMatrix[1], d1);
  284. mMatrix[2].setAdd(a.mMatrix[2], d2);
  285. mMatrix[3].setAdd(a.mMatrix[3], d3);
  286. }
  287. LL_INLINE void rotate(const LLVector4a& v, LLVector4a& res) const
  288. {
  289. LLVector4a x, y, z;
  290. x.splat<0>(v);
  291. y.splat<1>(v);
  292. z.splat<2>(v);
  293. x.mul(mMatrix[0]);
  294. y.mul(mMatrix[1]);
  295. z.mul(mMatrix[2]);
  296. x.add(y);
  297. res.setAdd(x, z);
  298. }
  299. LL_INLINE void rotate4(const LLVector4a& v, LLVector4a& res) const
  300. {
  301. LLVector4a x, y, z, w;
  302. x.splat<0>(v);
  303. y.splat<1>(v);
  304. z.splat<2>(v);
  305. w.splat<3>(v);
  306. x.mul(mMatrix[0]);
  307. y.mul(mMatrix[1]);
  308. z.mul(mMatrix[2]);
  309. w.mul(mMatrix[3]);
  310. x.add(y);
  311. z.add(w);
  312. res.setAdd(x, z);
  313. }
  314. LL_INLINE void affineTransform(const LLVector4a& v, LLVector4a& res) const
  315. {
  316. LLVector4a x, y, z;
  317. x.splat<0>(v);
  318. y.splat<1>(v);
  319. z.splat<2>(v);
  320. x.mul(mMatrix[0]);
  321. y.mul(mMatrix[1]);
  322. z.mul(mMatrix[2]);
  323. x.add(y);
  324. z.add(mMatrix[3]);
  325. res.setAdd(x, z);
  326. }
  327. LL_INLINE const LLVector4a& getTranslation() const { return mMatrix[3]; }
  328. LL_INLINE void perspectiveTransform(const LLVector4a& v,
  329. LLVector4a& res) const
  330. {
  331. LLVector4a x, y, z, s, t, p, q;
  332. x.splat<0>(v);
  333. y.splat<1>(v);
  334. z.splat<2>(v);
  335. s.splat<3>(mMatrix[0]);
  336. t.splat<3>(mMatrix[1]);
  337. p.splat<3>(mMatrix[2]);
  338. q.splat<3>(mMatrix[3]);
  339. s.mul(x);
  340. t.mul(y);
  341. p.mul(z);
  342. q.add(s);
  343. t.add(p);
  344. q.add(t);
  345. x.mul(mMatrix[0]);
  346. y.mul(mMatrix[1]);
  347. z.mul(mMatrix[2]);
  348. x.add(y);
  349. z.add(mMatrix[3]);
  350. res.setAdd(x, z);
  351. res.div(q);
  352. }
  353. LL_INLINE void transpose()
  354. {
  355. __m128 q1 = _mm_unpackhi_ps(mMatrix[0], mMatrix[1]);
  356. __m128 q2 = _mm_unpacklo_ps(mMatrix[0], mMatrix[1]);
  357. __m128 q3 = _mm_unpacklo_ps(mMatrix[2], mMatrix[3]);
  358. __m128 q4 = _mm_unpackhi_ps(mMatrix[2], mMatrix[3]);
  359. mMatrix[0] = _mm_movelh_ps(q2, q3);
  360. mMatrix[1] = _mm_movehl_ps(q3, q2);
  361. mMatrix[2] = _mm_movelh_ps(q1, q4);
  362. mMatrix[3] = _mm_movehl_ps(q4, q1);
  363. }
  364. // Following function is adapted from:
  365. // http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
  366. //
  367. // License/Copyright Statement:
  368. //
  369. // Copyright (c) 2001 Intel Corporation.
  370. //
  371. // Permition is granted to use, copy, distribute and prepare derivative
  372. // works of this library for any purpose and without fee, provided, that
  373. // the above copyright notice and this statement appear in all copies.
  374. // Intel makes no representations about the suitability of this library for
  375. // any purpose, and specifically disclaims all warranties.
  376. // See LEGAL-intel_matrixlib.TXT for all the legal information.
  377. LL_INLINE F32 invert()
  378. {
  379. alignas(16) const unsigned int Sign_PNNP[4] = { 0x00000000,
  380. 0x80000000,
  381. 0x80000000,
  382. 0x00000000 };
  383. // The inverse is calculated using "Divide and Conquer" technique. The
  384. // original matrix is divide into four 2x2 sub-matrices. Since each
  385. // register holds four matrix element, the smaller matrices are
  386. // represented as a registers. Hence we get a better locality of the
  387. // calculations.
  388. // The four sub-matrices:
  389. LLVector4a A = _mm_movelh_ps(mMatrix[0], mMatrix[1]);
  390. LLVector4a B = _mm_movehl_ps(mMatrix[1], mMatrix[0]);
  391. LLVector4a C = _mm_movelh_ps(mMatrix[2], mMatrix[3]);
  392. LLVector4a D = _mm_movehl_ps(mMatrix[3], mMatrix[2]);
  393. // Partial inverse of the sub-matrices:
  394. LLVector4a iA, iB, iC, iD;
  395. LLVector4a DC, AB;
  396. // Determinant of the sub-matrices:
  397. LLSimdScalar dA, dB, dC, dD;
  398. LLSimdScalar det, d, d1, d2;
  399. LLVector4a rd;
  400. // AB = A# * B
  401. AB.setMul(_mm_shuffle_ps(A, A, 0x0F), B);
  402. AB.sub(_mm_mul_ps(_mm_shuffle_ps(A, A, 0xA5),
  403. _mm_shuffle_ps(B, B, 0x4E)));
  404. // DC = D# * C
  405. DC.setMul(_mm_shuffle_ps(D,D,0x0F), C);
  406. DC.sub(_mm_mul_ps(_mm_shuffle_ps(D, D, 0xA5),
  407. _mm_shuffle_ps(C, C, 0x4E)));
  408. // dA = |A|
  409. dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F), A);
  410. dA -= _mm_movehl_ps(dA, dA);
  411. // dB = |B|
  412. dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F), B);
  413. dB -= _mm_movehl_ps(dB, dB);
  414. // dC = |C|
  415. dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F), C);
  416. dC -= _mm_movehl_ps(dC, dC);
  417. // dD = |D|
  418. dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F), D);
  419. dD -= _mm_movehl_ps(dD, dD);
  420. // d = trace(AB*DC) = trace(A#*B*D#*C)
  421. d = _mm_mul_ps(_mm_shuffle_ps(DC, DC, 0xD8), AB);
  422. // iD = C*A#*B
  423. iD.setMul(_mm_shuffle_ps(C, C, 0xA0), _mm_movelh_ps(AB, AB));
  424. iD.add(_mm_mul_ps(_mm_shuffle_ps(C, C, 0xF5), _mm_movehl_ps(AB, AB)));
  425. // iA = B*D#*C
  426. iA.setMul(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
  427. iA.add(_mm_mul_ps(_mm_shuffle_ps(B, B, 0xF5), _mm_movehl_ps(DC, DC)));
  428. // d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
  429. d = _mm_add_ps(d, _mm_movehl_ps(d, d));
  430. d += _mm_shuffle_ps(d, d, 1);
  431. d1 = dA * dD;
  432. d2 = dB * dC;
  433. // iD = D*|A| - C*A#*B
  434. iD.setSub(_mm_mul_ps(D, _mm_shuffle_ps(dA, dA, 0)), iD);
  435. // iA = A*|D| - B*D#*C;
  436. iA.setSub(_mm_mul_ps(A, _mm_shuffle_ps(dD, dD, 0)), iA);
  437. // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
  438. det = d1 + d2 - d;
  439. __m128 is_zero_mask = _mm_cmpeq_ps(det,_mm_setzero_ps());
  440. rd = _mm_div_ss(_mm_set_ss(1.f),
  441. _mm_or_ps(_mm_andnot_ps(is_zero_mask, det),
  442. _mm_and_ps(is_zero_mask, _mm_set_ss(1.f))));
  443. #ifdef ZERO_SINGULAR
  444. rd = _mm_and_ps(_mm_cmpneq_ss(det, _mm_setzero_ps()), rd);
  445. #endif
  446. // iB = D * (A#B)# = D*B#*A
  447. iB.setMul(D, _mm_shuffle_ps(AB, AB, 0x33));
  448. iB.sub(_mm_mul_ps(_mm_shuffle_ps(D, D, 0xB1),
  449. _mm_shuffle_ps(AB, AB, 0x66)));
  450. // iC = A * (D#C)# = A*C#*D
  451. iC.setMul(A, _mm_shuffle_ps(DC, DC, 0x33));
  452. iC.sub(_mm_mul_ps(_mm_shuffle_ps(A, A, 0xB1),
  453. _mm_shuffle_ps(DC, DC, 0x66)));
  454. rd = _mm_shuffle_ps(rd, rd, 0);
  455. rd = _mm_xor_ps(rd, _mm_load_ps((const F32*)Sign_PNNP));
  456. // iB = C*|B| - D*B#*A
  457. iB.setSub(_mm_mul_ps(C, _mm_shuffle_ps(dB, dB, 0)), iB);
  458. // iC = B*|C| - A*C#*D;
  459. iC.setSub(_mm_mul_ps(B, _mm_shuffle_ps(dC, dC, 0)), iC);
  460. // iX = iX / det
  461. iA.mul(rd);
  462. iB.mul(rd);
  463. iC.mul(rd);
  464. iD.mul(rd);
  465. mMatrix[0] = _mm_shuffle_ps(iA, iB, 0x77);
  466. mMatrix[1] = _mm_shuffle_ps(iA, iB, 0x22);
  467. mMatrix[2] = _mm_shuffle_ps(iC, iD, 0x77);
  468. mMatrix[3] = _mm_shuffle_ps(iC, iD, 0x22);
  469. F32 ret;
  470. _mm_store_ss(&ret, det);
  471. return ret;
  472. }
  473. LL_INLINE LLVector4a rowMul(const LLVector4a& row) const
  474. {
  475. LLVector4a result;
  476. result = _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(0, 0, 0, 0)),
  477. mMatrix[0]);
  478. result = _mm_add_ps(result,
  479. _mm_mul_ps(_mm_shuffle_ps(row, row,
  480. _MM_SHUFFLE(1, 1, 1, 1)),
  481. mMatrix[1]));
  482. result = _mm_add_ps(result,
  483. _mm_mul_ps(_mm_shuffle_ps(row, row,
  484. _MM_SHUFFLE(2, 2, 2, 2)),
  485. mMatrix[2]));
  486. result = _mm_add_ps(result,
  487. _mm_mul_ps(_mm_shuffle_ps(row, row,
  488. _MM_SHUFFLE(3, 3, 3, 3)),
  489. mMatrix[3]));
  490. return result;
  491. }
  492. LL_INLINE void matMul(const LLMatrix4a& a, const LLMatrix4a& b)
  493. {
  494. mMatrix[0] = b.rowMul(a.mMatrix[0]);
  495. mMatrix[1] = b.rowMul(a.mMatrix[1]);
  496. mMatrix[2] = b.rowMul(a.mMatrix[2]);
  497. mMatrix[3] = b.rowMul(a.mMatrix[3]);
  498. }
  499. void matMulBoundBox(const LLVector4a* in_extents,
  500. LLVector4a* out_extents) const;
  501. // ================== Affine transformation matrix only ===================
  502. // Multiply matrix with a pure translation matrix.
  503. LL_INLINE void applyTranslationAffine(F32 x, F32 y, F32 z)
  504. {
  505. const LLVector4a xyz0(x, y, z, 0.f);
  506. LLVector4a xxxx, yyyy, zzzz;
  507. xxxx.splat<0>(xyz0);
  508. yyyy.splat<1>(xyz0);
  509. zzzz.splat<2>(xyz0);
  510. LLVector4a sum1, sum2, sum3;
  511. sum1.setMul(xxxx, mMatrix[0]);
  512. sum2.setMul(yyyy, mMatrix[1]);
  513. sum3.setMul(zzzz, mMatrix[2]);
  514. mMatrix[3].add(sum1);
  515. mMatrix[3].add(sum2);
  516. mMatrix[3].add(sum3);
  517. }
  518. // Multiply matrix with a pure translation matrix.
  519. LL_INLINE void applyTranslationAffine(const LLVector3& trans)
  520. {
  521. applyTranslationAffine(trans.mV[VX], trans.mV[VY], trans.mV[VZ]);
  522. }
  523. // Multiply matrix with a pure scale matrix.
  524. LL_INLINE void applyScaleAffine(F32 x, F32 y, F32 z)
  525. {
  526. const LLVector4a xyz0(x, y, z, 0);
  527. LLVector4a xxxx, yyyy, zzzz;
  528. xxxx.splat<0>(xyz0);
  529. yyyy.splat<1>(xyz0);
  530. zzzz.splat<2>(xyz0);
  531. mMatrix[0].mul(xxxx);
  532. mMatrix[1].mul(yyyy);
  533. mMatrix[2].mul(zzzz);
  534. }
  535. // Multiply matrix with a pure scale matrix.
  536. LL_INLINE void applyScaleAffine(const LLVector3& scale)
  537. {
  538. applyScaleAffine(scale.mV[VX], scale.mV[VY], scale.mV[VZ]);
  539. }
  540. // Multiply matrix with a pure scale matrix.
  541. LL_INLINE void applyScaleAffine(F32 s)
  542. {
  543. const LLVector4a scale(s);
  544. mMatrix[0].mul(scale);
  545. mMatrix[1].mul(scale);
  546. mMatrix[2].mul(scale);
  547. }
  548. // Direct addition to row3.
  549. LL_INLINE void translateAffine(const LLVector3& trans)
  550. {
  551. LLVector4a translation;
  552. translation.load3(trans.mV);
  553. mMatrix[3].add(translation);
  554. }
  555. // Direct assignment of row3.
  556. LL_INLINE void setTranslateAffine(const LLVector3& trans)
  557. {
  558. thread_local const LLVector4Logical mask =
  559. _mm_load_ps((F32*)&S_V4LOGICAL_MASK_TABLE[12]);
  560. LLVector4a translation;
  561. translation.load3(trans.mV);
  562. mMatrix[3].setSelectWithMask(mask, mMatrix[3], translation);
  563. }
  564. LL_INLINE void mulAffine(const LLMatrix4a& rhs)
  565. {
  566. LLVector4a x0, y0, z0, x1, y1, z1, x2, y2, z2, x3, y3, z3;
  567. // 12 shuffles
  568. x0.splat<0>(rhs.mMatrix[0]);
  569. x1.splat<0>(rhs.mMatrix[1]);
  570. x2.splat<0>(rhs.mMatrix[2]);
  571. x3.splat<0>(rhs.mMatrix[3]);
  572. y0.splat<1>(rhs.mMatrix[0]);
  573. y1.splat<1>(rhs.mMatrix[1]);
  574. y2.splat<1>(rhs.mMatrix[2]);
  575. y3.splat<1>(rhs.mMatrix[3]);
  576. z0.splat<2>(rhs.mMatrix[0]);
  577. z1.splat<2>(rhs.mMatrix[1]);
  578. z2.splat<2>(rhs.mMatrix[2]);
  579. z3.splat<2>(rhs.mMatrix[3]);
  580. // 12 muls
  581. x0.mul(mMatrix[0]);
  582. x1.mul(mMatrix[0]);
  583. x2.mul(mMatrix[0]);
  584. x3.mul(mMatrix[0]);
  585. y0.mul(mMatrix[1]);
  586. y1.mul(mMatrix[1]);
  587. y2.mul(mMatrix[1]);
  588. y3.mul(mMatrix[1]);
  589. z0.mul(mMatrix[2]);
  590. z1.mul(mMatrix[2]);
  591. z2.mul(mMatrix[2]);
  592. z3.mul(mMatrix[2]);
  593. // 9 adds
  594. x0.add(y0);
  595. x1.add(y1);
  596. x2.add(y2);
  597. x3.add(y3);
  598. z3.add(mMatrix[3]);
  599. mMatrix[0].setAdd(x0, z0);
  600. mMatrix[1].setAdd(x1, z1);
  601. mMatrix[2].setAdd(x2, z2);
  602. mMatrix[3].setAdd(x3, z3);
  603. }
  604. LL_INLINE void extractRotationAffine()
  605. {
  606. thread_local const LLVector4Logical mask =
  607. _mm_load_ps((F32*)&S_V4LOGICAL_MASK_TABLE[12]);
  608. mMatrix[0].setSelectWithMask(mask, _mm_setzero_ps(), mMatrix[0]);
  609. mMatrix[1].setSelectWithMask(mask, _mm_setzero_ps(), mMatrix[1]);
  610. mMatrix[2].setSelectWithMask(mask, _mm_setzero_ps(), mMatrix[2]);
  611. mMatrix[3].setSelectWithMask(mask, LLVector4a(1.f), _mm_setzero_ps());
  612. }
  613. bool isIdentity() const;
  614. public:
  615. alignas(16) LLVector4a mMatrix[4];
  616. };
#endif	// LL_LLMATRIX4A_H