llvector4a.inl 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. /**
  2. * @file llvector4a.inl
  3. * @brief LLVector4a inline function implementations
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewergpl$
  6. *
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * Second Life Viewer Source Code
  10. * The source code in this file ("Source Code") is provided by Linden Lab
  11. * to you under the terms of the GNU General Public License, version 2.0
  12. * ("GPL"), unless you have obtained a separate licensing agreement
  13. * ("Other License"), formally executed by you and Linden Lab. Terms of
  14. * the GPL can be found in doc/GPL-license.txt in this distribution, or
  15. * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
  16. *
  17. * There are special exceptions to the terms and conditions of the GPL as
  18. * it is applied to this Source Code. View the full text of the exception
  19. * in the file doc/FLOSS-exception.txt in this software distribution, or
  20. * online at
  21. * http://secondlifegrid.net/programs/open_source/licensing/flossexception
  22. *
  23. * By copying, modifying or distributing this software, you acknowledge
  24. * that you have read and understood your obligations described above,
  25. * and agree to abide by those obligations.
  26. *
  27. * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
  28. * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
  29. * COMPLETENESS OR PERFORMANCE.
  30. * $/LicenseInfo$
  31. */
  32. #ifndef LL_INL_INCLUDE
  33. # error "You cannot #include this file yourself, #include llmath.h instead !"
  34. #endif
  35. ////////////////////////////////////
  36. // GET/SET
  37. ////////////////////////////////////
  38. // Prefer this method for read-only access to a single element. Prefer the
  39. // templated version below if the element is known at compile time.
  40. LL_INLINE LLSimdScalar LLVector4a::getScalarAt(S32 idx) const
  41. {
  42. // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically
  43. // (should be effectively a nop)
  44. switch (idx)
  45. {
  46. case 0:
  47. return mQ;
  48. case 1:
  49. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
  50. case 2:
  51. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
  52. default:
  53. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
  54. }
  55. }
  56. template <int N> LL_INLINE LLSimdScalar LLVector4a::getScalarAt() const
  57. {
  58. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
  59. }
  60. template<> LL_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
  61. {
  62. return mQ;
  63. }
  64. LL_INLINE void LLVector4a::splat(const LLSimdScalar& x)
  65. {
  66. mQ = _mm_shuffle_ps(x.getQuad(), x.getQuad(), _MM_SHUFFLE(0, 0, 0, 0));
  67. }
  68. // Set all 4 elements to element N of src, with N known at compile time
  69. template <int N> LL_INLINE void LLVector4a::splat(const LLVector4a& src)
  70. {
  71. mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N));
  72. }
  73. // Set all 4 elements to element i of v, with i NOT known at compile time
  74. LL_INLINE void LLVector4a::splat(const LLVector4a& v, U32 i)
  75. {
  76. switch (i)
  77. {
  78. case 0:
  79. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
  80. break;
  81. case 1:
  82. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
  83. break;
  84. case 2:
  85. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
  86. break;
  87. case 3:
  88. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
  89. }
  90. }
  91. // Sets element N to that of src's element N
  92. template <int N> LL_INLINE void LLVector4a::copyComponent(const LLVector4a& src)
  93. {
  94. thread_local const LLVector4Logical mask =
  95. _mm_load_ps((F32*)&S_V4LOGICAL_MASK_TABLE[N * 4]);
  96. setSelectWithMask(mask, src, mQ);
  97. }
  98. // Select bits from src_if_true and src_if_false according to bits in mask
  99. LL_INLINE void LLVector4a::setSelectWithMask(const LLVector4Logical& mask,
  100. const LLVector4a& src_if_true,
  101. const LLVector4a& src_if_false)
  102. {
  103. // (((src_if_true ^ src_if_false) & mask) ^ src_if_false)
  104. // E.g., src_if_false = 1010b, src_if_true = 0101b, mask = 1100b
  105. // (src_if_true ^ src_if_false) = 1111b --> & mask = 1100b -->
  106. // ^ src_if_false = 0110b,
  107. // as expected (01 from src_if_true, 10 from src_if_false)
  108. // Courtesy of Mark++:
  109. //http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
  110. mQ = _mm_xor_ps(src_if_false,
  111. _mm_and_ps(mask, _mm_xor_ps(src_if_true, src_if_false)));
  112. }
  113. ////////////////////////////////////
  114. // ALGEBRAIC
  115. ////////////////////////////////////
  116. // Set this to (a x b) (geometric cross-product)
  117. LL_INLINE void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
  118. {
  119. #if 0 // Old code by LL
  120. // Vectors are stored in memory in w, z, y, x order from high to low
  121. // Set vector1 = { a[W], a[X], a[Z], a[Y] }
  122. const LLQuad vector1 = _mm_shuffle_ps(a.mQ, a.mQ, _MM_SHUFFLE(3, 0, 2, 1));
  123. // Set vector2 = { b[W], b[Y], b[X], b[Z] }
  124. const LLQuad vector2 = _mm_shuffle_ps(b.mQ, b.mQ, _MM_SHUFFLE(3, 1, 0, 2));
  125. // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
  126. mQ = _mm_mul_ps(vector1, vector2);
  127. // vector3 = { a[W], a[Y], a[X], a[Z] }
  128. const LLQuad vector3 = _mm_shuffle_ps(a.mQ, a.mQ, _MM_SHUFFLE(3, 1, 0, 2));
  129. // vector4 = { b[W], b[X], b[Z], b[Y] }
  130. const LLQuad vector4 = _mm_shuffle_ps(b.mQ, b.mQ, _MM_SHUFFLE(3, 0, 2, 1));
  131. // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z],
  132. // a[Y]*b[Z] - a[Z]*b[Y] }
  133. mQ = _mm_sub_ps(mQ, _mm_mul_ps(vector3, vector4));
  134. #else // Alchemy's version
  135. LLQuad tmp0 = _mm_shuffle_ps(b.mQ, b.mQ, _MM_SHUFFLE(3, 0, 2, 1));
  136. LLQuad tmp1 = _mm_shuffle_ps(a.mQ, a.mQ, _MM_SHUFFLE(3, 0, 2, 1));
  137. tmp0 = _mm_mul_ps(tmp0, a.mQ);
  138. tmp1 = _mm_mul_ps(tmp1, b.mQ);
  139. LLQuad tmp2 = _mm_sub_ps(tmp0, tmp1);
  140. mQ = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
  141. #endif
  142. }
  143. // Set all elements to the dot product of the x, y, and z elements in a and b
  144. LL_INLINE void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
  145. {
  146. #if LL_SSE41
  147. mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f);
  148. #else
  149. // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
  150. const LLQuad ab = _mm_mul_ps(a.mQ, b.mQ);
  151. // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
  152. const __m128i wzxy =
  153. _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1));
  154. // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X],
  155. // a[X] * b[X] + a[Y] * b[Y] }
  156. const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
  157. // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y],
  158. // a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  159. const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
  160. // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
  161. const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab),
  162. _MM_SHUFFLE(2, 2, 2, 2));
  163. // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
  164. mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
  165. #endif
  166. }
  167. // Set all elements to the dot product of the x, y, z, and w elements in a and b
  168. LL_INLINE void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
  169. {
  170. #if LL_SSE41
  171. mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff);
  172. #else
  173. // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
  174. const LLQuad ab = _mm_mul_ps(a.mQ, b.mQ);
  175. // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
  176. const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab),
  177. _MM_SHUFFLE(2, 3, 0, 1));
  178. // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W],
  179. // a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  180. const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
  181. // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y],
  182. // a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  183. const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
  184. const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
  185. // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
  186. mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
  187. #endif
  188. }
  189. // Return the 3D dot product of this vector and b
  190. LL_INLINE LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
  191. {
  192. #if LL_SSE41
  193. return _mm_dp_ps(mQ, b.mQ, 0x7f);
  194. #else
  195. const LLQuad ab = _mm_mul_ps(mQ, b.mQ);
  196. const LLQuad splatY =
  197. _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(ab),
  198. _MM_SHUFFLE(1, 1, 1, 1)));
  199. const LLQuad splatZ =
  200. _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(ab),
  201. _MM_SHUFFLE(2, 2, 2, 2)));
  202. const LLQuad xPlusY = _mm_add_ps(ab, splatY);
  203. return _mm_add_ps(xPlusY, splatZ);
  204. #endif
  205. }
  206. // Return the 4D dot product of this vector and b
  207. LL_INLINE LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
  208. {
  209. #if LL_SSE41
  210. return _mm_dp_ps(mQ, b.mQ, 0xff);
  211. #else
  212. // ab = { w, z, y, x }
  213. const LLQuad ab = _mm_mul_ps(mQ, b.mQ);
  214. // upperProdsInLowerElems = { y, x, y, x }
  215. const LLQuad upperProdsInLowerElems = _mm_movehl_ps(ab, ab);
  216. // sumOfPairs = { w+y, z+x, 2y, 2x }
  217. const LLQuad sumOfPairs = _mm_add_ps(upperProdsInLowerElems, ab);
  218. // shuffled = { z+x, z+x, z+x, z+x }
  219. const LLQuad shuffled =
  220. _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sumOfPairs),
  221. _MM_SHUFFLE(1, 1, 1, 1)));
  222. return _mm_add_ss(sumOfPairs, shuffled);
  223. #endif
  224. }
  225. // Normalize this vector with respect to the x, y, and z components only.
  226. // Accurate to 22 bytes of precision. W component is destroyed. Note that this
  227. // does not consider zero length vectors !
  228. LL_INLINE void LLVector4a::normalize3()
  229. {
  230. // len_squared = a dot a
  231. LLVector4a len_squared; len_squared.setAllDot3(*this, *this);
  232. // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2,
  233. // ~1/len(a)^2, ~1/len(a)^2 }
  234. const LLQuad rsqrt = _mm_rsqrt_ps(len_squared.mQ);
  235. thread_local const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
  236. thread_local const LLQuad three = {3.f, 3.f, 3.f, 3.f };
  237. // Now we do one round of Newton-Raphson approximation to get full accuracy
  238. // According to the Newton-Raphson method, given a first 'w' for the root
  239. // of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) the next better approximation
  240. // w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
  241. // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) =
  242. // 1.5 * w - 0.5 * a * w^3 = 0.5 * w * (3 - a*w^2)
  243. // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the
  244. // input vector 'a', not the 'a' from the above formula which is actually
  245. // len_squared). So out = a * [0.5 * rsqrt * (3 - len_squared * rsqrt * rsqrt)]
  246. const LLQuad AtimesRsqrt = _mm_mul_ps(len_squared.mQ, rsqrt);
  247. const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps(AtimesRsqrt, rsqrt);
  248. const LLQuad threeMinusAtimesRsqrtTimesRsqrt =
  249. _mm_sub_ps(three, AtimesRsqrtTimesRsqrt);
  250. const LLQuad nrApprox =
  251. _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
  252. mQ = _mm_mul_ps(mQ, nrApprox);
  253. }
  254. // Normalize this vector with respect to all components. Accurate to 22 bytes
  255. // of precision. Note that this does not consider zero length vectors !
  256. LL_INLINE void LLVector4a::normalize4()
  257. {
  258. // len_squared = a dot a
  259. LLVector4a len_squared; len_squared.setAllDot4(*this, *this);
  260. // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2,
  261. // ~1/len(a)^2, ~1/len(a)^2 }
  262. const LLQuad rsqrt = _mm_rsqrt_ps(len_squared.mQ);
  263. thread_local const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
  264. thread_local const LLQuad three = { 3.f, 3.f, 3.f, 3.f };
  265. // Now we do one round of Newton-Raphson approximation to get full accuracy
  266. // According to the Newton-Raphson method, given a first 'w' for the root
  267. // of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) the next better approximation
  268. // w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
  269. // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) =
  270. // 1.5 * w - 0.5 * a * w^3 = 0.5 * w * (3 - a*w^2)
  271. // Our first approx is w = rsqrt. We need out = a * w[i + 1] (this is the
  272. // input vector 'a', not the 'a' from the above formula which is actually
  273. // len_squared). So out = a * [0.5 * rsqrt * (3 - len_squared * rsqrt * rsqrt)]
  274. const LLQuad AtimesRsqrt = _mm_mul_ps(len_squared.mQ, rsqrt);
  275. const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps(AtimesRsqrt, rsqrt);
  276. const LLQuad threeMinusAtimesRsqrtTimesRsqrt =
  277. _mm_sub_ps(three, AtimesRsqrtTimesRsqrt);
  278. const LLQuad nrApprox =
  279. _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
  280. mQ = _mm_mul_ps(mQ, nrApprox);
  281. }
  282. // Normalize this vector with respect to the x, y, and z components only.
  283. // Accurate to 22 bytes of precision. W component is destroyed. Note that this
  284. // does not consider zero length vectors !
  285. LL_INLINE LLSimdScalar LLVector4a::normalize3withLength()
  286. {
  287. // len_squared = a dot a
  288. LLVector4a len_squared; len_squared.setAllDot3(*this, *this);
  289. // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2,
  290. // ~1/len(a)^2, ~1/len(a)^2 }
  291. const LLQuad rsqrt = _mm_rsqrt_ps(len_squared.mQ);
  292. thread_local const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
  293. thread_local const LLQuad three = { 3.f, 3.f, 3.f, 3.f };
  294. // Now we do one round of Newton-Raphson approximation to get full accuracy
  295. // According to the Newton-Raphson method, given a first 'w' for the root
  296. // of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) the next better approximation
  297. // w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
  298. // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) =
  299. // 1.5 * w - 0.5 * a * w^3 = 0.5 * w * (3 - a*w^2)
  300. // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the
  301. // input vector 'a', not the 'a' from the above formula which is actually
  302. // len_squared). So out = a * [0.5 * rsqrt * (3 - len_squared * rsqrt * rsqrt)]
  303. const LLQuad AtimesRsqrt = _mm_mul_ps(len_squared.mQ, rsqrt);
  304. const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps(AtimesRsqrt, rsqrt);
  305. const LLQuad threeMinusAtimesRsqrtTimesRsqrt =
  306. _mm_sub_ps(three, AtimesRsqrtTimesRsqrt);
  307. const LLQuad nrApprox =
  308. _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
  309. mQ = _mm_mul_ps(mQ, nrApprox);
  310. return _mm_sqrt_ss(len_squared);
  311. }
  312. // Normalize this vector with respect to the x, y, and z components only.
  313. // Accurate only to 10-12 bits of precision. W component is destroyed.
  314. // Note that this does not consider zero length vectors !
  315. LL_INLINE void LLVector4a::normalize3fast()
  316. {
  317. LLVector4a len_squared;
  318. len_squared.setAllDot3(*this, *this);
  319. const LLQuad approxRsqrt = _mm_rsqrt_ps(len_squared.mQ);
  320. mQ = _mm_mul_ps(mQ, approxRsqrt);
  321. }
  322. LL_INLINE void LLVector4a::normalize3fast_checked(LLVector4a* d)
  323. {
  324. if (!isFinite3())
  325. {
  326. *this = d ? *d : LLVector4a(0, 1, 0, 1);
  327. return;
  328. }
  329. LLVector4a len_squared; len_squared.setAllDot3(*this, *this);
  330. if (len_squared.getF32ptr()[0] <= FLT_EPSILON)
  331. {
  332. *this = d ? *d : LLVector4a(0, 1, 0, 1);
  333. return;
  334. }
  335. const LLQuad approxRsqrt = _mm_rsqrt_ps(len_squared.mQ);
  336. mQ = _mm_mul_ps(mQ, approxRsqrt);
  337. }
  338. // Return true if this vector is normalized with respect to x,y,z up to
  339. // tolerance
  340. LL_INLINE LLBool32 LLVector4a::isNormalized3(F32 tolerance) const
  341. {
  342. alignas(16) thread_local const U32 ones[4] = {
  343. 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
  344. };
  345. LLSimdScalar tol = _mm_load_ss(&tolerance);
  346. tol = _mm_mul_ss(tol, tol);
  347. LLVector4a lenSquared; lenSquared.setAllDot3(*this, *this);
  348. lenSquared.sub(*reinterpret_cast<const LLVector4a*>(ones));
  349. lenSquared.setAbs(lenSquared);
  350. return _mm_comile_ss(lenSquared, tol);
  351. }
  352. // Return true if this vector is normalized with respect to all components up
  353. // to tolerance
  354. LL_INLINE LLBool32 LLVector4a::isNormalized4(F32 tolerance) const
  355. {
  356. alignas(16) thread_local const U32 ones[4] = {
  357. 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
  358. };
  359. LLSimdScalar tol = _mm_load_ss(&tolerance);
  360. tol = _mm_mul_ss(tol, tol);
  361. LLVector4a lenSquared; lenSquared.setAllDot4(*this, *this);
  362. lenSquared.sub(*reinterpret_cast<const LLVector4a*>(ones));
  363. lenSquared.setAbs(lenSquared);
  364. return _mm_comile_ss(lenSquared, tol);
  365. }
  366. LL_INLINE LLBool32 LLVector4a::isFinite3() const
  367. {
  368. alignas(16) thread_local const U32 nanOrInfMask[4] = {
  369. 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
  370. };
  371. ll_assert_aligned(nanOrInfMask, 16);
  372. const __m128i nanOrInfMaskV =
  373. *reinterpret_cast<const __m128i*>(nanOrInfMask);
  374. const __m128i maskResult = _mm_and_si128(_mm_castps_si128(mQ),
  375. nanOrInfMaskV);
  376. const LLVector4Logical equalityCheck =
  377. _mm_castsi128_ps(_mm_cmpeq_epi32(maskResult, nanOrInfMaskV));
  378. return !equalityCheck.areAnySet(LLVector4Logical::MASK_XYZ);
  379. }
  380. LL_INLINE LLBool32 LLVector4a::isFinite4() const
  381. {
  382. alignas(16) thread_local const U32 nanOrInfMask[4] = {
  383. 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
  384. };
  385. const __m128i nanOrInfMaskV =
  386. *reinterpret_cast<const __m128i*>(nanOrInfMask);
  387. const __m128i maskResult = _mm_and_si128(_mm_castps_si128(mQ),
  388. nanOrInfMaskV);
  389. const LLVector4Logical equalityCheck =
  390. _mm_castsi128_ps(_mm_cmpeq_epi32(maskResult, nanOrInfMaskV));
  391. return !equalityCheck.areAnySet(LLVector4Logical::MASK_XYZW);
  392. }
  393. LL_INLINE void LLVector4a::setRotatedInv(const LLRotation& rot,
  394. const LLVector4a& vec)
  395. {
  396. LLRotation inv; inv.setTranspose(rot);
  397. setRotated(inv, vec);
  398. }
  399. LL_INLINE void LLVector4a::setRotatedInv(const LLQuaternion2& quat,
  400. const LLVector4a& vec)
  401. {
  402. LLQuaternion2 invRot; invRot.setConjugate(quat);
  403. setRotated(invRot, vec);
  404. }
  405. LL_INLINE void LLVector4a::clamp(const LLVector4a& low, const LLVector4a& high)
  406. {
  407. const LLVector4Logical highMask = greaterThan(high);
  408. const LLVector4Logical lowMask = lessThan(low);
  409. setSelectWithMask(highMask, high, *this);
  410. setSelectWithMask(lowMask, low, *this);
  411. }
  412. ////////////////////////////////////
  413. // LOGICAL
  414. //
  415. // The functions in this section will compare the elements in this vector to
  416. // those in rhs and return an LLVector4Logical with all bits set in elements
  417. // where the comparison was true and all bits unset in elements where the
  418. // comparison was false.
  419. //
  420. // WARNING: Other than equals3 and equals4, these functions do NOT account for
  421. // floating point tolerance. You should include the appropriate tolerance in
  422. // the inputs.
  423. ////////////////////////////////////
  424. // Returns true if this and rhs are componentwise equal up to the specified
  425. // absolute tolerance
  426. LL_INLINE bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance) const
  427. {
  428. LLVector4a diff; diff.setSub(*this, rhs);
  429. diff.setAbs(diff);
  430. const LLQuad tol = _mm_set1_ps(tolerance);
  431. const LLQuad cmp = _mm_cmplt_ps(diff, tol);
  432. return (_mm_movemask_ps(cmp) & LLVector4Logical::MASK_XYZW) ==
  433. LLVector4Logical::MASK_XYZW;
  434. }
  435. LL_INLINE bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance) const
  436. {
  437. LLVector4a diff; diff.setSub(*this, rhs);
  438. diff.setAbs(diff);
  439. const LLQuad tol = _mm_set1_ps(tolerance);
  440. const LLQuad t = _mm_cmplt_ps(diff, tol);
  441. return (_mm_movemask_ps(t) & LLVector4Logical::MASK_XYZ) ==
  442. LLVector4Logical::MASK_XYZ;
  443. }