llvector4a.cpp 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. /**
  2. * @file llvector4a.cpp
  3. * @brief SIMD vector implementation
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewergpl$
  6. *
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * Second Life Viewer Source Code
  10. * The source code in this file ("Source Code") is provided by Linden Lab
  11. * to you under the terms of the GNU General Public License, version 2.0
  12. * ("GPL"), unless you have obtained a separate licensing agreement
  13. * ("Other License"), formally executed by you and Linden Lab. Terms of
  14. * the GPL can be found in doc/GPL-license.txt in this distribution, or
  15. * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
  16. *
  17. * There are special exceptions to the terms and conditions of the GPL as
  18. * it is applied to this Source Code. View the full text of the exception
  19. * in the file doc/FLOSS-exception.txt in this software distribution, or
  20. * online at
  21. * http://secondlifegrid.net/programs/open_source/licensing/flossexception
  22. *
  23. * By copying, modifying or distributing this software, you acknowledge
  24. * that you have read and understood your obligations described above,
  25. * and agree to abide by those obligations.
  26. *
  27. * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
  28. * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
  29. * COMPLETENESS OR PERFORMANCE.
  30. * $/LicenseInfo$
  31. */
  32. #include "linden_common.h"
  33. #include "llmath.h"
  34. #include "llmemory.h"
  35. #include "llquantize.h"
  36. extern const LLQuad F_ZERO_4A = { 0.f, 0.f, 0.f, 0.f };
  37. extern const LLQuad F_APPROXIMATELY_ZERO_4A =
  38. {
  39. F_APPROXIMATELY_ZERO,
  40. F_APPROXIMATELY_ZERO,
  41. F_APPROXIMATELY_ZERO,
  42. F_APPROXIMATELY_ZERO
  43. };
  44. extern const LLVector4a LL_V4A_ZERO =
  45. reinterpret_cast<const LLVector4a&> (F_ZERO_4A);
  46. extern const LLVector4a LL_V4A_EPSILON =
  47. reinterpret_cast<const LLVector4a&> (F_APPROXIMATELY_ZERO_4A);
  48. //static
  49. void LLVector4a::memcpyNonAliased16(F32* __restrict dst,
  50. const F32* __restrict src,
  51. size_t bytes)
  52. {
  53. ll_memcpy_nonaliased_aligned_16((char*)dst, (char*)src, bytes);
  54. }
  55. void LLVector4a::setRotated(const LLRotation& rot, const LLVector4a& vec)
  56. {
  57. const LLVector4a col0 = rot.getColumn(0);
  58. const LLVector4a col1 = rot.getColumn(1);
  59. const LLVector4a col2 = rot.getColumn(2);
  60. LLVector4a result = _mm_load_ss(vec.getF32ptr());
  61. result.splat<0>(result);
  62. result.mul(col0);
  63. {
  64. LLVector4a yyyy = _mm_load_ss(vec.getF32ptr() + 1);
  65. yyyy.splat<0>(yyyy);
  66. yyyy.mul(col1);
  67. result.add(yyyy);
  68. }
  69. {
  70. LLVector4a zzzz = _mm_load_ss(vec.getF32ptr() + 2);
  71. zzzz.splat<0>(zzzz);
  72. zzzz.mul(col2);
  73. result.add(zzzz);
  74. }
  75. *this = result;
  76. }
  77. void LLVector4a::setRotated(const LLQuaternion2& quat, const LLVector4a& vec)
  78. {
  79. const LLVector4a& quatVec = quat.getVector4a();
  80. LLVector4a temp; temp.setCross3(quatVec, vec);
  81. temp.add(temp);
  82. const LLVector4a realPart(quatVec.getScalarAt<3>());
  83. LLVector4a tempTimesReal; tempTimesReal.setMul(temp, realPart);
  84. mQ = vec;
  85. add(tempTimesReal);
  86. LLVector4a imagCrossTemp; imagCrossTemp.setCross3(quatVec, temp);
  87. add(imagCrossTemp);
  88. }
  89. void LLVector4a::quantize8(const LLVector4a& low, const LLVector4a& high)
  90. {
  91. LLVector4a val(mQ);
  92. LLVector4a delta; delta.setSub(high, low);
  93. {
  94. val.clamp(low, high);
  95. val.sub(low);
  96. // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy
  97. const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
  98. // {
  99. // thread_local alignas(16) const F32 F_TWO_4A[4] = { 2.f, 2.f, 2.f, 2.f };
  100. // LLVector4a two; two.load4a(F_TWO_4A);
  101. //
  102. // // Here we use _mm_rcp_ps plus one round of newton-raphson
  103. // // We wish to find 'x' such that x = 1/delta
  104. // // As a first approximation, we take x0 = _mm_rcp_ps(delta)
  105. // // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * (2 - a * x0)
  106. // // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
  107. // const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
  108. // oneOverDelta.setMul(delta, recipApprox);
  109. // oneOverDelta.setSub(two, oneOverDelta);
  110. // oneOverDelta.mul(recipApprox);
  111. // }
  112. val.mul(oneOverDelta);
  113. val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));
  114. }
  115. val = _mm_cvtepi32_ps(_mm_cvtps_epi32(val.mQ));
  116. {
  117. val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
  118. val.mul(delta);
  119. val.add(low);
  120. }
  121. {
  122. LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
  123. LLVector4a absVal; absVal.setAbs(val);
  124. setSelectWithMask(absVal.lessThan(maxError), F_ZERO_4A, val);
  125. }
  126. }
  127. void LLVector4a::quantize16(const LLVector4a& low, const LLVector4a& high)
  128. {
  129. LLVector4a val(mQ);
  130. LLVector4a delta; delta.setSub(high, low);
  131. {
  132. val.clamp(low, high);
  133. val.sub(low);
  134. // 16-bit quantization means we need a round of Newton-Raphson
  135. LLVector4a oneOverDelta;
  136. {
  137. alignas(16) thread_local const F32 F_TWO_4A[4] = { 2.f, 2.f, 2.f, 2.f };
  138. ll_assert_aligned(F_TWO_4A, 16);
  139. LLVector4a two; two.load4a(F_TWO_4A);
  140. // Here we use _mm_rcp_ps plus one round of newton-raphson
  141. // We wish to find 'x' such that x = 1/delta
  142. // As a first approximation, we take x0 = _mm_rcp_ps(delta)
  143. // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * (2 - a * x0)
  144. // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
  145. const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
  146. oneOverDelta.setMul(delta, recipApprox);
  147. oneOverDelta.setSub(two, oneOverDelta);
  148. oneOverDelta.mul(recipApprox);
  149. }
  150. val.mul(oneOverDelta);
  151. val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));
  152. }
  153. val = _mm_cvtepi32_ps(_mm_cvtps_epi32(val.mQ));
  154. {
  155. val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
  156. val.mul(delta);
  157. val.add(low);
  158. }
  159. {
  160. LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
  161. LLVector4a absVal; absVal.setAbs(val);
  162. setSelectWithMask(absVal.lessThan(maxError), F_ZERO_4A, val);
  163. }
  164. }