llvector4a.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513
  1. /**
  2. * @file llvector4a.h
  3. * @brief LLVector4a class header file - memory aligned and vectorized 4 component vector
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewergpl$
  6. *
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * Second Life Viewer Source Code
  10. * The source code in this file ("Source Code") is provided by Linden Lab
  11. * to you under the terms of the GNU General Public License, version 2.0
  12. * ("GPL"), unless you have obtained a separate licensing agreement
  13. * ("Other License"), formally executed by you and Linden Lab. Terms of
  14. * the GPL can be found in doc/GPL-license.txt in this distribution, or
  15. * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
  16. *
  17. * There are special exceptions to the terms and conditions of the GPL as
  18. * it is applied to this Source Code. View the full text of the exception
  19. * in the file doc/FLOSS-exception.txt in this software distribution, or
  20. * online at
  21. * http://secondlifegrid.net/programs/open_source/licensing/flossexception
  22. *
  23. * By copying, modifying or distributing this software, you acknowledge
  24. * that you have read and understood your obligations described above,
  25. * and agree to abide by those obligations.
  26. *
  27. * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
  28. * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
  29. * COMPLETENESS OR PERFORMANCE.
  30. * $/LicenseInfo$
  31. */
  32. #ifndef LL_LLVECTOR4A_H
  33. #define LL_LLVECTOR4A_H
  34. #include "hbintrinsics.h"
  35. #include "llmemory.h" // For LL_ALIGNED16_NEW_DELETE
  36. class LLRotation;
  37. ///////////////////////////////////
  38. // FIRST TIME USERS PLEASE READ
  39. //////////////////////////////////
  40. // This is just the beginning of LLVector4a. There are many more useful
  41. // functions yet to be implemented. For example, setNeg to negate a vector,
  42. // rotate() to apply a matrix rotation, various functions to manipulate only
  43. // the X, Y, and Z elements and many others (including a whole variety of
  44. // accessors). So if you don't see a function here that you need, please
  45. // contact Falcon or someone else with SSE experience (Richard, I think, has
  46. // some and davep has a little as of the time of this writing, July 08, 2010)
  47. // about getting it implemented before you resort to LLVector3/LLVector4.
  48. /////////////////////////////////
  49. class alignas(16) LLVector4a
  50. {
  51. public:
  52. LL_ALIGNED16_NEW_DELETE
  53. ///////////////////////////////////
  54. // STATIC METHODS
  55. ///////////////////////////////////
  56. // Call this method at startup to avoid 15,000+ cycle penalties from
  57. // denormalized numbers
  58. static void initClass()
  59. {
  60. #if LL_WINDOWS
  61. unsigned int current_word = 0;
  62. _controlfp_s(&current_word, _DN_FLUSH, _MCW_DN);
  63. #endif
  64. #if defined(_MM_SET_DENORMALS_ZERO_MODE) && defined(_MM_DENORMALS_ZERO_ON)
  65. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  66. #endif
  67. #if defined(_MM_SET_FLUSH_ZERO_MODE) && defined(_MM_FLUSH_ZERO_ON)
  68. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  69. #endif
  70. _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
  71. }
  72. // Returns a vector of all zeros
  73. static LL_INLINE const LLVector4a& getZero()
  74. {
  75. extern const LLVector4a LL_V4A_ZERO;
  76. return LL_V4A_ZERO;
  77. }
  78. // Returns a vector of all epsilon, where epsilon is a small float suitable
  79. // for approximate equality checks
  80. static LL_INLINE const LLVector4a& getEpsilon()
  81. {
  82. extern const LLVector4a LL_V4A_EPSILON;
  83. return LL_V4A_EPSILON;
  84. }
  85. // Copies 16 bytes from src to dst. Source and destination must be 16-byte
  86. // aligned
  87. static LL_INLINE void copy4a(F32* dst, const F32* src)
  88. {
  89. _mm_store_ps(dst, _mm_load_ps(src));
  90. }
  91. // Copies words 16-byte blocks from src to dst. Source and destination must
  92. // not overlap. Source and dest must be 16-byte aligned and size must be
  93. // multiple of 16.
  94. static void memcpyNonAliased16(F32* __restrict dst,
  95. const F32* __restrict src, size_t bytes);
  96. ////////////////////////////////////
  97. // CONSTRUCTORS
  98. ////////////////////////////////////
  99. // DO NOT INITIALIZE: The overhead is completely unnecessary
  100. LL_INLINE LLVector4a()
  101. {
  102. }
  103. LL_INLINE LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
  104. {
  105. set(x, y, z, w);
  106. }
  107. LL_INLINE LLVector4a(F32 x)
  108. {
  109. splat(x);
  110. }
  111. LL_INLINE LLVector4a(const LLSimdScalar& x)
  112. {
  113. splat(x);
  114. }
  115. LL_INLINE LLVector4a(LLQuad q)
  116. {
  117. mQ = q;
  118. }
  119. ////////////////////////////////////
  120. // LOAD/STORE
  121. ////////////////////////////////////
  122. // Loads from 16-byte aligned src array (preferred method of loading)
  123. LL_INLINE void load4a(const F32* src)
  124. {
  125. mQ = _mm_load_ps(src);
  126. }
  127. // Loads from unaligned src array (NB: significantly slower than load4a)
  128. LL_INLINE void loadua(const F32* src)
  129. {
  130. mQ = _mm_loadu_ps(src);
  131. }
  132. // Loads only three floats beginning at address 'src'. Slowest method.
  133. LL_INLINE void load3(const F32* src, const F32 w = 0.f)
  134. {
  135. // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
  136. // NB: This differs from the convention of { Z, Y, X, W }
  137. mQ = _mm_set_ps(w, src[2], src[1], src[0]);
  138. }
  139. // Stores to a 16-byte aligned memory address
  140. LL_INLINE void store4a(F32* dst) const
  141. {
  142. _mm_store_ps(dst, mQ);
  143. }
  144. ////////////////////////////////////
  145. // BASIC GET/SET
  146. ////////////////////////////////////
  147. // Returns a "this" as an F32 pointer.
  148. LL_INLINE F32* getF32ptr()
  149. {
  150. return (F32*)&mQ;
  151. }
  152. // Returns a "this" as a const F32 pointer.
  153. LL_INLINE const F32* const getF32ptr() const
  154. {
  155. return (const F32* const)&mQ;
  156. }
  157. // Read-only access to single float in this vector. Do not use in proximity
  158. // to any function call that manipulates the data at the whole vector level
  159. // or you will incur a substantial penalty. Consider using the splat
  160. // functions instead
  161. LL_INLINE F32 operator[](const S32 idx) const
  162. {
  163. return ((F32*)&mQ)[idx];
  164. }
  165. // Prefer this method for read-only access to a single element. Prefer the
  166. // templated version if the elem is known at compile time.
  167. LL_INLINE LLSimdScalar getScalarAt(S32 idx) const;
  168. // Prefer this method for read-only access to a single element. Prefer the
  169. // templated version if the elem is known at compile time.
  170. template <int N> LL_INLINE LLSimdScalar getScalarAt() const;
  171. // Sets to an x, y, z and optional w provided
  172. LL_INLINE void set(F32 x, F32 y, F32 z, F32 w = 0.f)
  173. {
  174. mQ = _mm_set_ps(w, z, y, x);
  175. }
  176. // Sets to all zeros. This is preferred to using ::getZero()
  177. LL_INLINE void clear()
  178. {
  179. mQ = LLVector4a::getZero().mQ;
  180. }
  181. // Sets all elements to 'x'
  182. LL_INLINE void splat(F32 x)
  183. {
  184. mQ = _mm_set1_ps(x);
  185. }
  186. // Sets all elements to 'x'
  187. LL_INLINE void splat(const LLSimdScalar& x);
  188. // Sets all 4 elements to element N of src, with N known at compile time
  189. template <int N> LL_INLINE void splat(const LLVector4a& src);
  190. // Sets all 4 elements to element i of v, with i NOT known at compile time
  191. LL_INLINE void splat(const LLVector4a& v, U32 i);
  192. // Sets element N to that of src's element N. Much cleaner than
  193. // {LLVector4Logical mask; mask.clear(); mask.setElement<N>();
  194. // target.setSelectWithMask(mask,src,target);}
  195. template <int N> LL_INLINE void copyComponent(const LLVector4a& src);
  196. // Selects bits from src_if_true and src_if_false according to bits in mask
  197. LL_INLINE void setSelectWithMask(const LLVector4Logical& mask,
  198. const LLVector4a& src_if_true,
  199. const LLVector4a& src_if_false);
  200. ////////////////////////////////////
  201. // ALGEBRAIC
  202. ////////////////////////////////////
  203. // Sets this to the element-wise (a + b)
  204. LL_INLINE void setAdd(const LLVector4a& a, const LLVector4a& b)
  205. {
  206. mQ = _mm_add_ps(a.mQ, b.mQ);
  207. }
  208. // Sets this to element-wise (a - b)
  209. LL_INLINE void setSub(const LLVector4a& a, const LLVector4a& b)
  210. {
  211. mQ = _mm_sub_ps(a.mQ, b.mQ);
  212. }
  213. // Sets this to element-wise multiply (a * b)
  214. LL_INLINE void setMul(const LLVector4a& a, const LLVector4a& b)
  215. {
  216. mQ = _mm_mul_ps(a.mQ, b.mQ);
  217. }
  218. // Sets this to element-wise quotient (a / b)
  219. LL_INLINE void setDiv(const LLVector4a& a, const LLVector4a& b)
  220. {
  221. mQ = _mm_div_ps(a.mQ, b.mQ);
  222. }
  223. // Sets this to the element-wise absolute value of src
  224. LL_INLINE void setAbs(const LLVector4a& src)
  225. {
  226. alignas(16) thread_local const U32 F_ABS_MASK_4A[4] = {
  227. 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
  228. };
  229. mQ = _mm_and_ps(src.mQ,
  230. *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
  231. }
  232. // Adds to each component in this vector the corresponding component in rhs
  233. LL_INLINE void add(const LLVector4a& rhs)
  234. {
  235. mQ = _mm_add_ps(mQ, rhs.mQ);
  236. }
  237. // Subtracts from each component in this vector the corresponding component
  238. // in rhs
  239. LL_INLINE void sub(const LLVector4a& rhs)
  240. {
  241. mQ = _mm_sub_ps(mQ, rhs.mQ);
  242. }
  243. // Multiplies each component in this vector by the corresponding component
  244. // in rhs
  245. LL_INLINE void mul(const LLVector4a& rhs)
  246. {
  247. mQ = _mm_mul_ps(mQ, rhs.mQ);
  248. }
  249. // Divides each component in this vector by the corresponding component in
  250. // rhs
  251. LL_INLINE void div(const LLVector4a& rhs)
  252. {
  253. // *TODO: Check accuracy, maybe add divFast
  254. mQ = _mm_div_ps(mQ, rhs.mQ);
  255. }
  256. // Multiplies this vector by x in a scalar fashion
  257. LL_INLINE void mul(F32 x)
  258. {
  259. LLVector4a t;
  260. t.splat(x);
  261. mQ = _mm_mul_ps(mQ, t.mQ);
  262. }
  263. // Sets this to (a x b) (geometric cross-product)
  264. LL_INLINE void setCross3(const LLVector4a& a, const LLVector4a& b);
  265. // Sets all elements to the dot product of the x, y, and z elements in a
  266. // and b
  267. LL_INLINE void setAllDot3(const LLVector4a& a, const LLVector4a& b);
  268. // Sets all elements to the dot product of the x, y, z, and w elements in a
  269. // and b
  270. LL_INLINE void setAllDot4(const LLVector4a& a, const LLVector4a& b);
  271. // Returns the 3D dot product of this vector and b
  272. LL_INLINE LLSimdScalar dot3(const LLVector4a& b) const;
  273. // Returns the 4D dot product of this vector and b
  274. LL_INLINE LLSimdScalar dot4(const LLVector4a& b) const;
  275. // Normalizes this vector with respect to the x, y, and z components only.
  276. // Accurate to 22 bits of precision. W component is destroyed.
  277. // Note that this does not consider zero length vectors !
  278. LL_INLINE void normalize3();
  279. // Same as normalize3() but with respect to all 4 components
  280. LL_INLINE void normalize4();
  281. // Same as normalize3(), but returns length as a SIMD scalar
  282. LL_INLINE LLSimdScalar normalize3withLength();
  283. // Normalizes this vector with respect to the x, y, and z components only.
  284. // Accurate only to 10-12 bits of precision. W component is destroyed.
  285. // Note that this does not consider zero length vectors !
  286. LL_INLINE void normalize3fast();
  287. LL_INLINE void normalize3fast_checked(LLVector4a* d = NULL);
  288. // Returns true if this vector is normalized with respect to x,y,z up to
  289. // tolerance
  290. LL_INLINE LLBool32 isNormalized3(F32 tolerance = 1e-3) const;
  291. // Returns true if this vector is normalized with respect to all components
  292. // up to tolerance
  293. LL_INLINE LLBool32 isNormalized4(F32 tolerance = 1e-3) const;
  294. // Sets all elements to the length of vector 'v'
  295. LL_INLINE void setAllLength3(const LLVector4a& v)
  296. {
  297. LLVector4a len_squared;
  298. len_squared.setAllDot3(v, v);
  299. mQ = _mm_sqrt_ps(len_squared.mQ);
  300. }
  301. // Gets this vector's length
  302. LL_INLINE LLSimdScalar getLength3() const
  303. {
  304. return _mm_sqrt_ss(dot3((const LLVector4a)mQ));
  305. }
  306. // Sets the components of this vector to the minimum of the corresponding
  307. // components of lhs and rhs
  308. LL_INLINE void setMin(const LLVector4a& lhs, const LLVector4a& rhs)
  309. {
  310. mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
  311. }
  312. // Sets the components of this vector to the maximum of the corresponding
  313. // components of lhs and rhs
  314. LL_INLINE void setMax(const LLVector4a& lhs, const LLVector4a& rhs)
  315. {
  316. mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
  317. }
  318. // Clamps this vector to be within the component-wise range low to high
  319. // (inclusive)
  320. LL_INLINE void clamp(const LLVector4a& low, const LLVector4a& high);
  321. // Sets this to (c * lhs) + rhs * (1 - c)
  322. LL_INLINE void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
  323. {
  324. LLVector4a t;
  325. t.setSub(rhs, lhs);
  326. t.mul(c);
  327. setAdd(lhs, t);
  328. }
  329. // Returns true (nonzero) if x, y, z (and w for Finite4) are all finite
  330. // floats
  331. LL_INLINE LLBool32 isFinite3() const;
  332. LL_INLINE LLBool32 isFinite4() const;
  333. // Sets this vector to 'vec' rotated by the LLRotation or LLQuaternion2
  334. // provided
  335. void setRotated(const LLRotation& rot, const LLVector4a& vec);
  336. void setRotated(const class LLQuaternion2& quat, const LLVector4a& vec);
  337. // Sets this vector to 'vec' rotated by the INVERSE of the LLRotation or
  338. // LLQuaternion2 provided
  339. LL_INLINE void setRotatedInv(const LLRotation& rot, const LLVector4a& vec);
  340. LL_INLINE void setRotatedInv(const class LLQuaternion2& quat,
  341. const LLVector4a& vec);
  342. // Quantizes this vector to 8 or 16 bit precision
  343. void quantize8(const LLVector4a& low, const LLVector4a& high);
  344. void quantize16(const LLVector4a& low, const LLVector4a& high);
  345. LL_INLINE void negate()
  346. {
  347. alignas(16) thread_local const U32 sign_mask[] =
  348. { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  349. mQ = _mm_xor_ps(*reinterpret_cast<const LLQuad*>(sign_mask), mQ);
  350. }
  351. ////////////////////////////////////
  352. // LOGICAL
  353. ////////////////////////////////////
  354. // The methods in this section will compare the elements in this vector to
  355. // those in rhs and return an LLVector4Logical with all bits set in
  356. // elements where the comparison was true and all bits unset in elements
  357. // where the comparison was false. See llvector4logica.h
  358. ////////////////////////////////////
  359. // WARNING: Other than equals3 and equals4, these functions do NOT account
  360. // for floating point tolerance. You should include the appropriate
  361. // tolerance in the inputs.
  362. ////////////////////////////////////
  363. LL_INLINE LLVector4Logical greaterThan(const LLVector4a& rhs) const
  364. {
  365. return _mm_cmpgt_ps(mQ, rhs.mQ);
  366. }
  367. LL_INLINE LLVector4Logical lessThan(const LLVector4a& rhs) const
  368. {
  369. return _mm_cmplt_ps(mQ, rhs.mQ);
  370. }
  371. LL_INLINE LLVector4Logical greaterEqual(const LLVector4a& rhs) const
  372. {
  373. return _mm_cmpge_ps(mQ, rhs.mQ);
  374. }
  375. LL_INLINE LLVector4Logical lessEqual(const LLVector4a& rhs) const
  376. {
  377. return _mm_cmple_ps(mQ, rhs.mQ);
  378. }
  379. LL_INLINE LLVector4Logical equal(const LLVector4a& rhs) const
  380. {
  381. return _mm_cmpeq_ps(mQ, rhs.mQ);
  382. }
  383. // Returns true if this and rhs are componentwise equal up to the specified
  384. // absolute tolerance
  385. LL_INLINE bool equals4(const LLVector4a& rhs,
  386. F32 tolerance = F_APPROXIMATELY_ZERO) const;
  387. LL_INLINE bool equals3(const LLVector4a& rhs,
  388. F32 tolerance = F_APPROXIMATELY_ZERO) const;
  389. LL_INLINE bool operator==(const LLVector4a& rhs) const
  390. {
  391. return equals4(rhs);
  392. }
  393. LL_INLINE bool operator!=(const LLVector4a& rhs) const
  394. {
  395. return !equals4(rhs);
  396. }
  397. ////////////////////////////////////
  398. // OPERATORS
  399. ////////////////////////////////////
  400. // Do NOT add aditional operators without consulting someone with SSE
  401. // experience
  402. LL_INLINE const LLVector4a& operator=(const LLVector4a& rhs)
  403. {
  404. mQ = rhs.mQ;
  405. return *this;
  406. }
  407. LL_INLINE const LLVector4a& operator=(const LLQuad& rhs)
  408. {
  409. mQ = rhs;
  410. return *this;
  411. }
  412. LL_INLINE operator LLQuad() const
  413. {
  414. return mQ;
  415. }
  416. private:
  417. LLQuad mQ;
  418. };
  419. LL_INLINE void update_min_max(LLVector4a& min, LLVector4a& max,
  420. const LLVector4a& p)
  421. {
  422. min.setMin(min, p);
  423. max.setMax(max, p);
  424. }
  425. #endif