123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513 |
- /**
- * @file llvector4a.h
- * @brief LLVector4a class header file - memory aligned and vectorized 4 component vector
- *
- * $LicenseInfo:firstyear=2010&license=viewergpl$
- *
- * Copyright (C) 2010, Linden Research, Inc.
- *
- * Second Life Viewer Source Code
- * The source code in this file ("Source Code") is provided by Linden Lab
- * to you under the terms of the GNU General Public License, version 2.0
- * ("GPL"), unless you have obtained a separate licensing agreement
- * ("Other License"), formally executed by you and Linden Lab. Terms of
- * the GPL can be found in doc/GPL-license.txt in this distribution, or
- * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
- *
- * There are special exceptions to the terms and conditions of the GPL as
- * it is applied to this Source Code. View the full text of the exception
- * in the file doc/FLOSS-exception.txt in this software distribution, or
- * online at
- * http://secondlifegrid.net/programs/open_source/licensing/flossexception
- *
- * By copying, modifying or distributing this software, you acknowledge
- * that you have read and understood your obligations described above,
- * and agree to abide by those obligations.
- *
- * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
- * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
- * COMPLETENESS OR PERFORMANCE.
- * $/LicenseInfo$
- */
- #ifndef LL_LLVECTOR4A_H
- #define LL_LLVECTOR4A_H
- #include "hbintrinsics.h"
- #include "llmemory.h" // For LL_ALIGNED16_NEW_DELETE
- class LLRotation;
- ///////////////////////////////////
- // FIRST TIME USERS PLEASE READ
- //////////////////////////////////
- // This is just the beginning of LLVector4a. There are many more useful
- // functions yet to be implemented. For example, setNeg to negate a vector,
- // rotate() to apply a matrix rotation, various functions to manipulate only
- // the X, Y, and Z elements and many others (including a whole variety of
- // accessors). So if you don't see a function here that you need, please
- // contact Falcon or someone else with SSE experience (Richard, I think, has
- // some and davep has a little as of the time of this writing, July 08, 2010)
- // about getting it implemented before you resort to LLVector3/LLVector4.
- /////////////////////////////////
- class alignas(16) LLVector4a
- {
- public:
- LL_ALIGNED16_NEW_DELETE
- ///////////////////////////////////
- // STATIC METHODS
- ///////////////////////////////////
- // Call this method at startup to avoid 15,000+ cycle penalties from
- // denormalized numbers
- static void initClass()
- {
- #if LL_WINDOWS
- unsigned int current_word = 0;
- _controlfp_s(¤t_word, _DN_FLUSH, _MCW_DN);
- #endif
- #if defined(_MM_SET_DENORMALS_ZERO_MODE) && defined(_MM_DENORMALS_ZERO_ON)
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
- #endif
- #if defined(_MM_SET_FLUSH_ZERO_MODE) && defined(_MM_FLUSH_ZERO_ON)
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
- #endif
- _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
- }
- // Returns a vector of all zeros
- static LL_INLINE const LLVector4a& getZero()
- {
- extern const LLVector4a LL_V4A_ZERO;
- return LL_V4A_ZERO;
- }
- // Returns a vector of all epsilon, where epsilon is a small float suitable
- // for approximate equality checks
- static LL_INLINE const LLVector4a& getEpsilon()
- {
- extern const LLVector4a LL_V4A_EPSILON;
- return LL_V4A_EPSILON;
- }
- // Copies 16 bytes from src to dst. Source and destination must be 16-byte
- // aligned
- static LL_INLINE void copy4a(F32* dst, const F32* src)
- {
- _mm_store_ps(dst, _mm_load_ps(src));
- }
- // Copies words 16-byte blocks from src to dst. Source and destination must
- // not overlap. Source and dest must be 16-byte aligned and size must be
- // multiple of 16.
- static void memcpyNonAliased16(F32* __restrict dst,
- const F32* __restrict src, size_t bytes);
- ////////////////////////////////////
- // CONSTRUCTORS
- ////////////////////////////////////
- // DO NOT INITIALIZE: The overhead is completely unnecessary
- LL_INLINE LLVector4a()
- {
- }
- LL_INLINE LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
- {
- set(x, y, z, w);
- }
- LL_INLINE LLVector4a(F32 x)
- {
- splat(x);
- }
- LL_INLINE LLVector4a(const LLSimdScalar& x)
- {
- splat(x);
- }
- LL_INLINE LLVector4a(LLQuad q)
- {
- mQ = q;
- }
- ////////////////////////////////////
- // LOAD/STORE
- ////////////////////////////////////
- // Loads from 16-byte aligned src array (preferred method of loading)
- LL_INLINE void load4a(const F32* src)
- {
- mQ = _mm_load_ps(src);
- }
- // Loads from unaligned src array (NB: significantly slower than load4a)
- LL_INLINE void loadua(const F32* src)
- {
- mQ = _mm_loadu_ps(src);
- }
- // Loads only three floats beginning at address 'src'. Slowest method.
- LL_INLINE void load3(const F32* src, const F32 w = 0.f)
- {
- // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
- // NB: This differs from the convention of { Z, Y, X, W }
- mQ = _mm_set_ps(w, src[2], src[1], src[0]);
- }
- // Stores to a 16-byte aligned memory address
- LL_INLINE void store4a(F32* dst) const
- {
- _mm_store_ps(dst, mQ);
- }
- ////////////////////////////////////
- // BASIC GET/SET
- ////////////////////////////////////
- // Returns a "this" as an F32 pointer.
- LL_INLINE F32* getF32ptr()
- {
- return (F32*)&mQ;
- }
- // Returns a "this" as a const F32 pointer.
- LL_INLINE const F32* const getF32ptr() const
- {
- return (const F32* const)&mQ;
- }
- // Read-only access to single float in this vector. Do not use in proximity
- // to any function call that manipulates the data at the whole vector level
- // or you will incur a substantial penalty. Consider using the splat
- // functions instead
- LL_INLINE F32 operator[](const S32 idx) const
- {
- return ((F32*)&mQ)[idx];
- }
- // Prefer this method for read-only access to a single element. Prefer the
- // templated version if the elem is known at compile time.
- LL_INLINE LLSimdScalar getScalarAt(S32 idx) const;
- // Prefer this method for read-only access to a single element. Prefer the
- // templated version if the elem is known at compile time.
- template <int N> LL_INLINE LLSimdScalar getScalarAt() const;
- // Sets to an x, y, z and optional w provided
- LL_INLINE void set(F32 x, F32 y, F32 z, F32 w = 0.f)
- {
- mQ = _mm_set_ps(w, z, y, x);
- }
- // Sets to all zeros. This is preferred to using ::getZero()
- LL_INLINE void clear()
- {
- mQ = LLVector4a::getZero().mQ;
- }
- // Sets all elements to 'x'
- LL_INLINE void splat(F32 x)
- {
- mQ = _mm_set1_ps(x);
- }
- // Sets all elements to 'x'
- LL_INLINE void splat(const LLSimdScalar& x);
- // Sets all 4 elements to element N of src, with N known at compile time
- template <int N> LL_INLINE void splat(const LLVector4a& src);
- // Sets all 4 elements to element i of v, with i NOT known at compile time
- LL_INLINE void splat(const LLVector4a& v, U32 i);
- // Sets element N to that of src's element N. Much cleaner than
- // {LLVector4Logical mask; mask.clear(); mask.setElement<N>();
- // target.setSelectWithMask(mask,src,target);}
- template <int N> LL_INLINE void copyComponent(const LLVector4a& src);
- // Selects bits from src_if_true and src_if_false according to bits in mask
- LL_INLINE void setSelectWithMask(const LLVector4Logical& mask,
- const LLVector4a& src_if_true,
- const LLVector4a& src_if_false);
- ////////////////////////////////////
- // ALGEBRAIC
- ////////////////////////////////////
- // Sets this to the element-wise (a + b)
- LL_INLINE void setAdd(const LLVector4a& a, const LLVector4a& b)
- {
- mQ = _mm_add_ps(a.mQ, b.mQ);
- }
- // Sets this to element-wise (a - b)
- LL_INLINE void setSub(const LLVector4a& a, const LLVector4a& b)
- {
- mQ = _mm_sub_ps(a.mQ, b.mQ);
- }
- // Sets this to element-wise multiply (a * b)
- LL_INLINE void setMul(const LLVector4a& a, const LLVector4a& b)
- {
- mQ = _mm_mul_ps(a.mQ, b.mQ);
- }
- // Sets this to element-wise quotient (a / b)
- LL_INLINE void setDiv(const LLVector4a& a, const LLVector4a& b)
- {
- mQ = _mm_div_ps(a.mQ, b.mQ);
- }
- // Sets this to the element-wise absolute value of src
- LL_INLINE void setAbs(const LLVector4a& src)
- {
- alignas(16) thread_local const U32 F_ABS_MASK_4A[4] = {
- 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
- };
- mQ = _mm_and_ps(src.mQ,
- *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
- }
- // Adds to each component in this vector the corresponding component in rhs
- LL_INLINE void add(const LLVector4a& rhs)
- {
- mQ = _mm_add_ps(mQ, rhs.mQ);
- }
- // Subtracts from each component in this vector the corresponding component
- // in rhs
- LL_INLINE void sub(const LLVector4a& rhs)
- {
- mQ = _mm_sub_ps(mQ, rhs.mQ);
- }
- // Multiplies each component in this vector by the corresponding component
- // in rhs
- LL_INLINE void mul(const LLVector4a& rhs)
- {
- mQ = _mm_mul_ps(mQ, rhs.mQ);
- }
- // Divides each component in this vector by the corresponding component in
- // rhs
- LL_INLINE void div(const LLVector4a& rhs)
- {
- // *TODO: Check accuracy, maybe add divFast
- mQ = _mm_div_ps(mQ, rhs.mQ);
- }
- // Multiplies this vector by x in a scalar fashion
- LL_INLINE void mul(F32 x)
- {
- LLVector4a t;
- t.splat(x);
- mQ = _mm_mul_ps(mQ, t.mQ);
- }
- // Sets this to (a x b) (geometric cross-product)
- LL_INLINE void setCross3(const LLVector4a& a, const LLVector4a& b);
- // Sets all elements to the dot product of the x, y, and z elements in a
- // and b
- LL_INLINE void setAllDot3(const LLVector4a& a, const LLVector4a& b);
- // Sets all elements to the dot product of the x, y, z, and w elements in a
- // and b
- LL_INLINE void setAllDot4(const LLVector4a& a, const LLVector4a& b);
- // Returns the 3D dot product of this vector and b
- LL_INLINE LLSimdScalar dot3(const LLVector4a& b) const;
- // Returns the 4D dot product of this vector and b
- LL_INLINE LLSimdScalar dot4(const LLVector4a& b) const;
- // Normalizes this vector with respect to the x, y, and z components only.
- // Accurate to 22 bits of precision. W component is destroyed.
- // Note that this does not consider zero length vectors !
- LL_INLINE void normalize3();
- // Same as normalize3() but with respect to all 4 components
- LL_INLINE void normalize4();
- // Same as normalize3(), but returns length as a SIMD scalar
- LL_INLINE LLSimdScalar normalize3withLength();
- // Normalizes this vector with respect to the x, y, and z components only.
- // Accurate only to 10-12 bits of precision. W component is destroyed.
- // Note that this does not consider zero length vectors !
- LL_INLINE void normalize3fast();
- LL_INLINE void normalize3fast_checked(LLVector4a* d = NULL);
- // Returns true if this vector is normalized with respect to x,y,z up to
- // tolerance
- LL_INLINE LLBool32 isNormalized3(F32 tolerance = 1e-3) const;
- // Returns true if this vector is normalized with respect to all components
- // up to tolerance
- LL_INLINE LLBool32 isNormalized4(F32 tolerance = 1e-3) const;
- // Sets all elements to the length of vector 'v'
- LL_INLINE void setAllLength3(const LLVector4a& v)
- {
- LLVector4a len_squared;
- len_squared.setAllDot3(v, v);
- mQ = _mm_sqrt_ps(len_squared.mQ);
- }
- // Gets this vector's length
- LL_INLINE LLSimdScalar getLength3() const
- {
- return _mm_sqrt_ss(dot3((const LLVector4a)mQ));
- }
- // Sets the components of this vector to the minimum of the corresponding
- // components of lhs and rhs
- LL_INLINE void setMin(const LLVector4a& lhs, const LLVector4a& rhs)
- {
- mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
- }
- // Sets the components of this vector to the maximum of the corresponding
- // components of lhs and rhs
- LL_INLINE void setMax(const LLVector4a& lhs, const LLVector4a& rhs)
- {
- mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
- }
- // Clamps this vector to be within the component-wise range low to high
- // (inclusive)
- LL_INLINE void clamp(const LLVector4a& low, const LLVector4a& high);
- // Sets this to (c * lhs) + rhs * (1 - c)
- LL_INLINE void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
- {
- LLVector4a t;
- t.setSub(rhs, lhs);
- t.mul(c);
- setAdd(lhs, t);
- }
- // Returns true (nonzero) if x, y, z (and w for Finite4) are all finite
- // floats
- LL_INLINE LLBool32 isFinite3() const;
- LL_INLINE LLBool32 isFinite4() const;
- // Sets this vector to 'vec' rotated by the LLRotation or LLQuaternion2
- // provided
- void setRotated(const LLRotation& rot, const LLVector4a& vec);
- void setRotated(const class LLQuaternion2& quat, const LLVector4a& vec);
- // Sets this vector to 'vec' rotated by the INVERSE of the LLRotation or
- // LLQuaternion2 provided
- LL_INLINE void setRotatedInv(const LLRotation& rot, const LLVector4a& vec);
- LL_INLINE void setRotatedInv(const class LLQuaternion2& quat,
- const LLVector4a& vec);
- // Quantizes this vector to 8 or 16 bit precision
- void quantize8(const LLVector4a& low, const LLVector4a& high);
- void quantize16(const LLVector4a& low, const LLVector4a& high);
- LL_INLINE void negate()
- {
- alignas(16) thread_local const U32 sign_mask[] =
- { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
- mQ = _mm_xor_ps(*reinterpret_cast<const LLQuad*>(sign_mask), mQ);
- }
- ////////////////////////////////////
- // LOGICAL
- ////////////////////////////////////
- // The methods in this section will compare the elements in this vector to
- // those in rhs and return an LLVector4Logical with all bits set in
- // elements where the comparison was true and all bits unset in elements
- // where the comparison was false. See llvector4logica.h
- ////////////////////////////////////
- // WARNING: Other than equals3 and equals4, these functions do NOT account
- // for floating point tolerance. You should include the appropriate
- // tolerance in the inputs.
- ////////////////////////////////////
- LL_INLINE LLVector4Logical greaterThan(const LLVector4a& rhs) const
- {
- return _mm_cmpgt_ps(mQ, rhs.mQ);
- }
- LL_INLINE LLVector4Logical lessThan(const LLVector4a& rhs) const
- {
- return _mm_cmplt_ps(mQ, rhs.mQ);
- }
- LL_INLINE LLVector4Logical greaterEqual(const LLVector4a& rhs) const
- {
- return _mm_cmpge_ps(mQ, rhs.mQ);
- }
- LL_INLINE LLVector4Logical lessEqual(const LLVector4a& rhs) const
- {
- return _mm_cmple_ps(mQ, rhs.mQ);
- }
- LL_INLINE LLVector4Logical equal(const LLVector4a& rhs) const
- {
- return _mm_cmpeq_ps(mQ, rhs.mQ);
- }
- // Returns true if this and rhs are componentwise equal up to the specified
- // absolute tolerance
- LL_INLINE bool equals4(const LLVector4a& rhs,
- F32 tolerance = F_APPROXIMATELY_ZERO) const;
- LL_INLINE bool equals3(const LLVector4a& rhs,
- F32 tolerance = F_APPROXIMATELY_ZERO) const;
- LL_INLINE bool operator==(const LLVector4a& rhs) const
- {
- return equals4(rhs);
- }
- LL_INLINE bool operator!=(const LLVector4a& rhs) const
- {
- return !equals4(rhs);
- }
- ////////////////////////////////////
- // OPERATORS
- ////////////////////////////////////
- // Do NOT add aditional operators without consulting someone with SSE
- // experience
- LL_INLINE const LLVector4a& operator=(const LLVector4a& rhs)
- {
- mQ = rhs.mQ;
- return *this;
- }
- LL_INLINE const LLVector4a& operator=(const LLQuad& rhs)
- {
- mQ = rhs;
- return *this;
- }
- LL_INLINE operator LLQuad() const
- {
- return mQ;
- }
- private:
- LLQuad mQ;
- };
- LL_INLINE void update_min_max(LLVector4a& min, LLVector4a& max,
- const LLVector4a& p)
- {
- min.setMin(min, p);
- max.setMax(max, p);
- }
- #endif
|