/** * @file llvector4a.h * @brief LLVector4a class header file - memory aligned and vectorized 4 component vector * * $LicenseInfo:firstyear=2010&license=viewergpl$ * * Copyright (C) 2010, Linden Research, Inc. * * Second Life Viewer Source Code * The source code in this file ("Source Code") is provided by Linden Lab * to you under the terms of the GNU General Public License, version 2.0 * ("GPL"), unless you have obtained a separate licensing agreement * ("Other License"), formally executed by you and Linden Lab. Terms of * the GPL can be found in doc/GPL-license.txt in this distribution, or * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 * * There are special exceptions to the terms and conditions of the GPL as * it is applied to this Source Code. View the full text of the exception * in the file doc/FLOSS-exception.txt in this software distribution, or * online at * http://secondlifegrid.net/programs/open_source/licensing/flossexception * * By copying, modifying or distributing this software, you acknowledge * that you have read and understood your obligations described above, * and agree to abide by those obligations. * * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, * COMPLETENESS OR PERFORMANCE. * $/LicenseInfo$ */ #ifndef LL_LLVECTOR4A_H #define LL_LLVECTOR4A_H #include "hbintrinsics.h" #include "llmemory.h" // For LL_ALIGNED16_NEW_DELETE class LLRotation; /////////////////////////////////// // FIRST TIME USERS PLEASE READ ////////////////////////////////// // This is just the beginning of LLVector4a. There are many more useful // functions yet to be implemented. For example, setNeg to negate a vector, // rotate() to apply a matrix rotation, various functions to manipulate only // the X, Y, and Z elements and many others (including a whole variety of // accessors). So if you don't see a function here that you need, please // contact Falcon or someone else with SSE experience (Richard, I think, has // some and davep has a little as of the time of this writing, July 08, 2010) // about getting it implemented before you resort to LLVector3/LLVector4. ///////////////////////////////// class alignas(16) LLVector4a { public: LL_ALIGNED16_NEW_DELETE /////////////////////////////////// // STATIC METHODS /////////////////////////////////// // Call this method at startup to avoid 15,000+ cycle penalties from // denormalized numbers static void initClass() { #if LL_WINDOWS unsigned int current_word = 0; _controlfp_s(¤t_word, _DN_FLUSH, _MCW_DN); #endif #if defined(_MM_SET_DENORMALS_ZERO_MODE) && defined(_MM_DENORMALS_ZERO_ON) _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); #endif #if defined(_MM_SET_FLUSH_ZERO_MODE) && defined(_MM_FLUSH_ZERO_ON) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); #endif _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); } // Returns a vector of all zeros static LL_INLINE const LLVector4a& getZero() { extern const LLVector4a LL_V4A_ZERO; return LL_V4A_ZERO; } // Returns a vector of all epsilon, where epsilon is a small float suitable // for approximate equality checks static LL_INLINE const LLVector4a& getEpsilon() { extern const LLVector4a LL_V4A_EPSILON; return LL_V4A_EPSILON; } // Copies 16 bytes from src to dst. Source and destination must be 16-byte // aligned static LL_INLINE void copy4a(F32* dst, const F32* src) { _mm_store_ps(dst, _mm_load_ps(src)); } // Copies words 16-byte blocks from src to dst. Source and destination must // not overlap. Source and dest must be 16-byte aligned and size must be // multiple of 16. static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes); //////////////////////////////////// // CONSTRUCTORS //////////////////////////////////// // DO NOT INITIALIZE: The overhead is completely unnecessary LL_INLINE LLVector4a() { } LL_INLINE LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f) { set(x, y, z, w); } LL_INLINE LLVector4a(F32 x) { splat(x); } LL_INLINE LLVector4a(const LLSimdScalar& x) { splat(x); } LL_INLINE LLVector4a(LLQuad q) { mQ = q; } //////////////////////////////////// // LOAD/STORE //////////////////////////////////// // Loads from 16-byte aligned src array (preferred method of loading) LL_INLINE void load4a(const F32* src) { mQ = _mm_load_ps(src); } // Loads from unaligned src array (NB: significantly slower than load4a) LL_INLINE void loadua(const F32* src) { mQ = _mm_loadu_ps(src); } // Loads only three floats beginning at address 'src'. Slowest method. LL_INLINE void load3(const F32* src, const F32 w = 0.f) { // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X } // NB: This differs from the convention of { Z, Y, X, W } mQ = _mm_set_ps(w, src[2], src[1], src[0]); } // Stores to a 16-byte aligned memory address LL_INLINE void store4a(F32* dst) const { _mm_store_ps(dst, mQ); } //////////////////////////////////// // BASIC GET/SET //////////////////////////////////// // Returns a "this" as an F32 pointer. LL_INLINE F32* getF32ptr() { return (F32*)&mQ; } // Returns a "this" as a const F32 pointer. LL_INLINE const F32* const getF32ptr() const { return (const F32* const)&mQ; } // Read-only access to single float in this vector. Do not use in proximity // to any function call that manipulates the data at the whole vector level // or you will incur a substantial penalty. Consider using the splat // functions instead LL_INLINE F32 operator[](const S32 idx) const { return ((F32*)&mQ)[idx]; } // Prefer this method for read-only access to a single element. Prefer the // templated version if the elem is known at compile time. LL_INLINE LLSimdScalar getScalarAt(S32 idx) const; // Prefer this method for read-only access to a single element. Prefer the // templated version if the elem is known at compile time. template LL_INLINE LLSimdScalar getScalarAt() const; // Sets to an x, y, z and optional w provided LL_INLINE void set(F32 x, F32 y, F32 z, F32 w = 0.f) { mQ = _mm_set_ps(w, z, y, x); } // Sets to all zeros. This is preferred to using ::getZero() LL_INLINE void clear() { mQ = LLVector4a::getZero().mQ; } // Sets all elements to 'x' LL_INLINE void splat(F32 x) { mQ = _mm_set1_ps(x); } // Sets all elements to 'x' LL_INLINE void splat(const LLSimdScalar& x); // Sets all 4 elements to element N of src, with N known at compile time template LL_INLINE void splat(const LLVector4a& src); // Sets all 4 elements to element i of v, with i NOT known at compile time LL_INLINE void splat(const LLVector4a& v, U32 i); // Sets element N to that of src's element N. Much cleaner than // {LLVector4Logical mask; mask.clear(); mask.setElement(); // target.setSelectWithMask(mask,src,target);} template LL_INLINE void copyComponent(const LLVector4a& src); // Selects bits from src_if_true and src_if_false according to bits in mask LL_INLINE void setSelectWithMask(const LLVector4Logical& mask, const LLVector4a& src_if_true, const LLVector4a& src_if_false); //////////////////////////////////// // ALGEBRAIC //////////////////////////////////// // Sets this to the element-wise (a + b) LL_INLINE void setAdd(const LLVector4a& a, const LLVector4a& b) { mQ = _mm_add_ps(a.mQ, b.mQ); } // Sets this to element-wise (a - b) LL_INLINE void setSub(const LLVector4a& a, const LLVector4a& b) { mQ = _mm_sub_ps(a.mQ, b.mQ); } // Sets this to element-wise multiply (a * b) LL_INLINE void setMul(const LLVector4a& a, const LLVector4a& b) { mQ = _mm_mul_ps(a.mQ, b.mQ); } // Sets this to element-wise quotient (a / b) LL_INLINE void setDiv(const LLVector4a& a, const LLVector4a& b) { mQ = _mm_div_ps(a.mQ, b.mQ); } // Sets this to the element-wise absolute value of src LL_INLINE void setAbs(const LLVector4a& src) { alignas(16) thread_local const U32 F_ABS_MASK_4A[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; mQ = _mm_and_ps(src.mQ, *reinterpret_cast(F_ABS_MASK_4A)); } // Adds to each component in this vector the corresponding component in rhs LL_INLINE void add(const LLVector4a& rhs) { mQ = _mm_add_ps(mQ, rhs.mQ); } // Subtracts from each component in this vector the corresponding component // in rhs LL_INLINE void sub(const LLVector4a& rhs) { mQ = _mm_sub_ps(mQ, rhs.mQ); } // Multiplies each component in this vector by the corresponding component // in rhs LL_INLINE void mul(const LLVector4a& rhs) { mQ = _mm_mul_ps(mQ, rhs.mQ); } // Divides each component in this vector by the corresponding component in // rhs LL_INLINE void div(const LLVector4a& rhs) { // *TODO: Check accuracy, maybe add divFast mQ = _mm_div_ps(mQ, rhs.mQ); } // Multiplies this vector by x in a scalar fashion LL_INLINE void mul(F32 x) { LLVector4a t; t.splat(x); mQ = _mm_mul_ps(mQ, t.mQ); } // Sets this to (a x b) (geometric cross-product) LL_INLINE void setCross3(const LLVector4a& a, const LLVector4a& b); // Sets all elements to the dot product of the x, y, and z elements in a // and b LL_INLINE void setAllDot3(const LLVector4a& a, const LLVector4a& b); // Sets all elements to the dot product of the x, y, z, and w elements in a // and b LL_INLINE void setAllDot4(const LLVector4a& a, const LLVector4a& b); // Returns the 3D dot product of this vector and b LL_INLINE LLSimdScalar dot3(const LLVector4a& b) const; // Returns the 4D dot product of this vector and b LL_INLINE LLSimdScalar dot4(const LLVector4a& b) const; // Normalizes this vector with respect to the x, y, and z components only. // Accurate to 22 bits of precision. W component is destroyed. // Note that this does not consider zero length vectors ! LL_INLINE void normalize3(); // Same as normalize3() but with respect to all 4 components LL_INLINE void normalize4(); // Same as normalize3(), but returns length as a SIMD scalar LL_INLINE LLSimdScalar normalize3withLength(); // Normalizes this vector with respect to the x, y, and z components only. // Accurate only to 10-12 bits of precision. W component is destroyed. // Note that this does not consider zero length vectors ! LL_INLINE void normalize3fast(); LL_INLINE void normalize3fast_checked(LLVector4a* d = NULL); // Returns true if this vector is normalized with respect to x,y,z up to // tolerance LL_INLINE LLBool32 isNormalized3(F32 tolerance = 1e-3) const; // Returns true if this vector is normalized with respect to all components // up to tolerance LL_INLINE LLBool32 isNormalized4(F32 tolerance = 1e-3) const; // Sets all elements to the length of vector 'v' LL_INLINE void setAllLength3(const LLVector4a& v) { LLVector4a len_squared; len_squared.setAllDot3(v, v); mQ = _mm_sqrt_ps(len_squared.mQ); } // Gets this vector's length LL_INLINE LLSimdScalar getLength3() const { return _mm_sqrt_ss(dot3((const LLVector4a)mQ)); } // Sets the components of this vector to the minimum of the corresponding // components of lhs and rhs LL_INLINE void setMin(const LLVector4a& lhs, const LLVector4a& rhs) { mQ = _mm_min_ps(lhs.mQ, rhs.mQ); } // Sets the components of this vector to the maximum of the corresponding // components of lhs and rhs LL_INLINE void setMax(const LLVector4a& lhs, const LLVector4a& rhs) { mQ = _mm_max_ps(lhs.mQ, rhs.mQ); } // Clamps this vector to be within the component-wise range low to high // (inclusive) LL_INLINE void clamp(const LLVector4a& low, const LLVector4a& high); // Sets this to (c * lhs) + rhs * (1 - c) LL_INLINE void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c) { LLVector4a t; t.setSub(rhs, lhs); t.mul(c); setAdd(lhs, t); } // Returns true (nonzero) if x, y, z (and w for Finite4) are all finite // floats LL_INLINE LLBool32 isFinite3() const; LL_INLINE LLBool32 isFinite4() const; // Sets this vector to 'vec' rotated by the LLRotation or LLQuaternion2 // provided void setRotated(const LLRotation& rot, const LLVector4a& vec); void setRotated(const class LLQuaternion2& quat, const LLVector4a& vec); // Sets this vector to 'vec' rotated by the INVERSE of the LLRotation or // LLQuaternion2 provided LL_INLINE void setRotatedInv(const LLRotation& rot, const LLVector4a& vec); LL_INLINE void setRotatedInv(const class LLQuaternion2& quat, const LLVector4a& vec); // Quantizes this vector to 8 or 16 bit precision void quantize8(const LLVector4a& low, const LLVector4a& high); void quantize16(const LLVector4a& low, const LLVector4a& high); LL_INLINE void negate() { alignas(16) thread_local const U32 sign_mask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; mQ = _mm_xor_ps(*reinterpret_cast(sign_mask), mQ); } //////////////////////////////////// // LOGICAL //////////////////////////////////// // The methods in this section will compare the elements in this vector to // those in rhs and return an LLVector4Logical with all bits set in // elements where the comparison was true and all bits unset in elements // where the comparison was false. See llvector4logica.h //////////////////////////////////// // WARNING: Other than equals3 and equals4, these functions do NOT account // for floating point tolerance. You should include the appropriate // tolerance in the inputs. //////////////////////////////////// LL_INLINE LLVector4Logical greaterThan(const LLVector4a& rhs) const { return _mm_cmpgt_ps(mQ, rhs.mQ); } LL_INLINE LLVector4Logical lessThan(const LLVector4a& rhs) const { return _mm_cmplt_ps(mQ, rhs.mQ); } LL_INLINE LLVector4Logical greaterEqual(const LLVector4a& rhs) const { return _mm_cmpge_ps(mQ, rhs.mQ); } LL_INLINE LLVector4Logical lessEqual(const LLVector4a& rhs) const { return _mm_cmple_ps(mQ, rhs.mQ); } LL_INLINE LLVector4Logical equal(const LLVector4a& rhs) const { return _mm_cmpeq_ps(mQ, rhs.mQ); } // Returns true if this and rhs are componentwise equal up to the specified // absolute tolerance LL_INLINE bool equals4(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO) const; LL_INLINE bool equals3(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO) const; LL_INLINE bool operator==(const LLVector4a& rhs) const { return equals4(rhs); } LL_INLINE bool operator!=(const LLVector4a& rhs) const { return !equals4(rhs); } //////////////////////////////////// // OPERATORS //////////////////////////////////// // Do NOT add aditional operators without consulting someone with SSE // experience LL_INLINE const LLVector4a& operator=(const LLVector4a& rhs) { mQ = rhs.mQ; return *this; } LL_INLINE const LLVector4a& operator=(const LLQuad& rhs) { mQ = rhs; return *this; } LL_INLINE operator LLQuad() const { return mQ; } private: LLQuad mQ; }; LL_INLINE void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p) { min.setMin(min, p); max.setMax(max, p); } #endif