123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- /**
- * @file llmatrix3a.cpp
- * @brief LLMatrix3a class implementation - memory aligned and vectorized 3x3 matrix
- *
- * $LicenseInfo:firstyear=2010&license=viewergpl$
- *
- * Copyright (C) 2010, Linden Research, Inc.
- *
- * Second Life Viewer Source Code
- * The source code in this file ("Source Code") is provided by Linden Lab
- * to you under the terms of the GNU General Public License, version 2.0
- * ("GPL"), unless you have obtained a separate licensing agreement
- * ("Other License"), formally executed by you and Linden Lab. Terms of
- * the GPL can be found in doc/GPL-license.txt in this distribution, or
- * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
- *
- * There are special exceptions to the terms and conditions of the GPL as
- * it is applied to this Source Code. View the full text of the exception
- * in the file doc/FLOSS-exception.txt in this software distribution, or
- * online at
- * http://secondlifegrid.net/programs/open_source/licensing/flossexception
- *
- * By copying, modifying or distributing this software, you acknowledge
- * that you have read and understood your obligations described above,
- * and agree to abide by those obligations.
- *
- * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
- * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
- * COMPLETENESS OR PERFORMANCE.
- * $/LicenseInfo$
- */
- #include "linden_common.h"
- #include "llmath.h"
- alignas(16) static const F32 M_IDENT_3A[12] = {
- 1.f, 0.f, 0.f, 0.f, // Column 1
- 0.f, 1.f, 0.f, 0.f, // Column 2
- 0.f, 0.f, 1.f, 0.f // Column 3
- };
- extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*>(M_IDENT_3A);
- void LLMatrix3a::setMul(const LLMatrix3a& lhs, const LLMatrix3a& rhs)
- {
- const LLVector4a col0 = lhs.getColumn(0);
- const LLVector4a col1 = lhs.getColumn(1);
- const LLVector4a col2 = lhs.getColumn(2);
- for (S32 i = 0; i < 3; ++i)
- {
- LLVector4a xxxx = _mm_load_ss(rhs.mColumns[i].getF32ptr());
- xxxx.splat<0>(xxxx);
- xxxx.mul(col0);
- {
- LLVector4a yyyy = _mm_load_ss(rhs.mColumns[i].getF32ptr() + 1);
- yyyy.splat<0>(yyyy);
- yyyy.mul(col1);
- xxxx.add(yyyy);
- }
- {
- LLVector4a zzzz = _mm_load_ss(rhs.mColumns[i].getF32ptr() + 2);
- zzzz.splat<0>(zzzz);
- zzzz.mul(col2);
- xxxx.add(zzzz);
- }
- xxxx.store4a(mColumns[i].getF32ptr());
- }
- }
- //static
- void LLMatrix3a::batchTransform(const LLMatrix3a& xform, const LLVector4a* src,
- int num_vecs, LLVector4a* dst)
- {
- const LLVector4a col0 = xform.getColumn(0);
- const LLVector4a col1 = xform.getColumn(1);
- const LLVector4a col2 = xform.getColumn(2);
- const LLVector4a* max_addr = src + num_vecs;
- if (num_vecs & 0x1)
- {
- LLVector4a xxxx = _mm_load_ss((const F32*)src);
- LLVector4a yyyy = _mm_load_ss((const F32*)src + 1);
- LLVector4a zzzz = _mm_load_ss((const F32*)src + 2);
- xxxx.splat<0>(xxxx);
- yyyy.splat<0>(yyyy);
- zzzz.splat<0>(zzzz);
- xxxx.mul(col0);
- yyyy.mul(col1);
- zzzz.mul(col2);
- xxxx.add(yyyy);
- xxxx.add(zzzz);
- xxxx.store4a((F32*)dst);
- src++;
- dst++;
- }
- num_vecs >>= 1;
- while (src < max_addr)
- {
- _mm_prefetch((const char*)(src + 32), _MM_HINT_NTA);
- _mm_prefetch((const char*)(dst + 32), _MM_HINT_NTA);
- LLVector4a xxxx = _mm_load_ss((const F32*)src);
- LLVector4a xxxx1= _mm_load_ss((const F32*)(src + 1));
- xxxx.splat<0>(xxxx);
- xxxx1.splat<0>(xxxx1);
- xxxx.mul(col0);
- xxxx1.mul(col0);
- {
- LLVector4a yyyy = _mm_load_ss((const F32*)src + 1);
- LLVector4a yyyy1 = _mm_load_ss((const F32*)(src + 1) + 1);
- yyyy.splat<0>(yyyy);
- yyyy1.splat<0>(yyyy1);
- yyyy.mul(col1);
- yyyy1.mul(col1);
- xxxx.add(yyyy);
- xxxx1.add(yyyy1);
- }
- {
- LLVector4a zzzz = _mm_load_ss((const F32*)(src) + 2);
- LLVector4a zzzz1 = _mm_load_ss((const F32*)(++src) + 2);
- zzzz.splat<0>(zzzz);
- zzzz1.splat<0>(zzzz1);
- zzzz.mul(col2);
- zzzz1.mul(col2);
- xxxx.add(zzzz);
- xxxx1.add(zzzz1);
- }
- xxxx.store4a(dst->getF32ptr());
- src++;
- dst++;
- xxxx1.store4a((F32*)dst++);
- }
- }
|