/** * @file llmatrix3a.cpp * @brief LLMatrix3a class implementation - memory aligned and vectorized 3x3 matrix * * $LicenseInfo:firstyear=2010&license=viewergpl$ * * Copyright (C) 2010, Linden Research, Inc. * * Second Life Viewer Source Code * The source code in this file ("Source Code") is provided by Linden Lab * to you under the terms of the GNU General Public License, version 2.0 * ("GPL"), unless you have obtained a separate licensing agreement * ("Other License"), formally executed by you and Linden Lab. Terms of * the GPL can be found in doc/GPL-license.txt in this distribution, or * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 * * There are special exceptions to the terms and conditions of the GPL as * it is applied to this Source Code. View the full text of the exception * in the file doc/FLOSS-exception.txt in this software distribution, or * online at * http://secondlifegrid.net/programs/open_source/licensing/flossexception * * By copying, modifying or distributing this software, you acknowledge * that you have read and understood your obligations described above, * and agree to abide by those obligations. * * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, * COMPLETENESS OR PERFORMANCE. * $/LicenseInfo$ */ #include "linden_common.h" #include "llmath.h" alignas(16) static const F32 M_IDENT_3A[12] = { 1.f, 0.f, 0.f, 0.f, // Column 1 0.f, 1.f, 0.f, 0.f, // Column 2 0.f, 0.f, 1.f, 0.f // Column 3 }; extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast(M_IDENT_3A); void LLMatrix3a::setMul(const LLMatrix3a& lhs, const LLMatrix3a& rhs) { const LLVector4a col0 = lhs.getColumn(0); const LLVector4a col1 = lhs.getColumn(1); const LLVector4a col2 = lhs.getColumn(2); for (S32 i = 0; i < 3; ++i) { LLVector4a xxxx = _mm_load_ss(rhs.mColumns[i].getF32ptr()); xxxx.splat<0>(xxxx); xxxx.mul(col0); { LLVector4a yyyy = _mm_load_ss(rhs.mColumns[i].getF32ptr() + 1); yyyy.splat<0>(yyyy); yyyy.mul(col1); xxxx.add(yyyy); } { LLVector4a zzzz = _mm_load_ss(rhs.mColumns[i].getF32ptr() + 2); zzzz.splat<0>(zzzz); zzzz.mul(col2); xxxx.add(zzzz); } xxxx.store4a(mColumns[i].getF32ptr()); } } //static void LLMatrix3a::batchTransform(const LLMatrix3a& xform, const LLVector4a* src, int num_vecs, LLVector4a* dst) { const LLVector4a col0 = xform.getColumn(0); const LLVector4a col1 = xform.getColumn(1); const LLVector4a col2 = xform.getColumn(2); const LLVector4a* max_addr = src + num_vecs; if (num_vecs & 0x1) { LLVector4a xxxx = _mm_load_ss((const F32*)src); LLVector4a yyyy = _mm_load_ss((const F32*)src + 1); LLVector4a zzzz = _mm_load_ss((const F32*)src + 2); xxxx.splat<0>(xxxx); yyyy.splat<0>(yyyy); zzzz.splat<0>(zzzz); xxxx.mul(col0); yyyy.mul(col1); zzzz.mul(col2); xxxx.add(yyyy); xxxx.add(zzzz); xxxx.store4a((F32*)dst); src++; dst++; } num_vecs >>= 1; while (src < max_addr) { _mm_prefetch((const char*)(src + 32), _MM_HINT_NTA); _mm_prefetch((const char*)(dst + 32), _MM_HINT_NTA); LLVector4a xxxx = _mm_load_ss((const F32*)src); LLVector4a xxxx1= _mm_load_ss((const F32*)(src + 1)); xxxx.splat<0>(xxxx); xxxx1.splat<0>(xxxx1); xxxx.mul(col0); xxxx1.mul(col0); { LLVector4a yyyy = _mm_load_ss((const F32*)src + 1); LLVector4a yyyy1 = _mm_load_ss((const F32*)(src + 1) + 1); yyyy.splat<0>(yyyy); yyyy1.splat<0>(yyyy1); yyyy.mul(col1); yyyy1.mul(col1); xxxx.add(yyyy); xxxx1.add(yyyy1); } { LLVector4a zzzz = _mm_load_ss((const F32*)(src) + 2); LLVector4a zzzz1 = _mm_load_ss((const F32*)(++src) + 2); zzzz.splat<0>(zzzz); zzzz1.splat<0>(zzzz1); zzzz.mul(col2); zzzz1.mul(col2); xxxx.add(zzzz); xxxx1.add(zzzz1); } xxxx.store4a(dst->getF32ptr()); src++; dst++; xxxx1.store4a((F32*)dst++); } }