// Source repository: GwynethLlewelyn/CoolVLViewer
							/**
 * @file llvector4a.cpp
 * @brief SIMD vector implementation
 *
 * $LicenseInfo:firstyear=2010&license=viewergpl$
 *
 * Copyright (C) 2010, Linden Research, Inc.
 *
 * Second Life Viewer Source Code
 * The source code in this file ("Source Code") is provided by Linden Lab
 * to you under the terms of the GNU General Public License, version 2.0
 * ("GPL"), unless you have obtained a separate licensing agreement
 * ("Other License"), formally executed by you and Linden Lab.  Terms of
 * the GPL can be found in doc/GPL-license.txt in this distribution, or
 * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
 *
 * There are special exceptions to the terms and conditions of the GPL as
 * it is applied to this Source Code. View the full text of the exception
 * in the file doc/FLOSS-exception.txt in this software distribution, or
 * online at
 * http://secondlifegrid.net/programs/open_source/licensing/flossexception
 *
 * By copying, modifying or distributing this software, you acknowledge
 * that you have read and understood your obligations described above,
 * and agree to abide by those obligations.
 *
 * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
 * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
 * COMPLETENESS OR PERFORMANCE.
 * $/LicenseInfo$
 */

#include "linden_common.h"

#include "llmath.h"
#include "llmemory.h"
#include "llquantize.h"

// Raw SIMD quad constants. The explicit 'extern' on these definitions gives
// the const objects external linkage, so other translation units can refer to
// them (presumably via declarations in a header - TODO confirm which one).

// All four lanes set to 0.
extern const LLQuad F_ZERO_4A = { 0.f, 0.f, 0.f, 0.f };
// All four lanes set to F_APPROXIMATELY_ZERO.
extern const LLQuad F_APPROXIMATELY_ZERO_4A =
{
	F_APPROXIMATELY_ZERO,
	F_APPROXIMATELY_ZERO,
	F_APPROXIMATELY_ZERO,
	F_APPROXIMATELY_ZERO
};

// LLVector4a flavours of the two quads above: each one is copy-initialized
// from the corresponding LLQuad viewed through a reference cast.
// NOTE(review): this relies on LLVector4a being layout-compatible with LLQuad
// - confirm against the class declaration.
extern const LLVector4a LL_V4A_ZERO =
	reinterpret_cast<const LLVector4a&> (F_ZERO_4A);
extern const LLVector4a LL_V4A_EPSILON =
	reinterpret_cast<const LLVector4a&> (F_APPROXIMATELY_ZERO_4A);

//static
// Copies 'bytes' bytes from 'src' to 'dst' by forwarding both buffers, viewed
// as raw bytes, to ll_memcpy_nonaliased_aligned_16(). The __restrict
// qualifiers mean the caller promises that the two buffers do not overlap.
// NOTE(review): the helper's name suggests that both buffers must be 16-byte
// aligned - confirm against its definition.
void LLVector4a::memcpyNonAliased16(F32* __restrict dst,
									const F32* __restrict src,
									size_t bytes)
{
	char* dst_bytes = reinterpret_cast<char*>(dst);
	const char* src_bytes = reinterpret_cast<const char*>(src);
	ll_memcpy_nonaliased_aligned_16(dst_bytes, src_bytes, bytes);
}

void LLVector4a::setRotated(const LLRotation& rot, const LLVector4a& vec)
{
	const LLVector4a col0 = rot.getColumn(0);
	const LLVector4a col1 = rot.getColumn(1);
	const LLVector4a col2 = rot.getColumn(2);

	LLVector4a result = _mm_load_ss(vec.getF32ptr());
	result.splat<0>(result);
	result.mul(col0);

	{
		LLVector4a yyyy = _mm_load_ss(vec.getF32ptr() +  1);
		yyyy.splat<0>(yyyy);
		yyyy.mul(col1);
		result.add(yyyy);
	}

	{
		LLVector4a zzzz = _mm_load_ss(vec.getF32ptr() +  2);
		zzzz.splat<0>(zzzz);
		zzzz.mul(col2);
		result.add(zzzz);
	}

	*this = result;
}

void LLVector4a::setRotated(const LLQuaternion2& quat, const LLVector4a& vec)
{
	const LLVector4a& quatVec = quat.getVector4a();
	LLVector4a temp; temp.setCross3(quatVec, vec);
	temp.add(temp);

	const LLVector4a realPart(quatVec.getScalarAt<3>());
	LLVector4a tempTimesReal; tempTimesReal.setMul(temp, realPart);

	mQ = vec;
	add(tempTimesReal);

	LLVector4a imagCrossTemp; imagCrossTemp.setCross3(quatVec, temp);
	add(imagCrossTemp);
}

void LLVector4a::quantize8(const LLVector4a& low, const LLVector4a& high)
{
	LLVector4a val(mQ);
	LLVector4a delta; delta.setSub(high, low);

	{
		val.clamp(low, high);
		val.sub(low);

		// 8-bit quantization means we can do with just 12 bits of reciprocal accuracy
		const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
// 		{
// 			thread_local alignas(16) const F32 F_TWO_4A[4] = { 2.f, 2.f, 2.f, 2.f };
// 			LLVector4a two; two.load4a(F_TWO_4A);
//
// 			// Here we use _mm_rcp_ps plus one round of newton-raphson
// 			// We wish to find 'x' such that x = 1/delta
// 			// As a first approximation, we take x0 = _mm_rcp_ps(delta)
// 			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * (2 - a * x0)
// 			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
// 			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
// 			oneOverDelta.setMul(delta, recipApprox);
// 			oneOverDelta.setSub(two, oneOverDelta);
// 			oneOverDelta.mul(recipApprox);
// 		}

		val.mul(oneOverDelta);
		val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));
	}

	val = _mm_cvtepi32_ps(_mm_cvtps_epi32(val.mQ));

	{
		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
		val.mul(delta);
		val.add(low);
	}

	{
		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
		LLVector4a absVal; absVal.setAbs(val);
		setSelectWithMask(absVal.lessThan(maxError), F_ZERO_4A, val);
	}
}

void LLVector4a::quantize16(const LLVector4a& low, const LLVector4a& high)
{
	LLVector4a val(mQ);
	LLVector4a delta; delta.setSub(high, low);

	{
		val.clamp(low, high);
		val.sub(low);

		// 16-bit quantization means we need a round of Newton-Raphson
		LLVector4a oneOverDelta;
		{
			alignas(16) thread_local const F32 F_TWO_4A[4] = { 2.f, 2.f, 2.f, 2.f };
			ll_assert_aligned(F_TWO_4A, 16);

			LLVector4a two; two.load4a(F_TWO_4A);

			// Here we use _mm_rcp_ps plus one round of newton-raphson
			// We wish to find 'x' such that x = 1/delta
			// As a first approximation, we take x0 = _mm_rcp_ps(delta)
			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * (2 - a * x0)
			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
			oneOverDelta.setMul(delta, recipApprox);
			oneOverDelta.setSub(two, oneOverDelta);
			oneOverDelta.mul(recipApprox);
		}

		val.mul(oneOverDelta);
		val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));
	}

	val = _mm_cvtepi32_ps(_mm_cvtps_epi32(val.mQ));

	{
		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
		val.mul(delta);
		val.add(low);
	}

	{
		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
		LLVector4a absVal; absVal.setAbs(val);
		setSelectWithMask(absVal.lessThan(maxError), F_ZERO_4A, val);
	}
}