llmatrix3a.cpp 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. /**
  2. * @file llmatrix3a.cpp
  3. * @brief LLMatrix3a class implementation - memory aligned and vectorized 3x3 matrix
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewergpl$
  6. *
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * Second Life Viewer Source Code
  10. * The source code in this file ("Source Code") is provided by Linden Lab
  11. * to you under the terms of the GNU General Public License, version 2.0
  12. * ("GPL"), unless you have obtained a separate licensing agreement
  13. * ("Other License"), formally executed by you and Linden Lab. Terms of
  14. * the GPL can be found in doc/GPL-license.txt in this distribution, or
  15. * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
  16. *
  17. * There are special exceptions to the terms and conditions of the GPL as
  18. * it is applied to this Source Code. View the full text of the exception
  19. * in the file doc/FLOSS-exception.txt in this software distribution, or
  20. * online at
  21. * http://secondlifegrid.net/programs/open_source/licensing/flossexception
  22. *
  23. * By copying, modifying or distributing this software, you acknowledge
  24. * that you have read and understood your obligations described above,
  25. * and agree to abide by those obligations.
  26. *
  27. * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
  28. * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
  29. * COMPLETENESS OR PERFORMANCE.
  30. * $/LicenseInfo$
  31. */
  32. #include "linden_common.h"
  33. #include "llmath.h"
  34. alignas(16) static const F32 M_IDENT_3A[12] = {
  35. 1.f, 0.f, 0.f, 0.f, // Column 1
  36. 0.f, 1.f, 0.f, 0.f, // Column 2
  37. 0.f, 0.f, 1.f, 0.f // Column 3
  38. };
  39. extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*>(M_IDENT_3A);
  40. void LLMatrix3a::setMul(const LLMatrix3a& lhs, const LLMatrix3a& rhs)
  41. {
  42. const LLVector4a col0 = lhs.getColumn(0);
  43. const LLVector4a col1 = lhs.getColumn(1);
  44. const LLVector4a col2 = lhs.getColumn(2);
  45. for (S32 i = 0; i < 3; ++i)
  46. {
  47. LLVector4a xxxx = _mm_load_ss(rhs.mColumns[i].getF32ptr());
  48. xxxx.splat<0>(xxxx);
  49. xxxx.mul(col0);
  50. {
  51. LLVector4a yyyy = _mm_load_ss(rhs.mColumns[i].getF32ptr() + 1);
  52. yyyy.splat<0>(yyyy);
  53. yyyy.mul(col1);
  54. xxxx.add(yyyy);
  55. }
  56. {
  57. LLVector4a zzzz = _mm_load_ss(rhs.mColumns[i].getF32ptr() + 2);
  58. zzzz.splat<0>(zzzz);
  59. zzzz.mul(col2);
  60. xxxx.add(zzzz);
  61. }
  62. xxxx.store4a(mColumns[i].getF32ptr());
  63. }
  64. }
  65. //static
  66. void LLMatrix3a::batchTransform(const LLMatrix3a& xform, const LLVector4a* src,
  67. int num_vecs, LLVector4a* dst)
  68. {
  69. const LLVector4a col0 = xform.getColumn(0);
  70. const LLVector4a col1 = xform.getColumn(1);
  71. const LLVector4a col2 = xform.getColumn(2);
  72. const LLVector4a* max_addr = src + num_vecs;
  73. if (num_vecs & 0x1)
  74. {
  75. LLVector4a xxxx = _mm_load_ss((const F32*)src);
  76. LLVector4a yyyy = _mm_load_ss((const F32*)src + 1);
  77. LLVector4a zzzz = _mm_load_ss((const F32*)src + 2);
  78. xxxx.splat<0>(xxxx);
  79. yyyy.splat<0>(yyyy);
  80. zzzz.splat<0>(zzzz);
  81. xxxx.mul(col0);
  82. yyyy.mul(col1);
  83. zzzz.mul(col2);
  84. xxxx.add(yyyy);
  85. xxxx.add(zzzz);
  86. xxxx.store4a((F32*)dst);
  87. src++;
  88. dst++;
  89. }
  90. num_vecs >>= 1;
  91. while (src < max_addr)
  92. {
  93. _mm_prefetch((const char*)(src + 32), _MM_HINT_NTA);
  94. _mm_prefetch((const char*)(dst + 32), _MM_HINT_NTA);
  95. LLVector4a xxxx = _mm_load_ss((const F32*)src);
  96. LLVector4a xxxx1= _mm_load_ss((const F32*)(src + 1));
  97. xxxx.splat<0>(xxxx);
  98. xxxx1.splat<0>(xxxx1);
  99. xxxx.mul(col0);
  100. xxxx1.mul(col0);
  101. {
  102. LLVector4a yyyy = _mm_load_ss((const F32*)src + 1);
  103. LLVector4a yyyy1 = _mm_load_ss((const F32*)(src + 1) + 1);
  104. yyyy.splat<0>(yyyy);
  105. yyyy1.splat<0>(yyyy1);
  106. yyyy.mul(col1);
  107. yyyy1.mul(col1);
  108. xxxx.add(yyyy);
  109. xxxx1.add(yyyy1);
  110. }
  111. {
  112. LLVector4a zzzz = _mm_load_ss((const F32*)(src) + 2);
  113. LLVector4a zzzz1 = _mm_load_ss((const F32*)(++src) + 2);
  114. zzzz.splat<0>(zzzz);
  115. zzzz1.splat<0>(zzzz1);
  116. zzzz.mul(col2);
  117. zzzz1.mul(col2);
  118. xxxx.add(zzzz);
  119. xxxx1.add(zzzz1);
  120. }
  121. xxxx.store4a(dst->getF32ptr());
  122. src++;
  123. dst++;
  124. xxxx1.store4a((F32*)dst++);
  125. }
  126. }