mct.c

/*
 * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium
 * Copyright (c) 2002-2007, Professor Benoit Macq
 * Copyright (c) 2001-2003, David Janssens
 * Copyright (c) 2002-2003, Yannick Verschueren
 * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe
 * Copyright (c) 2005, Herve Drolon, FreeImage Team
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#if SSE2NEON
#  define SSE2NEON_ALLOC_DEFINED
#  include "sse2neon.h"
/* Since we emulate SSE/2/4 with NEON, let's make sure __SSE*__ is defined */
#  if !defined(__SSE__)
#    define __SSE__ 1
#  endif
#  if !defined(__SSE2__)
#    define __SSE2__ 1
#  endif
#  if !defined(__SSE4_1__)
#    define __SSE4_1__ 1
#  endif
#else
#  include <immintrin.h>
#endif
/* MSVC does not define __SSE__ nor __SSE2__... */
#if !defined(__SSE__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1))
#  define __SSE__ 1
#endif
#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#  define __SSE2__ 1
/* MSVC does not define __SSE4_1__ either, but if AVX is available, SSE 4.1 should be as well... */
#  if !defined(__SSE4_1__) && defined(__AVX__)
#    define __SSE4_1__ 1
#  endif
#endif

#if defined(_M_X64) || defined(__x86_64__)
#  define INTPTR_T long long
#else
#  define INTPTR_T long
#endif

#include "opj_includes.h"

#if defined(__MSVC_VER__) || defined(_MSC_VER)
#  pragma warning(disable : 4311)
#endif
/* <summary> */
/* This table contains the norms of the basis functions of the reversible MCT. */
/* </summary> */
static const double mct_norms[3] = { 1.732, .8292, .8292 };

/* <summary> */
/* This table contains the norms of the basis functions of the irreversible MCT. */
/* </summary> */
static const double mct_norms_real[3] = { 1.732, 1.805, 1.573 };
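
/*
 * Note (illustrative derivation, not part of the original source): these values
 * appear to be the L2 norms of the columns of the corresponding inverse
 * transform. For the reversible MCT, the inverse maps (Y,U,V) to
 * R = Y - U/4 + 3V/4, G = Y - U/4 - V/4, B = Y + 3U/4 - V/4, so the Y column
 * (1,1,1) has norm sqrt(3) ~= 1.732 and the U and V columns each have norm
 * sqrt(11)/4 ~= 0.8292. For the irreversible MCT, the inverse columns
 * (1,1,1), (0,-0.34413,1.772) and (1.402,-0.71414,0) have norms of roughly
 * 1.732, 1.805 and 1.573 respectively.
 */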
/* <summary> */
/* Forward reversible MCT. */
/* </summary> */
#ifdef __SSE2__
void mct_encode(int* restrict c0, int* restrict c1, int* restrict c2, int n) {
    int i = 0;
    /* Buffers are normally aligned on 16 bytes... */
    if (((INTPTR_T)c0 & 0xf) == 0 && ((INTPTR_T)c1 & 0xf) == 0 && ((INTPTR_T)c2 & 0xf) == 0) {
        const int cnt = n & ~3U;
        for ( ; i < cnt; i += 4) {
            __m128i y, u, v;
            __m128i r = _mm_load_si128((const __m128i*)&(c0[i]));
            __m128i g = _mm_load_si128((const __m128i*)&(c1[i]));
            __m128i b = _mm_load_si128((const __m128i*)&(c2[i]));
            y = _mm_add_epi32(g, g);
            y = _mm_add_epi32(y, b);
            y = _mm_add_epi32(y, r);
            y = _mm_srai_epi32(y, 2);
            u = _mm_sub_epi32(b, g);
            v = _mm_sub_epi32(r, g);
            _mm_store_si128((__m128i*)&(c0[i]), y);
            _mm_store_si128((__m128i*)&(c1[i]), u);
            _mm_store_si128((__m128i*)&(c2[i]), v);
        }
    }
    for ( ; i < n; ++i) {
        int r = c0[i];
        int g = c1[i];
        int b = c2[i];
        int y = (r + g + g + b) >> 2;
        int u = b - g;
        int v = r - g;
        c0[i] = y;
        c1[i] = u;
        c2[i] = v;
    }
}
#else
void mct_encode(int* restrict c0, int* restrict c1, int* restrict c2, int n) {
    int i;
    for (i = 0; i < n; ++i) {
        int r = c0[i];
        int g = c1[i];
        int b = c2[i];
        int y = (r + g + g + b) >> 2;
        int u = b - g;
        int v = r - g;
        c0[i] = y;
        c1[i] = u;
        c2[i] = v;
    }
}
#endif
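
/*
 * Worked example (illustrative, not part of the original source): for a pixel
 * with (R,G,B) = (100,150,200), the forward reversible MCT gives
 * Y = (100 + 2*150 + 200) >> 2 = 150, U = 200 - 150 = 50, V = 100 - 150 = -50.
 */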
/* <summary> */
/* Inverse reversible MCT. */
/* </summary> */
#ifdef __SSE2__
void mct_decode(int* restrict c0, int* restrict c1, int* restrict c2, int n) {
    int i = 0;
    /* Buffers are normally aligned on 16 bytes... */
    if (((INTPTR_T)c0 & 0xf) == 0 && ((INTPTR_T)c1 & 0xf) == 0 && ((INTPTR_T)c2 & 0xf) == 0) {
        const int cnt = n & ~3U;
        for ( ; i < cnt; i += 4) {
            __m128i r, g, b;
            __m128i y = _mm_load_si128((const __m128i*)&(c0[i]));
            __m128i u = _mm_load_si128((const __m128i*)&(c1[i]));
            __m128i v = _mm_load_si128((const __m128i*)&(c2[i]));
            g = y;
            g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2));
            r = _mm_add_epi32(v, g);
            b = _mm_add_epi32(u, g);
            _mm_store_si128((__m128i*)&(c0[i]), r);
            _mm_store_si128((__m128i*)&(c1[i]), g);
            _mm_store_si128((__m128i*)&(c2[i]), b);
        }
    }
    for ( ; i < n; ++i) {
        int y = c0[i];
        int u = c1[i];
        int v = c2[i];
        int g = y - ((u + v) >> 2);
        int r = v + g;
        int b = u + g;
        c0[i] = r;
        c1[i] = g;
        c2[i] = b;
    }
}
#else
void mct_decode(int* restrict c0, int* restrict c1, int* restrict c2, int n) {
    int i;
    for (i = 0; i < n; ++i) {
        int y = c0[i];
        int u = c1[i];
        int v = c2[i];
        int g = y - ((u + v) >> 2);
        int r = v + g;
        int b = u + g;
        c0[i] = r;
        c1[i] = g;
        c2[i] = b;
    }
}
#endif
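
/*
 * Worked example (illustrative, continuing the one above): feeding
 * (Y,U,V) = (150,50,-50) back through the inverse gives
 * G = 150 - ((50 + -50) >> 2) = 150, R = -50 + 150 = 100, B = 50 + 150 = 200,
 * i.e. the original (100,150,200) is recovered exactly, as expected for the
 * reversible transform.
 */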
/* <summary> */
/* Get norm of basis function of reversible MCT. */
/* </summary> */
double mct_getnorm(int compno) {
    return mct_norms[compno];
}
/* <summary> */
/* Forward irreversible MCT. */
/* </summary> */
#ifdef __SSE4_1__
void mct_encode_real(int* restrict c0, int* restrict c1, int* restrict c2, int n) {
    int i = 0;
    /* Buffers are normally aligned on 16 bytes... */
    if (((INTPTR_T)c0 & 0xf) == 0 && ((INTPTR_T)c1 & 0xf) == 0 && ((INTPTR_T)c2 & 0xf) == 0) {
        const int cnt = n & ~3U;
        const __m128i ry = _mm_set1_epi32(2449);
        const __m128i gy = _mm_set1_epi32(4809);
        const __m128i by = _mm_set1_epi32(934);
        const __m128i ru = _mm_set1_epi32(1382);
        const __m128i gu = _mm_set1_epi32(2714);
        const __m128i gv = _mm_set1_epi32(3430);
        const __m128i bv = _mm_set1_epi32(666);
        const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096),
                                                   _MM_SHUFFLE(1, 0, 1, 0));
        for ( ; i < cnt; i += 4) {
            __m128i lo, hi, y, u, v;
            __m128i r = _mm_load_si128((const __m128i*)&(c0[i]));
            __m128i g = _mm_load_si128((const __m128i*)&(c1[i]));
            __m128i b = _mm_load_si128((const __m128i*)&(c2[i]));
            hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(r, ry);
            hi = _mm_mul_epi32(hi, ry);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            y = _mm_blend_epi16(lo, hi, 0xCC);
            hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(g, gy);
            hi = _mm_mul_epi32(hi, gy);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
            hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(b, by);
            hi = _mm_mul_epi32(hi, by);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC));
            _mm_store_si128((__m128i*)&(c0[i]), y);
            lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0)));
            hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1)));
            lo = _mm_slli_epi64(lo, 12);
            hi = _mm_slli_epi64(hi, 12);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            u = _mm_blend_epi16(lo, hi, 0xCC);
            hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(r, ru);
            hi = _mm_mul_epi32(hi, ru);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
            hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(g, gu);
            hi = _mm_mul_epi32(hi, gu);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC));
            _mm_store_si128((__m128i*)&(c1[i]), u);
            lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0)));
            hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1)));
            lo = _mm_slli_epi64(lo, 12);
            hi = _mm_slli_epi64(hi, 12);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            v = _mm_blend_epi16(lo, hi, 0xCC);
            hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(g, gv);
            hi = _mm_mul_epi32(hi, gv);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
            hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));
            lo = _mm_mul_epi32(b, bv);
            hi = _mm_mul_epi32(hi, bv);
            lo = _mm_add_epi64(lo, mulround);
            hi = _mm_add_epi64(hi, mulround);
            lo = _mm_srli_epi64(lo, 13);
            hi = _mm_slli_epi64(hi, 32 - 13);
            v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC));
            _mm_store_si128((__m128i*)&(c2[i]), v);
        }
    }
    for ( ; i < n; ++i) {
        int r = c0[i];
        int g = c1[i];
        int b = c2[i];
        int y = fix_mul(r, 2449) + fix_mul(g, 4809) + fix_mul(b, 934);
        int u = -fix_mul(r, 1382) - fix_mul(g, 2714) + fix_mul(b, 4096);
        int v = fix_mul(r, 4096) - fix_mul(g, 3430) - fix_mul(b, 666);
        c0[i] = y;
        c1[i] = u;
        c2[i] = v;
    }
}
#else
void mct_encode_real(int* restrict c0, int* restrict c1, int* restrict c2, int n) {
    int i;
    for (i = 0; i < n; ++i) {
        int r = c0[i];
        int g = c1[i];
        int b = c2[i];
        int y = fix_mul(r, 2449) + fix_mul(g, 4809) + fix_mul(b, 934);
        int u = -fix_mul(r, 1382) - fix_mul(g, 2714) + fix_mul(b, 4096);
        int v = fix_mul(r, 4096) - fix_mul(g, 3430) - fix_mul(b, 666);
        c0[i] = y;
        c1[i] = u;
        c2[i] = v;
    }
}
#endif
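
/*
 * Note (illustrative, based on the constants above): the integer coefficients
 * are the irreversible (ICT/YCbCr) weights in Q13 fixed point, i.e. scaled by
 * 8192: 2449/8192 ~= 0.299, 4809/8192 ~= 0.587, 934/8192 ~= 0.114,
 * 1382/8192 ~= 0.1687, 2714/8192 ~= 0.3313, 4096/8192 = 0.5,
 * 3430/8192 ~= 0.4187 and 666/8192 ~= 0.0813. The SSE path computes each
 * product in 64 bits, adds a rounding term of 4096 (half of 8192) and shifts
 * right by 13, which presumably matches the rounding scheme of the scalar
 * fix_mul() helper defined elsewhere in the library.
 */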
/* <summary> */
/* Inverse irreversible MCT. */
/* </summary> */
void mct_decode_real(float* restrict c0, float* restrict c1, float* restrict c2, int n) {
    int i;
#ifdef __SSE__
    int count = n >> 3;
    __m128 vrv, vgu, vgv, vbu;
    vrv = _mm_set1_ps(1.402f);
    vgu = _mm_set1_ps(0.34413f);
    vgv = _mm_set1_ps(0.71414f);
    vbu = _mm_set1_ps(1.772f);
    for (i = 0; i < count; ++i) {
        __m128 vy, vu, vv;
        __m128 vr, vg, vb;
        vy = _mm_load_ps(c0);
        vu = _mm_load_ps(c1);
        vv = _mm_load_ps(c2);
        vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
        vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
        vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
        _mm_store_ps(c0, vr);
        _mm_store_ps(c1, vg);
        _mm_store_ps(c2, vb);
        c0 += 4;
        c1 += 4;
        c2 += 4;
        vy = _mm_load_ps(c0);
        vu = _mm_load_ps(c1);
        vv = _mm_load_ps(c2);
        vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
        vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
        vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
        _mm_store_ps(c0, vr);
        _mm_store_ps(c1, vg);
        _mm_store_ps(c2, vb);
        c0 += 4;
        c1 += 4;
        c2 += 4;
    }
    n &= 7;
#endif
    for (i = 0; i < n; ++i) {
        float y = c0[i];
        float u = c1[i];
        float v = c2[i];
        float r = y + (v * 1.402f);
        float g = y - (u * 0.34413f) - (v * 0.71414f);
        float b = y + (u * 1.772f);
        c0[i] = r;
        c1[i] = g;
        c2[i] = b;
    }
}
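
/*
 * Note (illustrative): the constants 1.402, 0.34413, 0.71414 and 1.772 are the
 * usual inverse ICT (YCbCr -> RGB) weights. Unlike the integer paths above,
 * the SSE branch here uses aligned loads and stores (_mm_load_ps/_mm_store_ps)
 * without an alignment check, so callers are expected to pass 16-byte-aligned
 * buffers.
 */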
/* <summary> */
/* Get norm of basis function of irreversible MCT. */
/* </summary> */
double mct_getnorm_real(int compno) {
    return mct_norms_real[compno];
}
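
/*
 * Usage sketch (illustrative only, not part of the original source;
 * opj_aligned_malloc/opj_aligned_free are assumed to be the 16-byte-aligned
 * allocator used elsewhere in the library):
 *
 *   int *c0 = (int*)opj_aligned_malloc(n * sizeof(int));
 *   int *c1 = (int*)opj_aligned_malloc(n * sizeof(int));
 *   int *c2 = (int*)opj_aligned_malloc(n * sizeof(int));
 *   // fill c0/c1/c2 with R/G/B samples ...
 *   mct_encode(c0, c1, c2, n);   // in place: c0/c1/c2 now hold Y/U/V
 *   mct_decode(c0, c1, c2, n);   // in place: back to R/G/B
 *   opj_aligned_free(c0); opj_aligned_free(c1); opj_aligned_free(c2);
 */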