llmemory.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. /**
  2. * @file llmemory.cpp
  3. * @brief Very special memory allocation/deallocation stuff here
  4. *
  5. * $LicenseInfo:firstyear=2002&license=viewergpl$
  6. *
  7. * Copyright (c) 2002-2009, Linden Research, Inc.
  8. *
  9. * Second Life Viewer Source Code
  10. * The source code in this file ("Source Code") is provided by Linden Lab
  11. * to you under the terms of the GNU General Public License, version 2.0
  12. * ("GPL"), unless you have obtained a separate licensing agreement
  13. * ("Other License"), formally executed by you and Linden Lab. Terms of
  14. * the GPL can be found in doc/GPL-license.txt in this distribution, or
  15. * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
  16. *
  17. * There are special exceptions to the terms and conditions of the GPL as
  18. * it is applied to this Source Code. View the full text of the exception
  19. * in the file doc/FLOSS-exception.txt in this software distribution, or
  20. * online at
  21. * http://secondlifegrid.net/programs/open_source/licensing/flossexception
  22. *
  23. * By copying, modifying or distributing this software, you acknowledge
  24. * that you have read and understood your obligations described above,
  25. * and agree to abide by those obligations.
  26. *
  27. * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
  28. * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
  29. * COMPLETENESS OR PERFORMANCE.
  30. * $/LicenseInfo$
  31. */
  32. #include "linden_common.h"
  33. #include "llmemory.h"
  34. #include "llsys.h"
  35. #if LL_JEMALLOC
  36. # include "jemalloc/jemalloc.h"
  37. #elif LL_MIMALLOC && LL_WINDOWS
  38. # include "mimalloc/mimalloc-new-delete.h"
  39. #endif
  40. #if LL_DARWIN
  41. # include <sys/types.h>
  42. # include <sys/sysctl.h>
  43. # include <mach/task.h>
  44. # include <mach/mach.h>
  45. # include <mach/mach_init.h>
  46. # include <mach/mach_host.h>
  47. # include <unistd.h>
  48. #elif LL_LINUX
  49. # include <malloc.h>
  50. # include <sys/resource.h>
  51. # include <unistd.h>
  52. #elif LL_WINDOWS
  53. # include <malloc.h>
  54. # include <psapi.h>
  55. #endif
  56. void* gReservedFreeSpace = NULL;
  57. //static
  58. U32 LLMemory::sMaxPhysicalMemInKB = 0;
  59. U32 LLMemory::sMaxVirtualMemInKB = 0;
  60. U32 LLMemory::sAvailPhysicalMemInKB = U32_MAX;
  61. U32 LLMemory::sAvailVirtualMemInKB = U32_MAX;
  62. U32 LLMemory::sAllocatedMemInKB = 0;
  63. U32 LLMemory::sAllocatedPageSizeInKB = 0;
  64. bool LLMemory::sFailedAllocation = false;
  65. bool LLMemory::sFailedAllocationOnce = false;
  66. #if LL_LINUX && !LL_JEMALLOC
  67. // Stats: number of successful malloc timming.
  68. static U32 sTrimmed = 0;
  69. #endif
  #if LL_DEBUG
  // Debug builds only: reports a failed alignment check via the llerrs stream.
  void ll_assert_aligned_error()
  {
      llerrs << "Alignment check failed !" << llendl;
  }
  #endif
  76. //static
  77. void LLMemory::initClass()
  78. {
  79. #if LL_JEMALLOC
  80. unsigned int oval, nval = 0;
  81. size_t osz = sizeof(oval);
  82. size_t nsz = sizeof(nval);
  83. mallctl("thread.arena", &oval, &osz, &nval, nsz);
  84. #endif
  85. if (!gReservedFreeSpace)
  86. {
  87. // Reserve some space that we will free() on crash to try and avoid out
  88. // of memory conditions while dumping the stack trace log... 256Kb
  89. // should be plenty enough.
  90. gReservedFreeSpace = malloc(262144);
  91. }
  92. }
  93. //static
  94. void LLMemory::cleanupClass()
  95. {
  96. if (gReservedFreeSpace)
  97. {
  98. free(gReservedFreeSpace);
  99. gReservedFreeSpace = NULL;
  100. }
  101. }
  102. //static
  103. void LLMemory::allocationFailed(size_t size)
  104. {
  105. sFailedAllocation = sFailedAllocationOnce = true;
  106. if (size > 0)
  107. {
  108. llwarns << "Memory allocation failure for size: " << size << llendl;
  109. }
  110. }
  111. //static
  112. void LLMemory::updateMemoryInfo(bool trim_heap)
  113. {
  114. // jemalloc v5.0+ does properly release memory to the system, and v5.2+
  115. // does redirect new and delete calls to its own allocators, so there is
  116. // no need for trimming with the system malloc() here. HB
  117. #if !LL_JEMALLOC
  118. if (trim_heap)
  119. {
  120. // Trim the heap down from freed memory so that we can compute the
  121. // actual available virtual space. HB
  122. // *TODO: implement heap trimming for macOS, if at all possible (if
  123. // not, try and use jemalloc with macOS ?)...
  124. # if LL_LINUX
  125. constexpr size_t keep = 100 * 1024 * 1024; // Trim all but 100Mb
  126. sTrimmed += malloc_trim(keep);
  127. # elif LL_WINDOWS
  128. _heapmin();
  129. # endif
  130. }
  131. #endif
  132. getMaxMemoryKB(sMaxPhysicalMemInKB, sMaxVirtualMemInKB);
  133. getAvailableMemoryKB(sAvailPhysicalMemInKB, sAvailVirtualMemInKB);
  134. #if LL_WINDOWS
  135. HANDLE self = GetCurrentProcess();
  136. PROCESS_MEMORY_COUNTERS counters;
  137. if (!GetProcessMemoryInfo(self, &counters, sizeof(counters)))
  138. {
  139. llwarns << "GetProcessMemoryInfo failed" << llendl;
  140. sAllocatedPageSizeInKB = sMaxVirtualMemInKB - sAvailVirtualMemInKB;
  141. sAllocatedMemInKB = 0;
  142. }
  143. else
  144. {
  145. sAllocatedPageSizeInKB = (U32)(counters.PagefileUsage / 1024);
  146. sAllocatedMemInKB = (U32)(counters.WorkingSetSize / 1024);
  147. }
  148. #elif LL_DARWIN
  149. task_vm_info info;
  150. mach_msg_type_number_t info_count = TASK_VM_INFO_COUNT;
  151. // MACH_TASK_BASIC_INFO reports the same resident_size, but does not tell
  152. // us the reusable bytes or phys_footprint.
  153. if (task_info(mach_task_self(), TASK_VM_INFO,
  154. reinterpret_cast<task_info_t>(&info),
  155. &info_count) == KERN_SUCCESS)
  156. {
  157. sAllocatedPageSizeInKB = U32(info.resident_size) / 1024;
  158. sAllocatedMemInKB = U32(info.resident_size - info.reusable) / 1024;
  159. }
  160. else
  161. {
  162. sAllocatedPageSizeInKB = sMaxVirtualMemInKB - sAvailVirtualMemInKB;
  163. sAllocatedMemInKB = (U32)(LLMemory::getCurrentRSS() / 1024);
  164. }
  165. #else
  166. sAllocatedPageSizeInKB = sMaxVirtualMemInKB - sAvailVirtualMemInKB;
  167. sAllocatedMemInKB = (U32)(LLMemory::getCurrentRSS() / 1024);
  168. #endif
  169. }
  170. #if LL_JEMALLOC
  // Upper bound (terminating character included) for the text accumulated by
  // jemalloc_write_cb().
  #define JEMALLOC_STATS_STRING_SIZE 262144

  // Write callback: appends 's' to the std::string pointed to by 'data',
  // silently dropping whatever would make the string reach
  // JEMALLOC_STATS_STRING_SIZE characters. Null arguments are ignored.
  static void jemalloc_write_cb(void* data, const char* s)
  {
      if (!data || !s)
      {
          return;
      }

      std::string& out = *static_cast<std::string*>(data);
      const size_t used = out.length();
      size_t count = strlen(s);
      if (used + count >= JEMALLOC_STATS_STRING_SIZE)
      {
          // Keep only what still fits below the size limit.
          count = JEMALLOC_STATS_STRING_SIZE - used - 1;
      }
      if (count != 0)
      {
          out.append(s, count);
      }
  }
  189. #endif
  190. //static
  191. void LLMemory::logMemoryInfo()
  192. {
  193. updateMemoryInfo();
  194. #if LL_JEMALLOC
  195. unsigned int arenas, arena;
  196. size_t sz = sizeof(arenas);
  197. mallctl("opt.narenas", &arenas, &sz, NULL, 0);
  198. U32 opt_arenas = arenas;
  199. mallctl("arenas.narenas", &arenas, &sz, NULL, 0);
  200. mallctl("thread.arena", &arena, &sz, NULL, 0);
  201. llinfos << "jemalloc initialized with " << opt_arenas
  202. << " arenas, using now " << arenas
  203. << " arenas, main thread using arena " << arena << "." << llendl;
  204. bool stats_enabled = false;
  205. sz = sizeof(stats_enabled);
  206. mallctl("config.stats", &stats_enabled, &sz, NULL, 0);
  207. if (stats_enabled)
  208. {
  209. std::string malloc_stats_str;
  210. // IMPORTANT: we cannot reserve memory during jemalloc_write_cb() call
  211. // by malloc_stats_print(), so we reserve a fixed string buffer.
  212. malloc_stats_str.reserve(JEMALLOC_STATS_STRING_SIZE);
  213. malloc_stats_print(jemalloc_write_cb, &malloc_stats_str, NULL);
  214. llinfos << "jemalloc stats:\n" << malloc_stats_str << llendl;
  215. }
  216. #endif
  217. llinfos << "System memory information: Max physical memory: "
  218. << sMaxPhysicalMemInKB << "KB - Allocated physical memory: "
  219. << sAllocatedMemInKB << "KB - Available physical memory: "
  220. << sAvailPhysicalMemInKB << "KB - Allocated virtual memory: "
  221. << sAllocatedPageSizeInKB << "KB"
  222. #if LL_LINUX && !LL_JEMALLOC
  223. << " - Number of actual glibc malloc() heap trimming occurrences: "
  224. << sTrimmed
  225. #endif
  226. << llendl;
  227. }
  228. #if LL_WINDOWS
  229. U64 LLMemory::getCurrentRSS()
  230. {
  231. HANDLE self = GetCurrentProcess();
  232. PROCESS_MEMORY_COUNTERS counters;
  233. if (!GetProcessMemoryInfo(self, &counters, sizeof(counters)))
  234. {
  235. llwarns_once << "GetProcessMemoryInfo() failed !" << llendl;
  236. return 0;
  237. }
  238. return counters.WorkingSetSize;
  239. }
  240. #elif LL_DARWIN
  241. U64 LLMemory::getCurrentRSS()
  242. {
  243. U64 resident_size = 0;
  244. mach_task_basic_info_data_t basic_info;
  245. mach_msg_type_number_t basic_info_count = MACH_TASK_BASIC_INFO_COUNT;
  246. if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
  247. (task_info_t)&basic_info, &basic_info_count) == KERN_SUCCESS)
  248. {
  249. resident_size = basic_info.resident_size;
  250. }
  251. else
  252. {
  253. llwarns_once << "task_info() failed !" << llendl;
  254. }
  255. return resident_size;
  256. }
  257. #elif LL_LINUX
  258. U64 LLMemory::getCurrentRSS()
  259. {
  260. struct rusage usage;
  261. if (getrusage(RUSAGE_SELF, &usage))
  262. {
  263. llwarns_once << "getrusage() failed !" << llendl;
  264. return 0;
  265. }
  266. return usage.ru_maxrss * 1024;
  267. }
  268. #else
  269. U64 LLMemory::getCurrentRSS()
  270. {
  271. return 0;
  272. }
  273. #endif
  274. ///////////////////////////////////////////////////////////////////////////////
  275. // The following methods used to be part of LLMemoryInfo (in llsys.h/cpp),
  276. // which I removed to merge here, since they did not even relate with global
  277. // system info, but instead with the memory consumption of the viewer itself...
  278. // HB
  279. //static
  280. U32 LLMemory::getPhysicalMemoryKB()
  281. {
  282. #if LL_WINDOWS
  283. MEMORYSTATUSEX state;
  284. state.dwLength = sizeof(state);
  285. GlobalMemoryStatusEx(&state);
  286. U32 amount = state.ullTotalPhys >> 10;
  287. // *HACK: for some reason, the reported amount of memory is always wrong.
  288. // The original adjustment assumes it is always off by one meg, however
  289. // errors of as much as 2520 KB have been observed in the value returned
  290. // from the GetMemoryStatusEx function. Here we keep the original
  291. // adjustment from llfoaterabout.cpp until this can be fixed somehow. HB
  292. amount += 1024;
  293. return amount;
  294. #elif LL_DARWIN
  295. // This might work on Linux as well. Someone check...
  296. uint64_t phys = 0;
  297. int mib[2] = { CTL_HW, HW_MEMSIZE };
  298. size_t len = sizeof(phys);
  299. sysctl(mib, 2, &phys, &len, NULL, 0);
  300. return (U32)(phys >> 10);
  301. #elif LL_LINUX
  302. U64 phys = (U64)(sysconf(_SC_PAGESIZE)) * (U64)(sysconf(_SC_PHYS_PAGES));
  303. return (U32)(phys >> 10);
  304. #else
  305. return 0;
  306. #endif
  307. }
  308. //static
  309. void LLMemory::getMaxMemoryKB(U32& max_physical_mem_kb,
  310. U32& max_virtual_mem_kb)
  311. {
  312. static U32 saved_max_physical_mem_kb = 0;
  313. static U32 saved_max_virtual_mem_kb = 0;
  314. if (saved_max_virtual_mem_kb)
  315. {
  316. max_physical_mem_kb = saved_max_physical_mem_kb;
  317. max_virtual_mem_kb = saved_max_virtual_mem_kb;
  318. return;
  319. }
  320. U32 addressable = U32_MAX; // No limit...
  321. #if LL_WINDOWS
  322. MEMORYSTATUSEX state;
  323. state.dwLength = sizeof(state);
  324. GlobalMemoryStatusEx(&state);
  325. max_physical_mem_kb = (U32)(state.ullAvailPhys / 1024);
  326. U32 total_virtual_memory = (U32)(state.ullTotalVirtual / 1024);
  327. max_virtual_mem_kb = total_virtual_memory;
  328. #elif LL_LINUX || LL_DARWIN
  329. max_physical_mem_kb = getPhysicalMemoryKB();
  330. max_virtual_mem_kb = addressable;
  331. #else
  332. max_physical_mem_kb = max_virtual_mem_kb = addressable;
  333. #endif
  334. max_virtual_mem_kb = llmin(max_virtual_mem_kb, addressable);
  335. #if LL_WINDOWS
  336. LL_DEBUGS("Memory") << "Total physical memory: "
  337. << max_physical_mem_kb / 1024
  338. << "Mb - Total available virtual memory: "
  339. << total_virtual_memory / 1024
  340. << "Mb - Retained max virtual memory: "
  341. << max_virtual_mem_kb / 1024 << "Mb" << LL_ENDL;
  342. #else
  343. LL_DEBUGS("Memory") << "Total physical memory: "
  344. << max_physical_mem_kb / 1024
  345. << "Mb - Retained max virtual memory: "
  346. << max_virtual_mem_kb / 1024 << "Mb" << LL_ENDL;
  347. #endif
  348. saved_max_physical_mem_kb = max_physical_mem_kb;
  349. saved_max_virtual_mem_kb = max_virtual_mem_kb;
  350. }
  #if LL_LINUX || LL_DARWIN
  // Returns the size, in kilobytes, used by the callers as the "virtual size"
  // of the current process, or 0 when it cannot be determined.
  // - Linux: sum of the VmRSS, VmStk, VmExe, VmLib and VmPTE entries found in
  //   /proc/self/status.
  // - macOS: virtual_size as reported by task_info(TASK_BASIC_INFO).
  static U32 get_process_virtual_size_kb()
  {
      U32 virtual_size = 0;
  # if LL_LINUX
      LLFILE* status_filep = LLFile::open("/proc/self/status", "rb");
      if (status_filep)
      {
          constexpr S32 STATUS_SIZE = 8192;
          char buff[STATUS_SIZE];
          size_t nbytes = fread(buff, 1, STATUS_SIZE - 1, status_filep);
          buff[nbytes] = '\0';
          LLFile::close(status_filep);

          // All these guys return numbers in KB
          static const char* const FIELDS[] =
          {
              "VmRSS:", "VmStk:", "VmExe:", "VmLib:", "VmPTE:"
          };
          for (const char* field : FIELDS)
          {
              const char* memp = strstr(buff, field);
              if (!memp)
              {
                  continue;
              }
              // Only account for values that actually got parsed, so that a
              // malformed entry cannot cause the former one to be added twice.
              U32 value_kb = 0;
              if (sscanf(memp, "%*s %u", &value_kb) == 1)
              {
                  virtual_size += value_kb;
              }
          }
      }
  # elif LL_DARWIN
      struct task_basic_info t_info;
      mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
      if (KERN_SUCCESS != task_info(mach_task_self(), TASK_BASIC_INFO,
                                    (task_info_t)&t_info, &t_info_count))
      {
          return 0;
      }
      virtual_size = t_info.virtual_size / 1024;
  # endif
      return virtual_size;
  }
  #endif // LL_LINUX || LL_DARWIN
  410. //static
  411. void LLMemory::getAvailableMemoryKB(U32& avail_physical_mem_kb,
  412. U32& avail_virtual_mem_kb)
  413. {
  414. U32 max_physical_mem_kb, max_virtual_mem_kb;
  415. getMaxMemoryKB(max_physical_mem_kb, max_virtual_mem_kb);
  416. #if LL_DARWIN
  417. // Total installed and available physical memory are properties of the
  418. // host, not just our process.
  419. vm_statistics64_data_t vmstat;
  420. mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
  421. mach_port_t host = mach_host_self();
  422. vm_size_t page_size;
  423. host_page_size(host, &page_size);
  424. kern_return_t result =
  425. host_statistics64(host, HOST_VM_INFO64,
  426. reinterpret_cast<host_info_t>(&vmstat), &count);
  427. if (result == KERN_SUCCESS)
  428. {
  429. avail_physical_mem_kb = vmstat.free_count * page_size / 1024;
  430. }
  431. else
  432. #endif
  433. {
  434. avail_physical_mem_kb = max_physical_mem_kb;
  435. }
  436. #if LL_WINDOWS
  437. MEMORYSTATUSEX state;
  438. state.dwLength = sizeof(state);
  439. GlobalMemoryStatusEx(&state);
  440. avail_virtual_mem_kb = (U32)(state.ullAvailVirtual / 1024);
  441. LL_DEBUGS("Memory") << "Memory check: reported available virtual space: "
  442. << avail_virtual_mem_kb / 1024 << "Mb" << LL_ENDL;
  443. #else
  444. U32 virtual_size_kb = get_process_virtual_size_kb();
  445. if (virtual_size_kb < max_virtual_mem_kb)
  446. {
  447. avail_virtual_mem_kb = max_virtual_mem_kb - virtual_size_kb;
  448. }
  449. else
  450. {
  451. avail_virtual_mem_kb = 0;
  452. }
  453. LL_DEBUGS("Memory") << "Memory check: Retained available virtual space: "
  454. << avail_virtual_mem_kb / 1024 << "Mb" << LL_ENDL;
  455. #endif
  456. }
  457. //static
  458. std::string LLMemory::getInfo()
  459. {
  460. std::ostringstream s;
  461. #if LL_WINDOWS
  462. MEMORYSTATUSEX state;
  463. state.dwLength = sizeof(state);
  464. GlobalMemoryStatusEx(&state);
  465. s << "Percent Memory use: " << (U32)state.dwMemoryLoad << '%' << std::endl;
  466. s << "Total Physical KB: " << (U32)(state.ullTotalPhys / 1024) << std::endl;
  467. s << "Avail Physical KB: " << (U32)(state.ullAvailPhys / 1024) << std::endl;
  468. s << "Total page KB: " << (U32)(state.ullTotalPageFile / 1024) << std::endl;
  469. s << "Avail page KB: " << (U32)(state.ullAvailPageFile / 1024) << std::endl;
  470. s << "Total Virtual KB: " << (U32)(state.ullTotalVirtual / 1024) << std::endl;
  471. s << "Avail Virtual KB: " << (U32)(state.ullAvailVirtual / 1024) << std::endl;
  472. #elif LL_DARWIN
  473. uint64_t phys = 0;
  474. size_t len = sizeof(phys);
  475. if (sysctlbyname("hw.memsize", &phys, &len, NULL, 0) == 0)
  476. {
  477. s << "Total Physical KB: " << phys / 1024 << std::endl;
  478. }
  479. else
  480. {
  481. s << "Unable to collect memory information";
  482. }
  483. #else
  484. // *NOTE: This works on Linux. What will it do on other systems ?
  485. static const char MEMINFO_FILE[] = "/proc/meminfo";
  486. LLFILE* meminfo = LLFile::open(MEMINFO_FILE, "rb");
  487. if (meminfo)
  488. {
  489. char line[MAX_STRING];
  490. memset(line, 0, MAX_STRING);
  491. while (fgets(line, MAX_STRING, meminfo))
  492. {
  493. line[strlen(line) - 1] = ' ';
  494. s << line;
  495. }
  496. LLFile::close(meminfo);
  497. }
  498. else
  499. {
  500. s << "Unable to collect memory information";
  501. }
  502. #endif
  503. return s.str();
  504. }
  505. // End of former LLMemoryInfo methods
  506. ///////////////////////////////////////////////////////////////////////////////
  // Rounds 'address' up to the next multiple of 64 bytes; an address that is
  // already 64 bytes aligned is returned unchanged.
  template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
  {
      constexpr uintptr_t ALIGN_MASK = 0x3F;
      uintptr_t bits = reinterpret_cast<uintptr_t>(address);
      bits = (bits + ALIGN_MASK) & ~ALIGN_MASK;
      return reinterpret_cast<T*>(bits);
  }
  512. // Used to be force-inlined in llmemory.h, but does not really deserve it. HB
  513. void ll_memcpy_nonaliased_aligned_16(char* __restrict dst,
  514. const char* __restrict src, size_t bytes)
  515. {
  516. llassert(src != NULL && dst != NULL);
  517. llassert(bytes > 0 && bytes % sizeof(F32) == 0 && bytes % 16 == 0);
  518. llassert(src < dst ? src + bytes <= dst : dst + bytes <= src);
  519. ll_assert_aligned(src, 16);
  520. ll_assert_aligned(dst, 16);
  521. char* end = dst + bytes;
  522. if (bytes > 64)
  523. {
  524. // Find start of 64 bytes-aligned area within block
  525. void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
  526. // At least 64 bytes before the end of the destination, switch to 16
  527. // bytes copies
  528. void* end_64 = end - 64;
  529. // Prefetch the head of the 64 bytes area now
  530. _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
  531. _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
  532. _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
  533. _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
  534. // Copy 16 bytes chunks until we're 64 bytes aligned
  535. while (dst < begin_64)
  536. {
  537. _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
  538. dst += 16;
  539. src += 16;
  540. }
  541. // Copy 64 bytes chunks up to your tail
  542. //
  543. // Might be good to shmoo the 512b prefetch offset (characterize
  544. // performance for various values)
  545. while (dst < end_64)
  546. {
  547. _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
  548. _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
  549. _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
  550. _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
  551. _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
  552. _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
  553. dst += 64;
  554. src += 64;
  555. }
  556. }
  557. // Copy remainder 16 bytes tail chunks (or ALL 16 bytes chunks for
  558. // sub-64 bytes copies)
  559. while (dst < end)
  560. {
  561. _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
  562. dst += 16;
  563. src += 16;
  564. }
  565. }