adaptive_merge.hpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. //////////////////////////////////////////////////////////////////////////////
  2. //
  3. // (C) Copyright Ion Gaztanaga 2015-2016.
  4. // Distributed under the Boost Software License, Version 1.0.
  5. // (See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. // See http://www.boost.org/libs/move for documentation.
  9. //
  10. //////////////////////////////////////////////////////////////////////////////
  11. #ifndef BOOST_MOVE_ADAPTIVE_MERGE_HPP
  12. #define BOOST_MOVE_ADAPTIVE_MERGE_HPP
  13. #include <boost/move/detail/config_begin.hpp>
  14. #include <boost/move/algo/detail/adaptive_sort_merge.hpp>
  15. #include <cassert>
  16. #if defined(BOOST_CLANG) || (defined(BOOST_GCC) && (BOOST_GCC >= 40600))
  17. #pragma GCC diagnostic push
  18. #pragma GCC diagnostic ignored "-Wsign-conversion"
  19. #pragma GCC diagnostic ignored "-Wconversion"
  20. #endif
  21. namespace boost {
  22. namespace movelib {
  23. ///@cond
  24. namespace detail_adaptive {
  25. template<class RandIt, class Compare, class XBuf>
  26. inline void adaptive_merge_combine_blocks( RandIt first
  27. , typename iter_size<RandIt>::type len1
  28. , typename iter_size<RandIt>::type len2
  29. , typename iter_size<RandIt>::type collected
  30. , typename iter_size<RandIt>::type n_keys
  31. , typename iter_size<RandIt>::type l_block
  32. , bool use_internal_buf
  33. , bool xbuf_used
  34. , Compare comp
  35. , XBuf & xbuf
  36. )
  37. {
  38. typedef typename iter_size<RandIt>::type size_type;
  39. size_type const len = size_type(len1+len2);
  40. size_type const l_combine = size_type(len-collected);
  41. size_type const l_combine1 = size_type(len1-collected);
  42. if(n_keys){
  43. RandIt const first_data = first+collected;
  44. RandIt const keys = first;
  45. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2(" A combine: ", len);
  46. if(xbuf_used){
  47. if(xbuf.size() < l_block){
  48. xbuf.initialize_until(l_block, *first);
  49. }
  50. assert(xbuf.size() >= l_block);
  51. size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
  52. combine_params( keys, comp, l_combine
  53. , l_combine1, l_block, xbuf
  54. , n_block_a, n_block_b, l_irreg1, l_irreg2); //Outputs
  55. op_merge_blocks_with_buf
  56. (keys, comp, first_data, l_block, l_irreg1, n_block_a, n_block_b, l_irreg2, comp, move_op(), xbuf.data());
  57. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1(" A mrg xbf: ", len);
  58. }
  59. else{
  60. size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
  61. combine_params( keys, comp, l_combine
  62. , l_combine1, l_block, xbuf
  63. , n_block_a, n_block_b, l_irreg1, l_irreg2); //Outputs
  64. if(use_internal_buf){
  65. op_merge_blocks_with_buf
  66. ( keys, comp, first_data, l_block, l_irreg1, n_block_a, n_block_b
  67. , l_irreg2, comp, swap_op(), first_data-l_block);
  68. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2(" A mrg buf: ", len);
  69. }
  70. else{
  71. merge_blocks_bufferless
  72. (keys, comp, first_data, l_block, l_irreg1, n_block_a, n_block_b, l_irreg2, comp);
  73. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1(" A mrg nbf: ", len);
  74. }
  75. }
  76. }
  77. else{
  78. xbuf.shrink_to_fit(l_block);
  79. if(xbuf.size() < l_block){
  80. xbuf.initialize_until(l_block, *first);
  81. }
  82. size_type *const uint_keys = xbuf.template aligned_trailing<size_type>(l_block);
  83. size_type n_block_a, n_block_b, l_irreg1, l_irreg2;
  84. combine_params( uint_keys, less(), l_combine
  85. , l_combine1, l_block, xbuf
  86. , n_block_a, n_block_b, l_irreg1, l_irreg2, true); //Outputs
  87. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2(" A combine: ", len);
  88. assert(xbuf.size() >= l_block);
  89. op_merge_blocks_with_buf
  90. (uint_keys, less(), first, l_block, l_irreg1, n_block_a, n_block_b, l_irreg2, comp, move_op(), xbuf.data());
  91. xbuf.clear();
  92. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1(" A mrg buf: ", len);
  93. }
  94. }
  95. template<class RandIt, class Compare, class XBuf>
  96. inline void adaptive_merge_final_merge( RandIt first
  97. , typename iter_size<RandIt>::type len1
  98. , typename iter_size<RandIt>::type len2
  99. , typename iter_size<RandIt>::type collected
  100. , typename iter_size<RandIt>::type l_intbuf
  101. , typename iter_size<RandIt>::type //l_block
  102. , bool //use_internal_buf
  103. , bool xbuf_used
  104. , Compare comp
  105. , XBuf & xbuf
  106. )
  107. {
  108. typedef typename iter_size<RandIt>::type size_type;
  109. size_type n_keys = size_type(collected-l_intbuf);
  110. size_type len = size_type(len1+len2);
  111. if (!xbuf_used || n_keys) {
  112. xbuf.clear();
  113. const size_type middle = xbuf_used && n_keys ? n_keys: collected;
  114. unstable_sort(first, first + middle, comp, xbuf);
  115. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L2(" A k/b srt: ", len);
  116. stable_merge(first, first + middle, first + len, comp, xbuf);
  117. }
  118. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1(" A fin mrg: ", len);
  119. }
  120. template<class SizeType>
  121. inline static SizeType adaptive_merge_n_keys_without_external_keys(SizeType l_block, SizeType len1, SizeType len2, SizeType l_intbuf)
  122. {
  123. typedef SizeType size_type;
  124. //This is the minimum number of keys to implement the ideal algorithm
  125. size_type n_keys = size_type(len1/l_block + len2/l_block);
  126. const size_type second_half_blocks = size_type(len2/l_block);
  127. const size_type first_half_aux = size_type(len1 - l_intbuf);
  128. while(n_keys >= ((first_half_aux-n_keys)/l_block + second_half_blocks)){
  129. --n_keys;
  130. }
  131. ++n_keys;
  132. return n_keys;
  133. }
  134. template<class SizeType>
  135. inline static SizeType adaptive_merge_n_keys_with_external_keys(SizeType l_block, SizeType len1, SizeType len2, SizeType l_intbuf)
  136. {
  137. typedef SizeType size_type;
  138. //This is the minimum number of keys to implement the ideal algorithm
  139. size_type n_keys = size_type((len1-l_intbuf)/l_block + len2/l_block);
  140. return n_keys;
  141. }
  142. template<class SizeType, class Xbuf>
  143. inline SizeType adaptive_merge_n_keys_intbuf(SizeType &rl_block, SizeType len1, SizeType len2, Xbuf & xbuf, SizeType &l_intbuf_inout)
  144. {
  145. typedef SizeType size_type;
  146. size_type l_block = rl_block;
  147. size_type l_intbuf = xbuf.capacity() >= l_block ? 0u : l_block;
  148. if (xbuf.capacity() > l_block){
  149. l_block = xbuf.capacity();
  150. }
  151. //This is the minimum number of keys to implement the ideal algorithm
  152. size_type n_keys = adaptive_merge_n_keys_without_external_keys(l_block, len1, len2, l_intbuf);
  153. assert(n_keys >= ((len1-l_intbuf-n_keys)/l_block + len2/l_block));
  154. if(xbuf.template supports_aligned_trailing<size_type>
  155. ( l_block
  156. , adaptive_merge_n_keys_with_external_keys(l_block, len1, len2, l_intbuf)))
  157. {
  158. n_keys = 0u;
  159. }
  160. l_intbuf_inout = l_intbuf;
  161. rl_block = l_block;
  162. return n_keys;
  163. }
  164. // Main explanation of the merge algorithm.
  165. //
  166. // csqrtlen = ceil(sqrt(len));
  167. //
  168. // * First, csqrtlen [to be used as buffer] + (len/csqrtlen - 1) [to be used as keys] => to_collect
  169. // unique elements are extracted from elements to be sorted and placed in the beginning of the range.
  170. //
  171. // * Step "combine_blocks": the leading (len1-to_collect) elements plus trailing len2 elements
  172. // are merged with a non-trivial ("smart") algorithm to form an ordered range trailing "len-to_collect" elements.
  173. //
  174. // Explanation of the "combine_blocks" step:
  175. //
  176. // * Trailing [first+to_collect, first+len1) elements are divided in groups of cqrtlen elements.
  177. // Remaining elements that can't form a group are grouped in front of those elements.
  178. // * Trailing [first+len1, first+len1+len2) elements are divided in groups of cqrtlen elements.
  179. // Remaining elements that can't form a group are grouped in the back of those elements.
  180. // * In parallel the following two steps are performed:
  181. // * Groups are selection-sorted by first or last element (depending whether they are going
  182. // to be merged to left or right) and keys are reordered accordingly as an imitation-buffer.
  183. // * Elements of each block pair are merged using the csqrtlen buffer taking into account
  184. // if they belong to the first half or second half (marked by the key).
  185. //
  186. // * In the final merge step leading "to_collect" elements are merged with rotations
  187. // with the rest of merged elements in the "combine_blocks" step.
  188. //
  189. // Corner cases:
  190. //
  191. // * If no "to_collect" elements can be extracted:
  192. //
  193. // * If more than a minimum number of elements is extracted
  194. // then reduces the number of elements used as buffer and keys in the
  195. // and "combine_blocks" steps. If "combine_blocks" has no enough keys due to this reduction
  196. // then uses a rotation based smart merge.
  197. //
  198. // * If the minimum number of keys can't be extracted, a rotation-based merge is performed.
  199. //
  200. // * If auxiliary memory is more or equal than min(len1, len2), a buffered merge is performed.
  201. //
  202. // * If the len1 or len2 are less than 2*csqrtlen then a rotation-based merge is performed.
  203. //
  204. // * If auxiliary memory is more than csqrtlen+n_keys*sizeof(std::size_t),
  205. // then no csqrtlen need to be extracted and "combine_blocks" will use integral
  206. // keys to combine blocks.
  207. template<class RandIt, class Compare, class XBuf>
  208. void adaptive_merge_impl
  209. ( RandIt first
  210. , typename iter_size<RandIt>::type len1
  211. , typename iter_size<RandIt>::type len2
  212. , Compare comp
  213. , XBuf & xbuf
  214. )
  215. {
  216. typedef typename iter_size<RandIt>::type size_type;
  217. if(xbuf.capacity() >= min_value<size_type>(len1, len2)){
  218. buffered_merge( first, first+len1
  219. , first + len1+len2, comp, xbuf);
  220. }
  221. else{
  222. const size_type len = size_type(len1+len2);
  223. //Calculate ideal parameters and try to collect needed unique keys
  224. size_type l_block = size_type(ceil_sqrt(len));
  225. //One range is not big enough to extract keys and the internal buffer so a
  226. //rotation-based based merge will do just fine
  227. if(len1 <= l_block*2 || len2 <= l_block*2){
  228. merge_bufferless(first, first+len1, first+len1+len2, comp);
  229. return;
  230. }
  231. //Detail the number of keys and internal buffer. If xbuf has enough memory, no
  232. //internal buffer is needed so l_intbuf will remain 0.
  233. size_type l_intbuf = 0;
  234. size_type n_keys = adaptive_merge_n_keys_intbuf(l_block, len1, len2, xbuf, l_intbuf);
  235. size_type const to_collect = size_type(l_intbuf+n_keys);
  236. //Try to extract needed unique values from the first range
  237. size_type const collected = collect_unique(first, first+len1, to_collect, comp, xbuf);
  238. BOOST_MOVE_ADAPTIVE_SORT_PRINT_L1("\n A collect: ", len);
  239. //Not the minimum number of keys is not available on the first range, so fallback to rotations
  240. if(collected != to_collect && collected < 4){
  241. merge_bufferless(first, first+collected, first+len1, comp);
  242. merge_bufferless(first, first + len1, first + len1 + len2, comp);
  243. return;
  244. }
  245. //If not enough keys but more than minimum, adjust the internal buffer and key count
  246. bool use_internal_buf = collected == to_collect;
  247. if (!use_internal_buf){
  248. l_intbuf = 0u;
  249. n_keys = collected;
  250. l_block = lblock_for_combine(l_intbuf, n_keys, len, use_internal_buf);
  251. //If use_internal_buf is false, then then internal buffer will be zero and rotation-based combination will be used
  252. l_intbuf = use_internal_buf ? l_block : 0u;
  253. }
  254. bool const xbuf_used = collected == to_collect && xbuf.capacity() >= l_block;
  255. //Merge trailing elements using smart merges
  256. adaptive_merge_combine_blocks(first, len1, len2, collected, n_keys, l_block, use_internal_buf, xbuf_used, comp, xbuf);
  257. //Merge buffer and keys with the rest of the values
  258. adaptive_merge_final_merge (first, len1, len2, collected, l_intbuf, l_block, use_internal_buf, xbuf_used, comp, xbuf);
  259. }
  260. }
  261. } //namespace detail_adaptive {
  262. ///@endcond
  263. //! <b>Effects</b>: Merges two consecutive sorted ranges [first, middle) and [middle, last)
  264. //! into one sorted range [first, last) according to the given comparison function comp.
  265. //! The algorithm is stable (if there are equivalent elements in the original two ranges,
  266. //! the elements from the first range (preserving their original order) precede the elements
  267. //! from the second range (preserving their original order).
  268. //!
  269. //! <b>Requires</b>:
  270. //! - RandIt must meet the requirements of ValueSwappable and RandomAccessIterator.
  271. //! - The type of dereferenced RandIt must meet the requirements of MoveAssignable and MoveConstructible.
  272. //!
  273. //! <b>Parameters</b>:
  274. //! - first: the beginning of the first sorted range.
  275. //! - middle: the end of the first sorted range and the beginning of the second
  276. //! - last: the end of the second sorted range
  277. //! - comp: comparison function object which returns true if the first argument is is ordered before the second.
  278. //! - uninitialized, uninitialized_len: raw storage starting on "uninitialized", able to hold "uninitialized_len"
  279. //! elements of type iterator_traits<RandIt>::value_type. Maximum performance is achieved when uninitialized_len
  280. //! is min(std::distance(first, middle), std::distance(middle, last)).
  281. //!
  282. //! <b>Throws</b>: If comp throws or the move constructor, move assignment or swap of the type
  283. //! of dereferenced RandIt throws.
  284. //!
  285. //! <b>Complexity</b>: Always K x O(N) comparisons and move assignments/constructors/swaps.
  286. //! Constant factor for comparisons and data movement is minimized when uninitialized_len
  287. //! is min(std::distance(first, middle), std::distance(middle, last)).
  288. //! Pretty good enough performance is achieved when uninitialized_len is
  289. //! ceil(sqrt(std::distance(first, last)))*2.
  290. //!
  291. //! <b>Caution</b>: Experimental implementation, not production-ready.
  292. template<class RandIt, class Compare>
  293. void adaptive_merge( RandIt first, RandIt middle, RandIt last, Compare comp
  294. , typename iterator_traits<RandIt>::value_type* uninitialized = 0
  295. , typename iter_size<RandIt>::type uninitialized_len = 0)
  296. {
  297. typedef typename iter_size<RandIt>::type size_type;
  298. typedef typename iterator_traits<RandIt>::value_type value_type;
  299. if (first == middle || middle == last){
  300. return;
  301. }
  302. //Reduce ranges to merge if possible
  303. do {
  304. if (comp(*middle, *first)){
  305. break;
  306. }
  307. ++first;
  308. if (first == middle)
  309. return;
  310. } while(1);
  311. RandIt first_high(middle);
  312. --first_high;
  313. do {
  314. --last;
  315. if (comp(*last, *first_high)){
  316. ++last;
  317. break;
  318. }
  319. if (last == middle)
  320. return;
  321. } while(1);
  322. ::boost::movelib::adaptive_xbuf<value_type, value_type*, size_type> xbuf(uninitialized, size_type(uninitialized_len));
  323. ::boost::movelib::detail_adaptive::adaptive_merge_impl(first, size_type(middle - first), size_type(last - middle), comp, xbuf);
  324. }
  325. } //namespace movelib {
  326. } //namespace boost {
  327. #if defined(BOOST_CLANG) || (defined(BOOST_GCC) && (BOOST_GCC >= 40600))
  328. #pragma GCC diagnostic pop
  329. #endif
  330. #include <boost/move/detail/config_end.hpp>
  331. #endif //#define BOOST_MOVE_ADAPTIVE_MERGE_HPP