You mentioned that the size of the array was always known to be 23
. Moreover, the type used is unsigned short
. In this case, you might try to use a sorting network of size 23
; since your type is unsigned short
, sorting the whole array with a sorting network might be even faster than partially sorting it with std::nth_element
. Here is a very straightforward C++14 implementation of a sorting network of size 23
with 118
compare-exchange units, as described by Using Symmetry and Evolutionary Search to Minimize Sorting Networks:
template<typename RandomIt, typename Compare = std::less<>>
void network_sort23(RandomIt first, Compare compare={})
{
swap_if(first[1u], first[20u], compare);
swap_if(first[2u], first[21u], compare);
swap_if(first[5u], first[13u], compare);
swap_if(first[9u], first[17u], compare);
swap_if(first[0u], first[7u], compare);
swap_if(first[15u], first[22u], compare);
swap_if(first[4u], first[11u], compare);
swap_if(first[6u], first[12u], compare);
swap_if(first[10u], first[16u], compare);
swap_if(first[8u], first[18u], compare);
swap_if(first[14u], first[19u], compare);
swap_if(first[3u], first[8u], compare);
swap_if(first[4u], first[14u], compare);
swap_if(first[11u], first[18u], compare);
swap_if(first[2u], first[6u], compare);
swap_if(first[16u], first[20u], compare);
swap_if(first[0u], first[9u], compare);
swap_if(first[13u], first[22u], compare);
swap_if(first[5u], first[15u], compare);
swap_if(first[7u], first[17u], compare);
swap_if(first[1u], first[10u], compare);
swap_if(first[12u], first[21u], compare);
swap_if(first[8u], first[19u], compare);
swap_if(first[17u], first[22u], compare);
swap_if(first[0u], first[5u], compare);
swap_if(first[20u], first[21u], compare);
swap_if(first[1u], first[2u], compare);
swap_if(first[18u], first[19u], compare);
swap_if(first[3u], first[4u], compare);
swap_if(first[21u], first[22u], compare);
swap_if(first[0u], first[1u], compare);
swap_if(first[19u], first[22u], compare);
swap_if(first[0u], first[3u], compare);
swap_if(first[12u], first[13u], compare);
swap_if(first[9u], first[10u], compare);
swap_if(first[6u], first[15u], compare);
swap_if(first[7u], first[16u], compare);
swap_if(first[8u], first[11u], compare);
swap_if(first[11u], first[14u], compare);
swap_if(first[4u], first[11u], compare);
swap_if(first[6u], first[8u], compare);
swap_if(first[14u], first[16u], compare);
swap_if(first[17u], first[20u], compare);
swap_if(first[2u], first[5u], compare);
swap_if(first[9u], first[12u], compare);
swap_if(first[10u], first[13u], compare);
swap_if(first[15u], first[18u], compare);
swap_if(first[10u], first[11u], compare);
swap_if(first[4u], first[7u], compare);
swap_if(first[20u], first[21u], compare);
swap_if(first[1u], first[2u], compare);
swap_if(first[7u], first[15u], compare);
swap_if(first[3u], first[9u], compare);
swap_if(first[13u], first[19u], compare);
swap_if(first[16u], first[18u], compare);
swap_if(first[8u], first[14u], compare);
swap_if(first[4u], first[6u], compare);
swap_if(first[18u], first[21u], compare);
swap_if(first[1u], first[4u], compare);
swap_if(first[19u], first[21u], compare);
swap_if(first[1u], first[3u], compare);
swap_if(first[9u], first[10u], compare);
swap_if(first[11u], first[13u], compare);
swap_if(first[2u], first[6u], compare);
swap_if(first[16u], first[20u], compare);
swap_if(first[4u], first[9u], compare);
swap_if(first[13u], first[18u], compare);
swap_if(first[19u], first[20u], compare);
swap_if(first[2u], first[3u], compare);
swap_if(first[18u], first[20u], compare);
swap_if(first[2u], first[4u], compare);
swap_if(first[5u], first[17u], compare);
swap_if(first[12u], first[14u], compare);
swap_if(first[8u], first[12u], compare);
swap_if(first[5u], first[7u], compare);
swap_if(first[15u], first[17u], compare);
swap_if(first[5u], first[8u], compare);
swap_if(first[14u], first[17u], compare);
swap_if(first[3u], first[5u], compare);
swap_if(first[17u], first[19u], compare);
swap_if(first[3u], first[4u], compare);
swap_if(first[18u], first[19u], compare);
swap_if(first[6u], first[10u], compare);
swap_if(first[11u], first[16u], compare);
swap_if(first[13u], first[16u], compare);
swap_if(first[6u], first[9u], compare);
swap_if(first[16u], first[17u], compare);
swap_if(first[5u], first[6u], compare);
swap_if(first[4u], first[5u], compare);
swap_if(first[7u], first[9u], compare);
swap_if(first[17u], first[18u], compare);
swap_if(first[12u], first[15u], compare);
swap_if(first[14u], first[15u], compare);
swap_if(first[8u], first[12u], compare);
swap_if(first[7u], first[8u], compare);
swap_if(first[13u], first[15u], compare);
swap_if(first[15u], first[17u], compare);
swap_if(first[5u], first[7u], compare);
swap_if(first[9u], first[10u], compare);
swap_if(first[10u], first[14u], compare);
swap_if(first[6u], first[11u], compare);
swap_if(first[14u], first[16u], compare);
swap_if(first[15u], first[16u], compare);
swap_if(first[6u], first[7u], compare);
swap_if(first[10u], first[11u], compare);
swap_if(first[9u], first[12u], compare);
swap_if(first[11u], first[13u], compare);
swap_if(first[13u], first[14u], compare);
swap_if(first[8u], first[9u], compare);
swap_if(first[7u], first[8u], compare);
swap_if(first[14u], first[15u], compare);
swap_if(first[9u], first[10u], compare);
swap_if(first[8u], first[9u], compare);
swap_if(first[12u], first[14u], compare);
swap_if(first[11u], first[12u], compare);
swap_if(first[12u], first[13u], compare);
swap_if(first[10u], first[11u], compare);
swap_if(first[11u], first[12u], compare);
}
The swap_if
utility function compares two parameters x
and y
with the predicate compare
and swaps them if compare(y, x)
. My example uses a a generic swap_if
function, but you can used an optimized version if you known that you will be comparing unsigned short
values with operator<
anyway (you might not need such a function if your compiler recognizes and optimizes the compare-exchange, but unfortunately, not all compilers do that - I am using g++5.2 with -O3
and I still need the following function for performance):
void swap_if(unsigned short& x, unsigned short& y)
{
unsigned short dx = x;
unsigned short dy = y;
unsigned short tmp = x = std::min(dx, dy);
y ^= dx ^ tmp;
}
Now, just to make sure that it is indeed faster, I decided to time std::nth_element
when required to partial sort only the first 10 elements vs. sorting the whole 23 elements with the sorting network (1000000 times with different shuffled arrays). Here is what I get:
std::nth_element 1158ms
network_sort23 487ms
That said, my computer has been running for a bit of time and is a bit slow, but the difference in performance is neat. I believe that this difference will remain the same when I restart my computer. I may try it later and let you know.
Regarding how these times were generated, I used a modified version of this benchmark from my cpp-sort library. The original sorting network and swap_if
functions come from there as well, so you can be sure that they have been tested more than once :)
EDIT: here are the results now that I have restarted my computer. The network_sort23
version is still two times faster than std::nth_element
:
std::nth_element 369ms
network_sort23 154ms
EDIT²: if all you need in the median, you can trivially delete the compare-exchange units that are not needed to compute the final value that will be at the 11th position. The resulting median-finding network of size 23 that follows uses a different size-23 sorting network than the previous one, and it yields slightly better results:
swap_if(first[0u], first[1u], compare);
swap_if(first[2u], first[3u], compare);
swap_if(first[4u], first[5u], compare);
swap_if(first[6u], first[7u], compare);
swap_if(first[8u], first[9u], compare);
swap_if(first[10u], first[11u], compare);
swap_if(first[1u], first[3u], compare);
swap_if(first[5u], first[7u], compare);
swap_if(first[9u], first[11u], compare);
swap_if(first[0u], first[2u], compare);
swap_if(first[4u], first[6u], compare);
swap_if(first[8u], first[10u], compare);
swap_if(first[1u], first[2u], compare);
swap_if(first[5u], first[6u], compare);
swap_if(first[9u], first[10u], compare);
swap_if(first[1u], first[5u], compare);
swap_if(first[6u], first[10u], compare);
swap_if(first[5u], first[9u], compare);
swap_if(first[2u], first[6u], compare);
swap_if(first[1u], first[5u], compare);
swap_if(first[6u], first[10u], compare);
swap_if(first[0u], first[4u], compare);
swap_if(first[7u], first[11u], compare);
swap_if(first[3u], first[7u], compare);
swap_if(first[4u], first[8u], compare);
swap_if(first[0u], first[4u], compare);
swap_if(first[7u], first[11u], compare);
swap_if(first[1u], first[4u], compare);
swap_if(first[7u], first[10u], compare);
swap_if(first[3u], first[8u], compare);
swap_if(first[2u], first[3u], compare);
swap_if(first[8u], first[9u], compare);
swap_if(first[2u], first[4u], compare);
swap_if(first[7u], first[9u], compare);
swap_if(first[3u], first[5u], compare);
swap_if(first[6u], first[8u], compare);
swap_if(first[3u], first[4u], compare);
swap_if(first[5u], first[6u], compare);
swap_if(first[7u], first[8u], compare);
swap_if(first[12u], first[13u], compare);
swap_if(first[14u], first[15u], compare);
swap_if(first[16u], first[17u], compare);
swap_if(first[18u], first[19u], compare);
swap_if(first[20u], first[21u], compare);
swap_if(first[13u], first[15u], compare);
swap_if(first[17u], first[19u], compare);
swap_if(first[12u], first[14u], compare);
swap_if(first[16u], first[18u], compare);
swap_if(first[20u], first[22u], compare);
swap_if(first[13u], first[14u], compare);
swap_if(first[17u], first[18u], compare);
swap_if(first[21u], first[22u], compare);
swap_if(first[13u], first[17u], compare);
swap_if(first[18u], first[22u], compare);
swap_if(first[17u], first[21u], compare);
swap_if(first[14u], first[18u], compare);
swap_if(first[13u], first[17u], compare);
swap_if(first[18u], first[22u], compare);
swap_if(first[12u], first[16u], compare);
swap_if(first[15u], first[19u], compare);
swap_if(first[16u], first[20u], compare);
swap_if(first[12u], first[16u], compare);
swap_if(first[13u], first[16u], compare);
swap_if(first[19u], first[22u], compare);
swap_if(first[15u], first[20u], compare);
swap_if(first[14u], first[15u], compare);
swap_if(first[20u], first[21u], compare);
swap_if(first[14u], first[16u], compare);
swap_if(first[19u], first[21u], compare);
swap_if(first[15u], first[17u], compare);
swap_if(first[18u], first[20u], compare);
swap_if(first[15u], first[16u], compare);
swap_if(first[17u], first[18u], compare);
swap_if(first[19u], first[20u], compare);
swap_if(first[0u], first[12u], compare);
swap_if(first[2u], first[14u], compare);
swap_if(first[4u], first[16u], compare);
swap_if(first[6u], first[18u], compare);
swap_if(first[8u], first[20u], compare);
swap_if(first[10u], first[22u], compare);
swap_if(first[2u], first[12u], compare);
swap_if(first[10u], first[20u], compare);
swap_if(first[4u], first[12u], compare);
swap_if(first[6u], first[14u], compare);
swap_if(first[8u], first[16u], compare);
swap_if(first[10u], first[18u], compare);
swap_if(first[8u], first[12u], compare);
swap_if(first[10u], first[14u], compare);
swap_if(first[10u], first[12u], compare);
swap_if(first[1u], first[13u], compare);
swap_if(first[3u], first[15u], compare);
swap_if(first[5u], first[17u], compare);
swap_if(first[7u], first[19u], compare);
swap_if(first[9u], first[21u], compare);
swap_if(first[3u], first[13u], compare);
swap_if(first[11u], first[21u], compare);
swap_if(first[5u], first[13u], compare);
swap_if(first[7u], first[15u], compare);
swap_if(first[9u], first[17u], compare);
swap_if(first[11u], first[19u], compare);
swap_if(first[9u], first[13u], compare);
swap_if(first[11u], first[15u], compare);
swap_if(first[11u], first[13u], compare);
swap_if(first[11u], first[12u], compare);
There are probably smarter ways to generate median-finding networks, but I don't think that extensive research has been done on the subject. Therefore, it's probably the best method you can use as of now. The result isn't awesome but it still uses 104 compare-exchange units instead of 118.
arr[0]
?) – Sandrocottuspiksort
is faster it's due to cache friendliness, not inlining. Andsort
andnth_element
implementations are quite different. So I would suspect thatnth_element
is more cache friendly. You could try implementing actualpiko_nth_element
instead of full fledgedsort
. Also thispiksort
looks like bubble sort, or am I missing something. – Medamedalstd::nth_element
. Blog-post here: disnetwork.info/the-blog/median-value-selection-algorithm – Pentad