This post includes:
- Wrapper classes over realloc and mremap that provide reallocation functionality.
- A custom vector class.
- A performance test.
// C++17
#include <benchmark/benchmark.h> // Google benchmark lib, for the benchmark.
#include <new> // For std::bad_alloc.
#include <memory> // For std::allocator_traits, std::allocator.
#include <cstdlib> // For C heap management API.
#include <cstddef> // For std::size_t, std::max_align_t.
#include <cassert> // For assert.
#include <utility> // For std::forward, std::move.
namespace linux {
#include <sys/mman.h> // For mmap, mremap, munmap.
#include <errno.h> // For errno.
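// errno is a macro, so it cannot be qualified with linux::; expose it through
// a small wrapper function instead.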
auto get_errno() noexcept {
return errno;
}
}
/*
 * Allocators.
 * These allocators will exhibit non-standard-compliant behavior if T's copy
 * constructor has side effects.
 */
// class mrealloc is useful for allocating small amounts of space for
// std::vector.
//
// It can avoid copying the data and fragmenting memory when there is enough
// contiguous memory at the original location (see the sketch after the class).
template <class T>
struct mrealloc {
using pointer = T*;
using value_type = T;
auto allocate(std::size_t len) {
if (auto ret = std::malloc(len * sizeof(T)))
return static_cast<pointer>(ret);
else
throw std::bad_alloc();
}
auto reallocate(pointer old_ptr, std::size_t old_len, std::size_t len) {
if (auto ret = std::realloc(old_ptr, len * sizeof(T)))
return static_cast<pointer>(ret);
else
throw std::bad_alloc();
}
void deallocate(pointer ptr, std::size_t len) noexcept {
std::free(ptr);
}
};
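// A minimal sketch (my own illustration, not used by the benchmark): realloc
// may extend the block in place, in which case the returned pointer equals
// the original one and no bytes are copied.
static bool realloc_grew_in_place(std::size_t old_size, std::size_t new_size) {
void *p = std::malloc(old_size);
void *q = std::realloc(p, new_size);
bool in_place = (q != nullptr && q == p);
std::free(q != nullptr ? q : p); // if realloc failed, the original block is still valid
return in_place;
}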
// class mmaprealloc is suitable for large memory use.
//
// It is useful when a std::vector can grow to a huge size.
//
// The user can call reserve without worrying about wasting a lot of memory.
//
// It avoids data copies and memory fragmentation at any size.
template <class T>
struct mmaprealloc {
using pointer = T*;
using value_type = T;
auto allocate(std::size_t len) const
{
return allocate_impl(len, MAP_PRIVATE | MAP_ANONYMOUS);
}
auto reallocate(pointer old_ptr, std::size_t old_len, std::size_t len) const
{
return reallocate_impl(old_ptr, old_len, len, MREMAP_MAYMOVE);
}
void deallocate(pointer ptr, std::size_t len) const noexcept
{
auto ret = linux::munmap(ptr, get_proper_size(len * sizeof(T)));
(void) ret; // keep the call outside assert so it is not dropped under NDEBUG
assert(ret == 0);
}
protected:
auto allocate_impl(std::size_t _len, int flags) const
{
// mmap reports failure with MAP_FAILED, not a null pointer.
auto ret = linux::mmap(nullptr, get_proper_size(_len * sizeof(T)), PROT_READ | PROT_WRITE, flags, -1, 0);
if (ret != MAP_FAILED)
return static_cast<pointer>(ret);
else
fail();
}
auto reallocate_impl(pointer old_ptr, std::size_t old_len, std::size_t _len, int flags) const
{
auto ret = linux::mremap(old_ptr, get_proper_size(old_len * sizeof(T)), get_proper_size(_len * sizeof(T)), flags);
if (ret != MAP_FAILED)
return static_cast<pointer>(ret);
else
fail();
}
static inline constexpr const std::size_t magic_num = 4096 - 1;
static inline auto get_proper_size(std::size_t len) noexcept -> std::size_t {
return round_to_pagesize(len);
}
static inline auto round_to_pagesize(std::size_t len) noexcept -> std::size_t {
return (len + magic_num) & ~magic_num;
}
[[noreturn]] static void fail()
{
auto _errno = linux::get_errno();
assert(_errno == EAGAIN || _errno == ENOMEM);
(void) _errno;
throw std::bad_alloc();
}
};
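// MAP_POPULATE asks the kernel to prefault the mapping, so the pages are
// already backed when the vector first writes to them instead of being
// faulted in one page at a time.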
template <class T>
struct mmaprealloc_populate_ver: mmaprealloc<T> {
auto allocate(std::size_t len) const
{
return mmaprealloc<T>::allocate_impl(len, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE);
}
};
namespace impl {
struct disambiguation_t2 {};
struct disambiguation_t1 {
constexpr operator disambiguation_t2() const noexcept { return {}; }
};
template <class Alloc>
static constexpr auto has_reallocate(disambiguation_t1) noexcept -> decltype(&Alloc::reallocate, bool{}) { return true; }
template <class Alloc>
static constexpr bool has_reallocate(disambiguation_t2) noexcept { return false; }
template <class Alloc>
static inline constexpr const bool has_reallocate_v = has_reallocate<Alloc>(disambiguation_t1{});
} /* impl */
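// Quick illustrative sanity checks (my addition): the detection trait sees
// reallocate on the custom allocators but not on std::allocator.
static_assert(impl::has_reallocate_v<mrealloc<char>>, "mrealloc provides reallocate");
static_assert(!impl::has_reallocate_v<std::allocator<char>>, "std::allocator does not provide reallocate");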
template <class Alloc>
struct allocator_traits: public std::allocator_traits<Alloc> {
using Base = std::allocator_traits<Alloc>;
using value_type = typename Base::value_type;
using pointer = typename Base::pointer;
using size_t = typename Base::size_type;
static auto reallocate(Alloc &alloc, pointer prev_ptr, size_t prev_len, size_t new_len) {
if constexpr(impl::has_reallocate_v<Alloc>)
return alloc.reallocate(prev_ptr, prev_len, new_len);
else {
auto new_ptr = Base::allocate(alloc, new_len);
// Move the existing elements into the new buffer, destroying each old
// element after it has been moved from.
for(auto _prev_ptr = prev_ptr, _new_ptr = new_ptr; _prev_ptr != prev_ptr + prev_len; ++_prev_ptr, ++_new_ptr) {
new (_new_ptr) value_type(std::move(*_prev_ptr));
_prev_ptr->~value_type();
}
Base::deallocate(alloc, prev_ptr, prev_len);
return new_ptr;
}
}
};
template <class T, class Alloc = std::allocator<T>>
struct vector: protected Alloc {
using alloc_traits = allocator_traits<Alloc>;
using pointer = typename alloc_traits::pointer;
using size_t = typename alloc_traits::size_type;
pointer ptr = nullptr;
size_t last = 0;
size_t avail = 0;
~vector() noexcept {
for (size_t i = 0; i < last; ++i)
alloc_traits::destroy(*this, &ptr[i]);
if (ptr)
alloc_traits::deallocate(*this, ptr, avail);
}
template <class ...Args>
void emplace_back(Args &&...args) {
if (last == avail)
double_the_size();
alloc_traits::construct(*this, &ptr[last++], std::forward<Args>(args)...);
}
void double_the_size() {
if (__builtin_expect(!!(avail), true)) {
avail <<= 1;
ptr = alloc_traits::reallocate(*this, ptr, last, avail);
} else {
avail = 1 << 4;
ptr = alloc_traits::allocate(*this, avail);
}
}
};
template <class T>
static void BM_vector(benchmark::State &state) {
for(auto _: state) {
T c;
for(auto i = state.range(0); --i >= 0; )
c.emplace_back((char)i);
}
}
static constexpr const auto one_GB = 1 << 30;
BENCHMARK_TEMPLATE(BM_vector, vector<char>) ->Range(1 << 3, one_GB);
BENCHMARK_TEMPLATE(BM_vector, vector<char, mrealloc<char>>) ->Range(1 << 3, one_GB);
BENCHMARK_TEMPLATE(BM_vector, vector<char, mmaprealloc<char>>) ->Range(1 << 3, one_GB);
BENCHMARK_TEMPLATE(BM_vector, vector<char, mmaprealloc_populate_ver<char>>)->Range(1 << 3, one_GB);
BENCHMARK_MAIN();
- Performance test.
All the performance tests were done on:
Debian 9.4, Linux version 4.9.0-6-amd64 ([email protected]) (gcc version 6.3.0 20170516 (Debian 6.3.0-18+deb9u1)) #1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02)
Compiled using: clang++ -std=c++17 -lbenchmark -lpthread -Ofast main.cc
The commands I used to run the test:
sudo cpupower frequency-set --governor performance
./a.out
Here is the output of the Google Benchmark run:
Run on (8 X 1600 MHz CPU s)
CPU Caches:
L1 Data 32K (x4)
L1 Instruction 32K (x4)
L2 Unified 256K (x4)
L3 Unified 6144K (x1)
----------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------------------------------------------------
BM_vector<vector<char>>/8 58 ns 58 ns 11476934
BM_vector<vector<char>>/64 324 ns 324 ns 2225396
BM_vector<vector<char>>/512 1527 ns 1527 ns 453629
BM_vector<vector<char>>/4096 7196 ns 7196 ns 96695
BM_vector<vector<char>>/32768 50145 ns 50140 ns 13655
BM_vector<vector<char>>/262144 549821 ns 549825 ns 1245
BM_vector<vector<char>>/2097152 5007342 ns 5006393 ns 146
BM_vector<vector<char>>/16777216 42873349 ns 42873462 ns 15
BM_vector<vector<char>>/134217728 336225619 ns 336097218 ns 2
BM_vector<vector<char>>/1073741824 2642934606 ns 2642803281 ns 1
BM_vector<vector<char, mrealloc<char>>>/8 55 ns 55 ns 12914365
BM_vector<vector<char, mrealloc<char>>>/64 266 ns 266 ns 2591225
BM_vector<vector<char, mrealloc<char>>>/512 1229 ns 1229 ns 567505
BM_vector<vector<char, mrealloc<char>>>/4096 6903 ns 6903 ns 102752
BM_vector<vector<char, mrealloc<char>>>/32768 48522 ns 48523 ns 14409
BM_vector<vector<char, mrealloc<char>>>/262144 399470 ns 399368 ns 1783
BM_vector<vector<char, mrealloc<char>>>/2097152 3048578 ns 3048619 ns 229
BM_vector<vector<char, mrealloc<char>>>/16777216 24426934 ns 24421774 ns 29
BM_vector<vector<char, mrealloc<char>>>/134217728 262355961 ns 262357084 ns 3
BM_vector<vector<char, mrealloc<char>>>/1073741824 2092577020 ns 2092317044 ns 1
BM_vector<vector<char, mmaprealloc<char>>>/8 4285 ns 4285 ns 161498
BM_vector<vector<char, mmaprealloc<char>>>/64 5485 ns 5485 ns 125375
BM_vector<vector<char, mmaprealloc<char>>>/512 8571 ns 8569 ns 80345
BM_vector<vector<char, mmaprealloc<char>>>/4096 24248 ns 24248 ns 28655
BM_vector<vector<char, mmaprealloc<char>>>/32768 165021 ns 165011 ns 4421
BM_vector<vector<char, mmaprealloc<char>>>/262144 1177041 ns 1177048 ns 557
BM_vector<vector<char, mmaprealloc<char>>>/2097152 9229860 ns 9230023 ns 74
BM_vector<vector<char, mmaprealloc<char>>>/16777216 75425704 ns 75426431 ns 9
BM_vector<vector<char, mmaprealloc<char>>>/134217728 607661012 ns 607662273 ns 1
BM_vector<vector<char, mmaprealloc<char>>>/1073741824 4871003928 ns 4870588050 ns 1
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/8 3956 ns 3956 ns 175037
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/64 5087 ns 5086 ns 133944
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/512 8662 ns 8662 ns 80579
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/4096 23883 ns 23883 ns 29265
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/32768 158374 ns 158376 ns 4444
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/262144 1171514 ns 1171522 ns 593
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/2097152 9297357 ns 9293770 ns 74
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/16777216 75140789 ns 75141057 ns 9
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/134217728 636359403 ns 636368640 ns 1
BM_vector<vector<char, mmaprealloc_populate_ver<char>>>/1073741824 4865103542 ns 4864582150 ns 1
Comments:
- .capacity() – Schismatic
- b.capacity(), I get powers of 2. Using GCC. – Hexagonal
- vector must always use a geometric growth rate. push_back has amortized constant complexity, which can't be accomplished with an arithmetic growth rate. – Murre
- reserve(), how do you know how much to new[]? – Suppletion
- b.reserve(size), which is basically the same as int * a = new int [size];. – Ockham
- volatile int * a = new int [size]; in your C version. – Chuckwalla
- volatile only makes the volatile-qualified thingy ordered with respect to observable side effects. It is more direct simply to create a visible side effect, for example returning the vector's final size from main(). For anything other than reading/writing memory-mapped hardware and the like, volatile is intrinsically a dirty word. It does not mean what 99.9% of programmers think it does. – Woodshed
- b[i]=i for this purpose, so it is not enough? – Olli
- volatile, it forces those assignments to actually be performed. Many people misusing it doesn't mean it's evil in every case. – Chuckwalla
- std::cout << whatever[size - 1]; should do it, or making the variable volatile will do it as well. – Ockham
- realloc, due to requirements on allocator usage. I guess in theory there could be a specialization for the default allocator of trivially copyable types, but I wouldn't think any standard library implementation does that. It would be more fair to compare against malloc, memcpy, free. – Rompish
- operator new is user-replaceable, which means you need whole-program optimization and cooperation between compiler and library for std::vector to be able to use realloc instead of always copying, even if it could have just left 1 GiB of data in place and mmaped new memory at the end. But that's hard, so AFAIK no real implementations do this optimization. Similarly, they don't optimize a zero-initializer to calloc to take advantage of already-zeroed memory, instead always dirtying it. That makes it more expensive to allocate extra and then shrink to fit. – Influenza
- operator new, but didn't think it was necessary to go into such details. I do wonder if it's undefined behavior to write a user-defined operator new for primitive types. If so, it would become doable to have the optimization for vector<int> and the like. – Rompish
- std::vector without whole-program optimization. BTW, "Is it guaranteed that C++ standard library containers call the replaceable new functions?" was asked specifically to find out if this optimization would be possible for a C++ library without checking if new was replaced. – Influenza
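For reference, a minimal sketch of the malloc/memcpy/free baseline suggested in the comments above (my own illustration, not code from the post; the helper name grow_by_copy is hypothetical): grow a raw buffer the way a realloc-less vector effectively must, by allocating a larger block, copying the old bytes, and freeing the old block.
#include <cstddef>
#include <cstdlib>
#include <cstring>
// Grow a raw char buffer without realloc: allocate, copy, free.
static char *grow_by_copy(char *old_ptr, std::size_t old_len, std::size_t new_len) {
    auto new_ptr = static_cast<char *>(std::malloc(new_len));
    if (!new_ptr)
        return nullptr; // allocation failed; caller still owns old_ptr
    std::memcpy(new_ptr, old_ptr, old_len);
    std::free(old_ptr);
    return new_ptr;
}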