This is for C++11. Older standards require additional members to be
implemented in the allocator (pointer/reference typedefs, construct(),
destroy(), max_size() and so on) [1].
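For reference, a rough sketch of those extra pre-C++11 members is shown below. The class name OldStyleAllocator is mine and none of this is needed for the C++11 code that follows, since std::allocator_traits supplies defaults for all of it:

#include <cstddef>
#include <limits>
#include <new>

// Sketch only: the members a C++03-style allocator has to provide on top of
// allocate()/deallocate().  Equality operators would also be required.
template<typename T>
struct OldStyleAllocator
{
    typedef T value_type;
    typedef T* pointer;
    typedef T const* const_pointer;
    typedef T& reference;
    typedef T const& const_reference;
    typedef std::size_t size_type;
    typedef std::ptrdiff_t difference_type;

    template<typename U> struct rebind { typedef OldStyleAllocator<U> other; };

    pointer address(reference x) const { return &x; }
    size_type max_size() const { return std::numeric_limits<size_type>::max()/sizeof(T); }
    void construct(pointer p, T const& val) { ::new(static_cast<void*>(p)) T(val); }
    void destroy(pointer p) { p->~T(); }

    pointer allocate(size_type n, void const* =nullptr) { return static_cast<pointer>(::operator new(n*sizeof(T))); }
    void deallocate(pointer p, size_type) { ::operator delete(p); }
};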
This is proof-of-concept code. It runs and solves the example
problem, but it suffers from several limitations. It still
demonstrates how a custom allocator can be used to improve
performance in a scenario where a lot of std::vector instances are
created and destroyed.
PoolAlloc.hh:

#include <cstddef>
#include <type_traits>
#include <vector>

// One contiguous slab of memory, handed out linearly from the top.
template<typename T>
struct MemChunk
{
    std::size_t buf_size=0;  // capacity of buf, in elements
    T* buf=nullptr;          // start of the slab
    T* top=nullptr;          // next free position
    std::size_t used=0;      // number of elements currently handed out
};
template<typename T>
class PoolAllocator
{
public:
    using value_type=T;

    PoolAllocator();
    explicit PoolAllocator(std::size_t);
    PoolAllocator(PoolAllocator const&) noexcept;
    template<typename U>
    PoolAllocator(PoolAllocator<U> const&) noexcept;
    PoolAllocator(PoolAllocator&&) noexcept;
    PoolAllocator& operator=(PoolAllocator const&)=delete;
    PoolAllocator& operator=(PoolAllocator&&)=delete;
    ~PoolAllocator();

    template <typename U>
    struct rebind
    {
        using other=PoolAllocator<U>;
    };

    T* allocate(std::size_t);
    void deallocate(T*, std::size_t) noexcept;

    template<typename U1, typename U2>
    friend bool operator==(PoolAllocator<U1> const&, PoolAllocator<U2> const&) noexcept;

    // Needed so the converting constructor below can read another
    // instantiation's private members.
    template<typename U>
    friend class PoolAllocator;

private:
    std::vector<MemChunk<T>>* memory_=nullptr;  // slabs, shared between copies of the allocator
    int* ref_count_=nullptr;                    // manual reference count for the shared state
    std::size_t default_buf_size_=0;            // size of the next slab to create
};
template<typename T>
PoolAllocator<T>::PoolAllocator():
    PoolAllocator{100000} {}

template<typename T>
PoolAllocator<T>::PoolAllocator(std::size_t buf_size):
    memory_{new std::vector<MemChunk<T>>},
    ref_count_{new int(0)},
    default_buf_size_{buf_size}
{
    memory_->emplace_back();
    memory_->back().buf_size=buf_size;
    memory_->back().buf=new T[buf_size];
    memory_->back().top=memory_->back().buf;
    ++(*ref_count_);
}
template<typename T>
PoolAllocator<T>::PoolAllocator(PoolAllocator const& src) noexcept:
    memory_{src.memory_},
    ref_count_{src.ref_count_},
    default_buf_size_{src.default_buf_size_}
{
    ++(*ref_count_);
}

template<typename T>
PoolAllocator<T>::PoolAllocator(PoolAllocator&& src) noexcept:
    memory_{src.memory_},
    ref_count_{src.ref_count_},
    default_buf_size_{src.default_buf_size_}
{
    src.memory_=nullptr;
    src.ref_count_=nullptr;
}

// Converting constructor required by the Allocator requirements.  Note that
// for U!=T the member pointer types differ, so it only actually compiles in
// the U==T case; it is not needed by the example below.
template<typename T>
template<typename U>
PoolAllocator<T>::PoolAllocator(PoolAllocator<U> const& src) noexcept:
    memory_{src.memory_},
    ref_count_{src.ref_count_},
    default_buf_size_{src.default_buf_size_}
{
    ++(*ref_count_);
}
template<typename T>
PoolAllocator<T>::~PoolAllocator()
{
    if (ref_count_!=nullptr)
    {
        --(*ref_count_);
        if (*ref_count_==0)
        {
            if (memory_!=nullptr)
            {
                for (auto& it : *memory_)
                {
                    delete[] it.buf;
                }
                delete memory_;
            }
            delete ref_count_;
        }
    }
}
template<typename T>
T*
PoolAllocator<T>::allocate(std::size_t n)
{
    // Serve the request from the most recent chunk; if it does not fit,
    // open a new, larger chunk.  The old chunks are left untouched so that
    // pointers handed out earlier stay valid.
    MemChunk<T>* mem_chunk=&memory_->back();
    if ((mem_chunk->used+n)>mem_chunk->buf_size)
    {
        default_buf_size_*=2;
        memory_->emplace_back();
        mem_chunk=&memory_->back();
        std::size_t buf_size=default_buf_size_;
        if (n>default_buf_size_)
        {
            buf_size=n;
        }
        mem_chunk->buf_size=buf_size;
        // Note: new T[] default-constructs the elements.  That is fine for
        // trivial types like int, but raw storage (::operator new) would be
        // more appropriate for arbitrary T.
        mem_chunk->buf=new T[mem_chunk->buf_size];
        mem_chunk->top=mem_chunk->buf;
    }
    T* r=mem_chunk->top;
    mem_chunk->top+=n;
    mem_chunk->used+=n;
    return r;
}
template<typename T>
void
PoolAllocator<T>::deallocate(T* addr, std::size_t n) noexcept
{
    // Memory is only reclaimed when it sits at the very end of the most
    // recent chunk (LIFO order); anything else is simply left unused.
    MemChunk<T>* mem_chunk=&memory_->back();
    if (mem_chunk->used>=n and (mem_chunk->top-n)==addr)
    {
        mem_chunk->used-=n;
        mem_chunk->top-=n;
    }
}
template<typename U1, typename U2>
bool operator==(PoolAllocator<U1> const& lhs, PoolAllocator<U2> const& rhs) noexcept
{
    // Compare the pools through void const* so the expression also compiles
    // when U1 and U2 differ (is_same already yields false in that case).
    return (std::is_same<U1, U2>::value
            and static_cast<void const*>(lhs.memory_)==static_cast<void const*>(rhs.memory_));
}
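Strictly, the C++11 Allocator requirements also want a1 != a2 to be valid. The header above does not define it; if you need it, a minimal complement to the operator== could look like this:

template<typename U1, typename U2>
bool operator!=(PoolAllocator<U1> const& lhs, PoolAllocator<U2> const& rhs) noexcept
{
    return !(lhs==rhs);
}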
Using your example modified in the following way:
#include <iostream>
#include <vector>
#include <random>
#include "PoolAlloc.hh"

using namespace std;

int main()
{
    // Random generator and distribution
    mt19937 gen(123456);
    uniform_real_distribution<> dis01(0., 1.);

    PoolAllocator<int> palloc{1000000};

    // Make or delete 10E6 vectors.
    vector<vector<int, PoolAllocator<int>>> v;  // the inner vectors will make many calls to new and delete
    v.reserve(10E5);                            // assume some size
    for(int i=0; i<10E6; ++i)
    {
        if(dis01(gen)<0.6)  // if true: make a new sub-vector
        {
            v.emplace_back(palloc);  // new sub-vector using the shared pool
            v.back().reserve(10);
            for(int k=0; k<10; ++k)
                v.back().emplace_back(k);  // fill it with some numbers
        }
        else  // else, delete the last entry if there is one
            if(!v.empty())
                v.pop_back();
    }
    cout<<"v.size()= "<<v.size();
    return 0;
}
The number of calls to malloc drops from ~6e6 down to 21. The total
number of instructions drops from 3.7e9 to 2.5e9 (using -O3,
measured with valgrind --tool=callgrind).
A couple of implementation details will affect performance depending
on how the allocator is used.
Currently, multiple buffers are used: if one fills up, another one is
created. This way a reallocation operation, which would get you into
a world of hurt (see comments), is never needed, as the sketch below
illustrates.
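To see why a single growing buffer would be fatal, consider what happens to pointers that were already handed out. This is a deliberately broken sketch, not part of the allocator:

#include <iostream>
#include <vector>

int main()
{
    std::vector<int> single_buffer(100, 1);  // imagine this were the pool's only slab
    int* handed_out=single_buffer.data();    // a container already stores this pointer
    single_buffer.resize(1000000);           // the pool "grows": the slab may be moved
    // Using *handed_out now would be undefined behaviour -- the old pointer
    // may dangle, yet the container that received it has no way of knowing.
    std::cout<<single_buffer.size()<<'\n';
    (void)handed_out;
}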
The biggest question is how to deal with deallocated memory.
Currently a trivial approach is used that only makes deallocated
memory available to later allocations when it sits at the end of the
most recent buffer. For your example that is sufficient, as you only
deallocate memory at the end of the buffer.
For more complex scenarios you will need a more sophisticated
mechanism: some data structure is needed to store the addresses and
sizes of free memory chunks. Multiple concepts are possible here, and
their performance will vary with the concrete situation they are used
in; I doubt there is a good one-size-fits-all solution. A minimal
sketch of one such bookkeeping structure follows.
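One possibility, as a rough sketch only (the name FreeList and the multimap-based bookkeeping are my own, not part of the allocator above): keep free blocks ordered by size, take the smallest block that fits on allocation, and return the unused remainder to the list.

#include <cstddef>
#include <map>

// Hypothetical free-list bookkeeping for a pool: allocate() would consult
// take() before carving new memory from a chunk, deallocate() would call add().
template<typename T>
struct FreeList
{
    std::multimap<std::size_t, T*> blocks;  // block size -> start of free block

    void add(T* addr, std::size_t n) { blocks.emplace(n, addr); }

    T* take(std::size_t n)
    {
        auto it=blocks.lower_bound(n);      // smallest free block that fits
        if (it==blocks.end()) return nullptr;
        T* addr=it->second;
        std::size_t rest=it->first-n;
        blocks.erase(it);
        if (rest>0) add(addr+n, rest);      // keep the unused tail available
        return addr;
    }
};

Whether such a structure pays off depends on the allocation pattern; the lookup and bookkeeping themselves cost time that the trivial LIFO scheme avoids.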
[1] http://howardhinnant.github.io/stack_alloc.html
stack_alloc: it works using a fixed-size buffer, falling back to the heap if too much space is requested. – Arletha