Custom allocator performance
Asked Answered
E

4

9

I'm building an AVL tree class which will have a fixed maximum number of items. So I thought instead of allocating each item by itself, I'd just allocate the entire chunk at once and use a bitmap to assign new memory when needed.

My allocation / deallocation code:

avltree::avltree(UINT64 numitems)
{
  // Builds an empty tree. When numitems > 0, one contiguous arena is
  // reserved up front: numitems node slots followed by the occupancy bitmap.
  root = NULL;

  if (!numitems)
    buffer = NULL;  // no arena: newnode/deletenode fall back to new/delete
  else {
    // Single allocation holds the node slots plus the bitlist's storage.
    UINT64 memsize = sizeof(avlnode) * numitems + bitlist::storagesize(numitems);
    buffer = (avlnode *) malloc(memsize);
    if (!buffer)
      return;  // BUGFIX: malloc was unchecked; on OOM behave like numitems == 0
    // The bitmap's backing storage lives immediately after the node slots.
    memmap.init(numitems, buffer + numitems);
    memmap.clear_all();
    freeaddr = 0;  // bump-pointer cursor used before the bitmap is consulted
  }
}

avlnode *avltree::newnode(keytype key)
{
  // No preallocated arena: defer to the global allocator.
  if (!buffer)
    return new avlnode(key);

  // Fast path: hand out never-used slots with a bump pointer. Only once
  // the whole arena has been touched do we search the bitmap for a hole.
  UINT64 slot = (freeaddr < memmap.size_bits)
                    ? freeaddr++
                    : memmap.get_first_unset();
  memmap.set_bit(slot);

  // Construct the node in place inside the arena.
  return new (&buffer[slot]) avlnode(key);
}

void avltree::deletenode(avlnode *node)
{
  // Arena-backed nodes are released by clearing their occupancy bit;
  // otherwise the node came from the global allocator.
  if (buffer)
    memmap.clear_bit(node - buffer);
  else
    delete node;
}

In order to use standard new / delete, I have to construct the tree with numitems == 0. In order to use my own allocator, I just pass number of items. All functions are inlined for maximum performance.

This is all fine and dandy, but my own allocator is about 20% slower than new / delete. Now, I know how complex memory allocators are, there's no way that code can run faster than an array lookup + one bit set, but that is exactly the case here. What's worse: my deallocator is slower even if I remove all code from it?!?

When I check assembly output, my allocator's code path is ridden with QWORD PTR instructions dealing with bitmap, avltree or avlnode. It doesn't seem to be much different for the new / delete path.

For example, assembly output of avltree::newnode:

;avltree::newnode, COMDAT
mov     QWORD PTR [rsp+8], rbx
push    rdi
sub     rsp, 32

;if (!buffer)
cmp     QWORD PTR [rcx+8], 0
mov     edi, edx
mov     rbx, rcx
jne     SHORT $LN4@newnode

;  return new avlnode(key);
mov     ecx, 24
call    ??2@YAPEAX_K@Z         ; operator new
jmp     SHORT $LN27@newnode

;$LN4@newnode:
;else {
;  UINT64 pos;
;  if (freeaddr < memmap.size_bits)
mov     r9, QWORD PTR [rcx+40]
cmp     r9, QWORD PTR [rcx+32]
jae     SHORT $LN2@newnode

;    pos = freeaddr++;
lea     rax, QWORD PTR [r9+1]
mov     QWORD PTR [rcx+40], rax

;  else
jmp     SHORT $LN1@newnode
$LN2@newnode:

;    pos = memmap.get_first_unset();
add     rcx, 16
call    ?get_first_unset@bitlist@@QEAA_KXZ ; bitlist::get_first_unset
mov     r9, rax

$LN1@newnode:
; memmap.set_bit(pos);
mov     rcx, QWORD PTR [rbx+16]                    ;data[bindex(pos)] |= bmask(pos);
mov     rdx, r9                                    ;return pos / (sizeof(BITINT) * 8);
shr     rdx, 6
lea     r8, QWORD PTR [rcx+rdx*8]                  ;data[bindex(pos)] |= bmask(pos);
movzx   ecx, r9b                                   ;return 1ull << (pos % (sizeof(BITINT) * 8));
mov     edx, 1
and     cl, 63
shl     rdx, cl

;   return new (&buffer[pos]) avlnode(key);
lea     rcx, QWORD PTR [r9+r9*2]
; File c:\projects\vvd\vvd\util\bitlist.h
or      QWORD PTR [r8], rdx                        ;data[bindex(pos)] |= bmask(pos)

; 195  :     return new (&buffer[pos]) avlnode(key);
mov     rax, QWORD PTR [rbx+8]
lea     rax, QWORD PTR [rax+rcx*8]
; $LN27@newnode:
test    rax, rax
je      SHORT $LN9@newnode

; avlnode constructor; 
mov     BYTE PTR [rax+4], 1
mov     QWORD PTR [rax+8], 0
mov     QWORD PTR [rax+16], 0
mov     DWORD PTR [rax], edi

; 196  :   }
; 197  : }
; $LN9@newnode:
mov     rbx, QWORD PTR [rsp+48]
add     rsp, 32                        ; 00000020H
pop     rdi
ret     0
?newnode@avltree@@QEAAPEAUavlnode@@H@Z ENDP             ; avltree::newnode
_TEXT   ENDS

I have checked multiple times the output of compilation when I construct my avltree with default / custom allocator and it remains the same in this particular region of code. I have tried removing / replacing all relevant parts to no significant effect.

To be honest, I expected compiler to inline all of this since there are very few variables. I was hoping for everything except the avlnode objects themselves to be placed in registers, but that doesn't seem to be the case.

Yet the speed difference is clearly measurable. I'm not calling 3 seconds per 10 million nodes inserted slow, but I expected my code to be faster, not slower than generic allocator (2.5 seconds). That goes especially for the slower deallocator which is slower even when all code is stripped from it.

Why is it slower?

Edit: Thank you all for excellent thoughts on this. But I would like to stress again that the issue is not so much within my method of allocation as it is in suboptimal way of using the variables: the entire avltree class contains just 4 UINT64 variables, bitlist only has 3.

However, despite that, the compiler doesn't optimise this into registers. It insists on QWORD PTR instructions that are orders of magnitude slower. Is this because I'm using classes? Should I move to C / plain variables? Scratch that. Stupid me. I have all the avltree code in there as well, things can't be in registers.

Also, I am at a total loss why my deallocator would still be slower, even if I remove ALL code from it. Yet QueryPerformanceCounter tells me just that. It's insane to even think that: that same deallocator also gets called for the new / delete code path and it has to delete the node... It doesn't have to do anything for my custom allocator (when I strip the code).

Edit2: I have now completely removed the bitlist and implemented free space tracking through a singly-linked list. The avltree::newnode function is now much more compact (21 instructions for my custom allocator path, 7 of those are QWORD PTR ops dealing with avltree and 4 are used for constructor of avlnode). The end result (time) decreased from ~3 seconds to ~2.95 seconds for 10 million allocations.

Edit3: I also rewrote the entire code such that now everything is handled by the singly linked list. Now the avltree class only has two relevant members: root and first_free. The speed difference remains.

Edit4: Rearranging code and looking at performance figures, these things are what helped the most:

  1. As pointed out by all contributors, having a bitmap in there was just plain bad. Removed in favour of singly-linked free slot list.
  2. Code locality: by adding dependent functions (avl tree handling ones) into a function-local class instead of having them declared globally helped some 15% with code speed (3 secs --> 2.5 secs)
  3. avlnode struct size: just adding #pragma pack(1) before struct declaration decreased execution time a further 20% (2,5 secs --> 2 secs)

Edit 5:

Since this question seems to have been quite popular, I have posted the final complete code as an answer below. I'm quite satisfied with its performance.

Espousal answered 15/4, 2015 at 12:55 Comment(2)
You seem to be doing a linear search with get_first_unset once the buffer has been filled; this will hurt performance. The traditional data structure choice for this case would be a singly-linked free list with the next pointer overlaying the allocated data, thereby avoiding search. The standard allocator likely has a couple of such specialized pool allocators for fixed-length objects, though it ought to still suffer from some additional overhead to handle the general case (object size inference, locking, etc).Marmite
Yes, but for the benchmark the linear search isn't ever called because of freeaddr shortcut.Espousal
P
3

Your method only allocates the raw memory in one chunk and then has to do a placement new for each element. Combine that with all the overhead in your bitmap and it's not too surprising that the default new allocation beats yours assuming an empty heap.

To get the most speed improvement when allocating you can allocate the entire object in one large array and then assign to it from there. If you look at a very simple and contrived benchmark:

// Minimal benchmark node: the two payload fields are never read, they
// only give the struct a realistic size; pNext chains nodes into an
// intrusive singly-linked list.
struct test_t {
    float f;        // dummy payload
    int i;          // dummy payload
    test_t* pNext;  // intrusive list link
};

const size_t NUM_ALLOCS = 50000000;

void TestNew (void)
{
    test_t* pPtr = new test_t;

    for (int i = 0; i < NUM_ALLOCS; ++i)
    {
        pPtr->pNext = new test_t;
        pPtr = pPtr->pNext;
    }

}

void TestBucket (void)
{
    test_t* pBuckets = new test_t[NUM_ALLOCS + 2];
    test_t* pPtr = pBuckets++;

    for (int i = 0; i < NUM_ALLOCS; ++i)
    {
        pPtr->pNext = pBuckets++;
        pPtr = pPtr->pNext;
    }

}

With this code on MSVC++ 2013 with 50M allocations TestBucket() outperforms TestNew() by over a factor of x16 (130 vs 2080 ms). Even if you add a std::bitset<> to track allocations it is still x4 faster (400 ms).

An important thing to remember about new is that the time it takes to allocate an object generally depends on the state of the heap. An empty heap will be able to allocate a bunch of constant sized objects like this relatively fast, which is probably one reason why your code seems slower than new. If you have a program that runs for a while and allocates a large number of differently sized objects the heap can become fragmented and allocating objects can take much (much) longer.

As an example, one program I wrote was loading a 200MB file with millions of records...lots of differently sized allocations. On the first load it took ~15 seconds but if I deleted that file and tried to load it again it took something like x10-x20 longer. This was entirely due to memory allocation and switching to a simple bucket/arena allocator fixed the issue. So, that contrived benchmark I did showing a x16 speedup might actually get show a significantly larger difference with a fragmented heap.

It gets even more tricky when you realize that different systems/platforms may use different memory allocation schemes so the benchmark results on one system may be different from another.

To distil this into a few short points:

  1. Benchmarking memory allocation is tricky (performance depends on a lot of things)
  2. In some cases you can get better performance with a custom allocator. In a few cases you can get much better.
  3. Creating a custom allocator can be tricky and takes time to profile/benchmark your specific use case.

Note -- Benchmarks like this aren't meant to be realistic but are useful to determine the upper bound of how fast something can be. It can be used along with the profile/benchmark of your actual code to help determine what should/shouldn't be optimized.

Update -- I can't seem to replicate your results in my code under MSVC++ 2013. Using the same structure as your avlnode and trying a placement new yields the same speed as my non-placement bucket allocator tests (placement new was actually a little bit faster). Using a class similar to your avltree doesn't affect the benchmark much. With 10 million allocations/deallocations I'm getting ~800 ms for the new/delete and ~200ms for the custom allocator (both with and without placement new). While I'm not worried about the difference in absolute times, the relative time difference seems odd.

I would suggest taking a closer look at your benchmark and make sure you are measuring what you think you are. If the code exists in a larger code-base then create a minimal test case to benchmark it. Make sure that your compiler optimizer is not doing something that would invalidate the benchmark (it happens too easily these days).

Note that it would be far easier to answer your question if you had reduced it to a minimal example and included the complete code in the question, including the benchmark code. Benchmarking is one of those things that seems easy but there are a lot of "gotchas" involved in it.

Update 2 -- Including the basic allocator class and benchmark code I'm using so others can try to duplicate my results. Note that this is for testing only and is far from actual working/production code. It is far simpler than your code which may be why we're getting different results.

#include <string>
#include <Windows.h>

// Benchmark node shaped like the OP's avlnode: four 8-byte payload fields
// plus an intrusive list link used both for chaining live nodes and for
// the allocator's free list.
struct test_t 
{
    __int64 key;       // node key (set from the loop counter)
    __int64 weight;    // dummy payload
    __int64 left;      // dummy payload (stands in for a child pointer)
    __int64 right;     // dummy payload (stands in for a child pointer)
    test_t* pNext;     // Simple linked list

    test_t() : key(0), weight(0), pNext(NULL), left(0), right(0) { }
    test_t(const __int64 k) : key(k), weight(0), pNext(NULL), left(0), right(0) { }
};

const size_t NUM_ALLOCS = 10000000;
test_t* pLast; //To prevent compiler optimizations from being "smart"

// Minimal fixed-capacity pool allocator for test_t used by the benchmark.
// With Size == 0 it degrades to plain new/delete; otherwise nodes are
// placement-constructed inside one raw arena.
struct CTest 
{
    test_t* m_pBuffer;    // arena of m_MaxSize + 1 raw slots; NULL => heap mode
    size_t  m_MaxSize;    // usable slot count in the arena
    size_t  m_FreeIndex;  // next never-used arena slot (bump allocation)
    test_t* m_pFreeList;  // recycled arena slots, chained through pNext

    CTest(const size_t Size) :
            m_pBuffer(NULL),
            m_MaxSize(Size),
            m_pFreeList(NULL),
            m_FreeIndex(0)
    {
        // Raw bytes only — nodes are constructed later with placement new.
        if (m_MaxSize > 0) m_pBuffer = (test_t *) new char[sizeof(test_t) * (m_MaxSize + 1)];
    }

    // BUGFIX: the arena was never released. test_t has no destructor, so
    // freeing the raw bytes is sufficient.
    ~CTest()
    {
        delete[] (char *) m_pBuffer;
    }

    // The class owns raw memory — forbid copying (Rule of Five).
    CTest(const CTest&) = delete;
    CTest& operator=(const CTest&) = delete;

    test_t* NewNode(__int64 key)
    {
        // BUGFIX: recycle slots returned via DeleteNode. Previously the
        // free list was filled but never consumed, so freed slots leaked.
        if (m_pFreeList)
        {
            test_t* pNode = m_pFreeList;
            m_pFreeList = pNode->pNext;
            return new (pNode) test_t(key);
        }

        // No arena, or arena exhausted: fall back to the heap.
        if (!m_pBuffer || m_FreeIndex >= m_MaxSize) return new test_t(key);

        // Bump-allocate the next never-used slot.
        return new (&m_pBuffer[m_FreeIndex++]) test_t(key);
    }

    void DeleteNode(test_t* pNode)
    {
        // BUGFIX: only arena slots may go on the free list. Heap-allocated
        // nodes (buffer-less mode or overflow) must be deleted, or they
        // leak and the free list mixes incompatible storage.
        if (m_pBuffer && pNode >= m_pBuffer && pNode < m_pBuffer + m_MaxSize + 1)
        {
            pNode->pNext = m_pFreeList;
            m_pFreeList = pNode;
        }
        else
        {
            delete pNode;
        }
    }

};


void TestNew(void)
{
    test_t* pPtr = new test_t;
    test_t* pFirst = pPtr;

    for (int i = 0; i < NUM_ALLOCS; ++i)
    {
        pPtr->pNext = new test_t;
        pPtr = pPtr->pNext; 
    }

    pPtr = pFirst;

    while (pPtr)
    {
        test_t* pTemp = pPtr;
        pPtr = pPtr->pNext;
        delete pTemp;
    }

    pLast = pPtr;    
}


void TestClass(const size_t BufferSize)
{
    CTest Alloc(BufferSize);
    test_t* pPtr = Alloc.NewNode(0);
    test_t* pFirstPtr = pPtr;

    for (int i = 0; i < NUM_ALLOCS; ++i)
    {
        pPtr->pNext = Alloc.NewNode(i);
        pPtr = pPtr->pNext;
    }

    pLast = pPtr;
    pPtr = pFirstPtr;

    while (pPtr != NULL)
    {
        test_t* pTmp = pPtr->pNext;
        Alloc.DeleteNode(pPtr);
        pPtr = pTmp;
    }
}


int main(void)
{
    DWORD StartTick = GetTickCount();
    TestClass(0);
    //TestClass(NUM_ALLOCS + 10);
    //TestNew();
    DWORD EndTick = GetTickCount();

    printf("Time = %u ms\n", EndTick - StartTick);
    printf("Last = %p\n", pLast);

    return 0;
}

Currently I'm getting ~800ms for both TestNew() and TestClass(0) and under 200ms for TestClass(NUM_ALLOCS + 10). The custom allocator is pretty fast as it operates on the memory in a completely linear fashion which allows the memory cache to work its magic. I'm also using GetTickCount() for simplicity and it is accurate enough so long as times are above ~100ms.

Paronymous answered 15/4, 2015 at 14:42 Comment(8)
Why would placement new be inefficient? In the general case of complex constructors it is surely preferable to initializing the objects twice, and judging by the generated assembly it mostly seems to perform the member initialization you would expect. Admittedly the C++ semantics do force a silly NULL test in there, but that should hardly be fatal.Marmite
How performant allocating a bucket of complete objects like this definitely depends on the complexity of the object. In the example benchmark having to double-initialize the two members reduces the performance increase to x5 (420 vs 2120 ms). If you just default initialize this increases to x7.5 (260 vs 2090 ms). Will be adding short note on the purpose of benchmarks like this.Paronymous
Indeed, but what is the performance problem with the original bucketizing allocator which the OP posted? Beyond the silly choice of a bitmap for marking allocated entries anyway.Marmite
Ahh, good question. We would need to know the definition/implementation of avlnode and bitset along with how the OP was benchmarking the code.Paronymous
I also don't see why placement new would be worse than "ordinary" new. In fact, the same code is called in both cases (jmp short $LN27@newnode). The "ordinary" new has to call malloc first. As seen from assembly, avlnode has 4 members (key, weight, left, right) which are initialized to (key, 1, null, null). The bitset is an array of UINT64, setting / getting is performed through div and mod. But this is all beside the point as bitset is never even used for my benchmark as the original bucket just gets used up at the end of it. The get_first_unset (while slow) doesn't get called at all.Espousal
Oh, and there is no double construction: I just do a malloc to get the large bucket, not a new [].Espousal
See update...I can't replicate your results. This could just be differences between our platforms or it could mean one set of benchmarks is not quite correct.Paronymous
Thanks for the update. I'd love to see the "class similar to my avltree". Granted, my implementation is a textbook one without optimisations, but 200 ms for 10 million inserts / deletes ??? That's just awesome. I do have a min implementation here, but initially, together with bitlist it took about 400 lines of code, which is too much for SO I think. Even with bitlist out of the picture I'm still @ 300 lines. SO guys should buy Ideone or some similar service :)Espousal
D
2

It's hard to be certain with such little code to study, but I'm betting on locality of reference. Your bitmap with metadata is not on the same cacheline as the allocated memory itself. And get_first_unset might be a linear search.

Diena answered 15/4, 2015 at 13:3 Comment(2)
OK, I'll fix that and make a singly linked list of the unused entries. That won't however change my code performance. As explained, the bitmap doesn't even get accessed because of freeaddr path (which makes sure that the entire array is used up before starting any search for empty slots.Espousal
Final version of my code heavily takes this into account. I'm not really sure if I should accept your answer or uesp's one ?:-/Espousal
I
0

Now, I know how complex memory allocators are, there's no way that code can run faster than an array lookup + one bit set, but that is exactly the case here.

This isn't even nearly correct. A decent bucketing low fragmentation heap is O(1) with very low constant time (and effectively zero additional space overhead). I've seen a version that came down to ~18 asm instructions (with one branch) before. Thats a lot less than your code. Remember, heaps may be massively complex in total, but the fast path through them may be really, really fast.

Ingate answered 15/4, 2015 at 13:49 Comment(0)
E
0

Just for reference, the following code was the most performant for the problem at hand.

It's just a simple avltree implementation, but it does reach 1,7 secs for 10 million inserts and 1,4 secs for equal number of deletes on my 2600K @ 4.6 GHz.

#include "stdafx.h"
#include <iostream>
#include <crtdbg.h>
#include <Windows.h>
#include <malloc.h>
#include <new>

#ifndef NULL
#define NULL 0
#endif

typedef int keytype;
typedef unsigned long long UINT64;

struct avlnode;

// Fixed-capacity AVL tree. All node slots live in one malloc'ed arena and
// free slots are tracked by an intrusive singly-linked list (firstfree),
// reusing each free slot's `right` pointer as the link.
struct avltree
{
  avlnode *root;       // tree root; NULL when the tree is empty
  avlnode *buffer;     // arena backing store; NULL when not preallocated
  avlnode *firstfree;  // head of the free-slot list

  avltree() : avltree(0) {};  // delegating: empty tree, no arena
  avltree(UINT64 numitems);   // preallocate numitems node slots

  inline avlnode *newnode(keytype key);   // pop a slot off the free list
  inline void deletenode(avlnode *node);  // push a slot back on the free list

  void insert(keytype key) { root = insert(root, key); }
  void remove(keytype key) { root = remove(root, key); }
  int height();  // height of the root node, 0 for an empty tree
  bool hasitems() { return root != NULL; }
private:
  // Recursive workers; each returns the (possibly new) subtree root.
  avlnode *insert(avlnode *node, keytype k);
  avlnode *remove(avlnode *node, keytype k);
};

#pragma pack(1)
// AVL tree node, packed to 21 bytes (see Edit4 item 3 in the question).
struct avlnode
{
  avlnode *left;     //left pointer
  avlnode *right;    //right pointer
  keytype key;       //node key
  unsigned char hgt; //height of the node

  avlnode(int k)
  {
    key = k;
    left = right = NULL;
    hgt = 1;
  }

  // Rebalance this node and return the (possibly new) subtree root.
  // BUGFIX: the original helpers took avlnode& formed by dereferencing
  // possibly-NULL pointers and then tested `&node` for null — undefined
  // behavior that modern optimizers are entitled to delete. The helpers
  // now take pointers and test them directly; behavior is otherwise
  // unchanged.
  avlnode &balance()
  {
    struct F
    {
      // Height of a possibly-absent subtree (0 for NULL).
      static unsigned char height(avlnode *node)
      {
        return node ? node->hgt : 0;
      }
      // Balance factor: right height minus left height (0 for NULL).
      static int balance(avlnode *node)
      {
        return node ? height(node->right) - height(node->left) : 0;
      }
      // Recompute node.hgt from the children; returns the balance factor.
      static int fixheight(avlnode &node)
      {
        unsigned char hl = height(node.left);
        unsigned char hr = height(node.right);
        node.hgt = (hl > hr ? hl : hr) + 1;
        return hr - hl;
      }
      // Single left rotation. Caller guarantees node.right != NULL.
      static avlnode &rotateleft(avlnode &node)
      {
        avlnode &p = *node.right;
        node.right = p.left;
        p.left = &node;
        fixheight(node);
        fixheight(p);
        return p;
      }
      // Single right rotation. Caller guarantees node.left != NULL.
      static avlnode &rotateright(avlnode &node)
      {
        avlnode &q = *node.left;
        node.left = q.right;
        q.right = &node;
        fixheight(node);
        fixheight(q);
        return q;
      }
      static avlnode &b(avlnode &node)
      {
        int bal = fixheight(node);
        if (bal == 2) {  // right-heavy: a factor of +2 implies node.right != NULL
          if (balance(node.right) < 0)
            node.right = &rotateright(*node.right);  // right-left case
          return rotateleft(node);
        }
        if (bal == -2) {  // left-heavy: a factor of -2 implies node.left != NULL
          if (balance(node.left) > 0)
            node.left = &rotateleft(*node.left);     // left-right case
          return rotateright(node);
        }
        return node; // balancing is not required
      }
    };
    return F::b(*this);
  }
};

avltree::avltree(UINT64 numitems)
{
  // Builds an empty tree; when numitems > 0 a single arena of numitems + 1
  // slots is reserved and all slots are threaded onto the free list.
  // Slot buffer[0] is intentionally left unused so the fill loop can
  // terminate with a simple `tmp > buffer` comparison.
  root = buffer = firstfree = NULL;
  if (numitems) {
    buffer = (avlnode *) malloc(sizeof(avlnode) * (numitems + 1));
    if (!buffer)
      return;  // BUGFIX: malloc was unchecked; on OOM stay an empty tree
    // Push slots numitems down to 1, so the list hands them back out in
    // ascending address order.
    avlnode *tmp = &buffer[numitems];
    while (tmp > buffer) {
      tmp->right = firstfree;  // a free slot's `right` doubles as the link
      firstfree = tmp--;
    }
  }
}

avlnode *avltree::newnode(keytype key)
{
  // Pop the next slot off the free list and construct the node in place.
  // NOTE(review): if the arena is exhausted (or was never allocated),
  // firstfree is NULL and `firstfree->right` below dereferences NULL;
  // the commented-out fallback guards against that at a measured cost.
  avlnode *node = firstfree;
  /*
  If you want to support dynamic allocation, uncomment this.
  It does present a bit of an overhead for bucket allocation though (8% slower)
  Also, if a condition is met where bucket is too small, new nodes will be dynamically allocated, but never freed
  if (!node)
  return new avlnode(key);
  */
  firstfree = firstfree->right;
  return new (node) avlnode(key);
}

void avltree::deletenode(avlnode *node)
{
  // Push the slot back onto the free list; no destructor runs (avlnode
  // declares none) and the memory is reused by a later newnode.
  /*
  If you want to support dynamic allocation, uncomment this.
  if (!buffer)
  delete node;
  else {
  */
  node->right = firstfree;  // a free slot's `right` doubles as the link
  firstfree = node;
}

// Height of the whole tree; an empty tree reports 0.
int avltree::height()
{
  if (!root)
    return 0;
  return root->hgt;
}

// Recursive insertion; duplicate keys leave the tree unchanged.
// Returns the root of the rebalanced subtree.
avlnode *avltree::insert(avlnode *node, keytype k)
{
  if (!node)
    return newnode(k);  // empty spot found: place the new node here
  if (k < node->key)
    node->left = insert(node->left, k);
  else if (k > node->key)
    node->right = insert(node->right, k);
  else
    return node;        // duplicate: no structural change, no rebalance
  return &node->balance();
}

// Recursive removal of key k; returns the root of the rebalanced subtree.
avlnode *avltree::remove(avlnode *node, keytype k) // deleting k key from p tree
{
  if (!node)
    return NULL;  // key not present
  if (k < node->key)
    node->left = remove(node->left, k);
  else if (k > node->key)
    node->right = remove(node->right, k);
  else //  k == p->key 
  {
    // Found the node: detach its children, recycle its slot, then graft
    // the successor (minimum of the right subtree) into its place.
    avlnode *l = node->left;
    avlnode *r = node->right;
    deletenode(node);
    if (!r) return l;  // no right subtree: left child replaces the node

    struct F
    {
      //findmin finds the minimum node
      avlnode &findmin(avlnode *node)
      {
        return node->left ? findmin(node->left) : *node;
      }
      //removemin removes the minimum node
      // NOTE(review): when the minimum has no right child this returns
      // *node.right with node.right == NULL — a reference formed from a
      // NULL pointer (undefined behavior, though it "works" here because
      // the result is only converted back to a pointer via &).
      avlnode &removemin(avlnode &node)
      {
        if (!node.left)
          return *node.right;
        node.left = &removemin(*node.left);
        return node.balance();
      }
    } f;

    // Successor becomes the new subtree root: it keeps the right subtree
    // minus itself, and adopts the old left subtree.
    avlnode &min = f.findmin(r);
    min.right = &f.removemin(*r);
    min.left = l;
    return &min.balance();
  }
  return &node->balance();  // rebalance on the way back up
}
using namespace std;

// Benchmark driver: times 10M ordered insertions, then deletion of the
// root until the tree is empty, using QueryPerformanceCounter.
int _tmain(int argc, _TCHAR* argv[])
{
  // 64 bit release performance (for 10.000.000 nodes)
  // malloc:       insertion: 2,595  deletion 1,865
  // my allocator: insertion: 2,980  deletion 2,270
  const int nodescount = 10000000;

  // BUGFIX: the original wrote `avltree &tree = avltree(nodescount);`,
  // binding a non-const lvalue reference to a temporary — ill-formed
  // standard C++ (accepted only as an MSVC extension). A plain local
  // object is correct and equivalent.
  avltree tree(nodescount);
  cout << "sizeof avlnode " << sizeof(avlnode) << endl;
  cout << "inserting " << nodescount << " nodes" << endl;

  LARGE_INTEGER t1, t2, freq;
  QueryPerformanceFrequency(&freq);

  // Timed insertion of keys 1..nodescount (ascending order).
  QueryPerformanceCounter(&t1);
  for (int i = 1; i <= nodescount; i++)
    tree.insert(i);
  QueryPerformanceCounter(&t2);
  cout << "Tree height " << (int) tree.height() << endl;
  cout << "Insertion time: " << ((double) t2.QuadPart - t1.QuadPart) / freq.QuadPart << " s" << endl;

  // Timed deletion: repeatedly remove the current root key until empty.
  QueryPerformanceCounter(&t1);
  while (tree.hasitems())
    tree.remove(tree.root->key);
  QueryPerformanceCounter(&t2);
  cout << "Deletion time: " << ((double) t2.QuadPart - t1.QuadPart) / freq.QuadPart << " s" << endl;

#ifdef _DEBUG
  _CrtMemState mem;
  _CrtMemCheckpoint(&mem);
  cout << "Memory used: " << mem.lTotalCount << " high: " << mem.lHighWaterCount << endl;
#endif
  return 0;
}
Espousal answered 17/4, 2015 at 19:57 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.