I did it myself and implemented a thread-safe "shared_ptr" with DW-CAS (double-width compare-and-swap).
I'm using split reference counting: a counter in the second DW-CAS word records the number of pending increments that will shortly be applied to the control block. So far this is no different from other implementations. The problem with those implementations is ABA: after a thread increments the counter beside the pointer, the pointer may be overwritten several times and finally end up with the same value as before, while the counter holds contributions from other threads that are still about to increment the counter in the control block. So the first incrementer may be scheduled away and come back to find the same pointer re-established, but with an update counter belonging to other threads. That is the point where other implementations fail.
So my solution was to make this counter only 32 bits wide and use the upper 32 bits as a sequence counter that is incremented whenever the pointer is overwritten. When a thread has updated the control block's counter and comes back to decrement the update counter beside the pointer, it can tell whether that update counter is still the one it incremented before.
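To illustrate the layout of the two DW-CAS words (a sketch matching the UPDATER_/SEQUENCE_ constants in the code below):
// DW-CAS state on a 64-bit machine (two adjacent 64-bit words):
// word 0: ctrl_t *  -- pointer to the control block (object + m_refCount)
// word 1: [ sequence : 32 | updaters : 32 ]
//   updaters: number of threads that announced an increment of the control
//             block's m_refCount but haven't applied it yet
//   sequence: bumped on every pointer replacement, so a returning updater can
//             detect that the "same" pointer actually belongs to a new epoch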
This is my code:
#pragma once
#include <atomic>
#include <utility>
#include <cassert>
#include <stdexcept>
#include <type_traits>
#include <new>
#include "dcas_atomic.h"
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) || defined(__clang__)
#include <x86intrin.h>
#endif
template<typename T>
struct shared_obj;
template<typename T, typename ... Args>
shared_obj<T> make_shared_obj( Args &&... args );
template<typename T>
struct tshared_obj;
template<typename T>
struct shared_obj
{
shared_obj() noexcept;
shared_obj( shared_obj && ) noexcept;
shared_obj( shared_obj const & ) noexcept;
shared_obj( tshared_obj<T> const & ) noexcept;
shared_obj( tshared_obj<T> && ) noexcept;
shared_obj &operator =( shared_obj && ) noexcept;
shared_obj &operator =( shared_obj const & ) noexcept;
shared_obj &operator =( tshared_obj<T> const & ) noexcept;
shared_obj &operator =( tshared_obj<T> && ) noexcept;
~shared_obj();
T *get() const noexcept;
operator bool() const noexcept;
size_t use_count() const noexcept;
T &operator *() const noexcept;
T *operator ->() const noexcept;
void reset() noexcept;
private:
template<typename T2, typename ... Args>
friend shared_obj<T2> make_shared_obj( Args &&... args );
template<typename T2>
friend struct tshared_obj;
struct ctrl_t
{
template<typename ... Args>
ctrl_t( Args &&... args );
T m_obj;
std::atomic<size_t> m_refCount;
};
shared_obj( ctrl_t *obj );
ctrl_t *m_ctrl;
};
template<typename T>
inline shared_obj<T>::shared_obj() noexcept :
m_ctrl( nullptr )
{
}
template<typename T>
inline shared_obj<T>::shared_obj( shared_obj &&other ) noexcept :
m_ctrl( other.m_ctrl )
{
other.m_ctrl = nullptr;
}
template<typename T>
inline shared_obj<T>::shared_obj( shared_obj const &other ) noexcept :
m_ctrl( other.m_ctrl )
{
// guard against copying an empty shared_obj
if( m_ctrl )
{
size_t before = m_ctrl->m_refCount.fetch_add( 1, std::memory_order_relaxed );
assert(before != (size_t)-1);
}
}
template<typename T>
template<typename ... Args>
inline shared_obj<T>::ctrl_t::ctrl_t( Args &&... args ) :
m_obj( std::forward<Args>( args ) ... ),
m_refCount( 1 )
{
}
template<typename T>
shared_obj<T> &shared_obj<T>::operator =( shared_obj &&other ) noexcept
{
// both point to the same object (or both are empty) ?
if( m_ctrl == other.m_ctrl ) [[unlikely]]
{
// fold other's reference into ours
if( m_ctrl )
{
other.m_ctrl = nullptr;
size_t before = m_ctrl->m_refCount.fetch_sub( 1, std::memory_order_relaxed );
assert(before > 1);
}
return *this;
}
// delete the previous object if we held the last reference
// (acq_rel: also acquire the other owners' writes before deleting)
if( m_ctrl && m_ctrl->m_refCount.fetch_sub( 1, std::memory_order_acq_rel ) == 1 )
delete m_ctrl;
// move other's object pointer
m_ctrl = other.m_ctrl;
other.m_ctrl = nullptr;
return *this;
}
template<typename T>
shared_obj<T> &shared_obj<T>::operator =( shared_obj const &other ) noexcept
{
// our and other's pointers differ ?
if( m_ctrl != other.m_ctrl ) [[likely]]
{
// yes: release our current object
this->~shared_obj();
// copy other's object pointer
new( (void *)this ) shared_obj<T>( other.m_ctrl );
// take a reference on the newly shared object, if any
if( m_ctrl )
m_ctrl->m_refCount.fetch_add( 1, std::memory_order_relaxed );
}
return *this;
}
template<typename T>
shared_obj<T>::~shared_obj()
{
// we have a managed object and we're the only one pointing to it ?
if( m_ctrl && m_ctrl->m_refCount.fetch_sub( 1, std::memory_order_acq_rel ) == 1 )
// yes: delete the object
delete m_ctrl;
}
template<typename T>
inline T *shared_obj<T>::get() const noexcept
{
return m_ctrl ? &m_ctrl->m_obj : nullptr;
}
template<typename T>
inline shared_obj<T>::operator bool() const noexcept
{
return (bool)m_ctrl;
}
template<typename T>
inline size_t shared_obj<T>::use_count() const noexcept
{
// we've a managed object ?
if( m_ctrl ) [[likely]]
// yes: return its use count
return m_ctrl->m_refCount.load( std::memory_order_relaxed );
// no: use-count is zero
return 0;
}
template<typename T>
inline shared_obj<T>::shared_obj( ctrl_t *obj ) :
m_ctrl( obj )
{
}
template<typename T>
inline T &shared_obj<T>::operator *() const noexcept
{
assert(m_ctrl);
return m_ctrl->m_obj;
}
template<typename T>
inline T *shared_obj<T>::operator ->() const noexcept
{
assert(m_ctrl);
return &m_ctrl->m_obj;
}
template<typename T>
inline void shared_obj<T>::reset() noexcept
{
this->~shared_obj();
new( (void *)this ) shared_obj<T>( nullptr );
}
template<typename T, typename ... Args>
shared_obj<T> make_shared_obj( Args &&... args )
{
return shared_obj<T>( new shared_obj<T>::ctrl_t( std::forward<Args>( args ) ... ) );
}
template<typename T>
struct tshared_obj
{
tshared_obj() noexcept;
tshared_obj( shared_obj<T> const & ) noexcept;
tshared_obj( shared_obj<T> && ) noexcept;
~tshared_obj();
tshared_obj &operator =( shared_obj<T> const & ) noexcept;
tshared_obj &operator =( shared_obj<T> && ) noexcept;
void reset() noexcept;
private:
template<typename T2>
friend struct shared_obj;
using ctrl_t = shared_obj<T>::ctrl_t;
using pair_t = dcas_atomic::pair_t;
static constexpr unsigned
SZT_BITS = sizeof(size_t) * 8,
UPDATER_BITS = SZT_BITS / 2,
SEQUENCE_BITS = SZT_BITS / 2;
static constexpr size_t
UPDATER_VALUE = 1,
SEQUENCE_VALUE = (size_t)1 << UPDATER_BITS,
UPDATER_MASK = SEQUENCE_VALUE - 1,
SEQUENCE_MASK = ~UPDATER_MASK;
mutable dcas_atomic m_dca;
ctrl_t *copy() const noexcept;
ctrl_t *remove() noexcept;
void store( ctrl_t *obj ) noexcept;
};
template<typename T>
shared_obj<T>::shared_obj( tshared_obj<T> const &ts ) noexcept :
m_ctrl( ts.copy() )
{
}
template<typename T>
shared_obj<T>::shared_obj( tshared_obj<T> &&ts ) noexcept :
m_ctrl( ts.remove() )
{
}
template<typename T>
shared_obj<T> &shared_obj<T>::operator =( tshared_obj<T> const &ts ) noexcept
{
using namespace std;
// important optimization for RCU-like patterns:
// do we already manage the same object as the thread-safe pointer ?
if( (size_t)m_ctrl == ts.m_dca.first().load( memory_order_relaxed ) ) [[likely]]
// yes: nothing to do
return *this;
// erase our object pointer
this->~shared_obj();
// copy() increments the reference count for ourself
new( (void *)this ) shared_obj<T>( ts.copy() );
return *this;
}
template<typename T>
shared_obj<T> &shared_obj<T>::operator =( tshared_obj<T> &&ts ) noexcept
{
// erase our object pointer
this->~shared_obj();
// move ts' object-pointer
new( (void *)this ) shared_obj<T>( ts.remove() );
return *this;
}
template<typename T>
inline tshared_obj<T>::tshared_obj() noexcept :
m_dca( pair_t( 0, 0 ) )
{
}
template<typename T>
tshared_obj<T>::~tshared_obj()
{
// pointer to managed object
ctrl_t *obj = (ctrl_t *)m_dca.first().load( std::memory_order_relaxed );
// no managed object ?
if( !obj )
// yes: nothing to do
return;
// there must be no current updaters
assert(!(m_dca.second() & UPDATER_MASK));
// decrement reference count of the managed object
// (acq_rel so the delete below is properly ordered)
size_t before = obj->m_refCount.fetch_sub( 1, std::memory_order_acq_rel );
assert(before > 0);
// we're the only one pointing to the managed object ?
if( before == 1 )
// yes: delete the object
delete obj;
}
template<typename T>
inline tshared_obj<T>::tshared_obj( shared_obj<T> const &so ) noexcept :
m_dca( pair_t( (size_t)so.m_ctrl, 0 ) )
{
// increment so's reference count if there's a managed object
if( so.m_ctrl )
{
size_t before = so.m_ctrl->m_refCount.fetch_add( 1, std::memory_order_relaxed );
assert(before > 0 && before != (size_t)-1);
}
}
template<typename T>
inline tshared_obj<T>::tshared_obj( shared_obj<T> &&so ) noexcept :
m_dca( pair_t( (size_t)so.m_ctrl, 0 ) )
{
so.m_ctrl = nullptr;
}
template<typename T>
tshared_obj<T> &tshared_obj<T>::operator =( shared_obj<T> const &so ) noexcept
{
// pointer to so's object
ctrl_t *obj = so.m_ctrl;
// we're pointing to the same object ?
if( (size_t)obj == m_dca.first().load( std::memory_order_relaxed ) )
// yes: nothing to do
return *this;
// increment reference count if there's a managed object
if( obj )
{
size_t before = obj->m_refCount.fetch_add( 1, std::memory_order_relaxed );
assert(before && before != (size_t)-1);
}
// store object pointer, maybe deallocate old
store( obj );
return *this;
}
template<typename T>
tshared_obj<T> &tshared_obj<T>::operator =( shared_obj<T> &&so ) noexcept
{
// move so's object pointer
ctrl_t *obj = so.m_ctrl;
so.m_ctrl = nullptr;
// store object pointer, maybe deallocate old
store( obj );
return *this;
}
template<typename T>
inline void tshared_obj<T>::reset() noexcept
{
using namespace std;
// remove object pointer
ctrl_t *obj = remove();
// there was no object ?
if( !obj ) [[unlikely]]
// yes: nothing further to do
return;
// decrement reference count (acq_rel so the delete below is ordered)
size_t before = obj->m_refCount.fetch_sub( 1, memory_order_acq_rel );
assert(before > 0);
// we were the only pointing to obj ?
if( before == 1 ) [[unlikely]]
// yes: delete the object
delete obj;
}
// Copies the content of the atomic shared object, returning a control-block pointer.
// Proceeds in three steps:
// 1. Increment the update counter beside the pointer.
// 2. Increment the reference counter in the control block.
// 3. Decrement the update counter if neither pointer nor sequence counter has changed.
// The reference counter has been incremented on behalf of the caller.
template<typename T>
typename tshared_obj<T>::ctrl_t *tshared_obj<T>::copy() const noexcept
{
using namespace std;
// increment updater value of our DCAS'd pointer
pair_t expected( m_dca.first().load( memory_order_relaxed ), m_dca.second().load( memory_order_relaxed ) ), desired;
do [[unlikely]]
// there's a managed object ?
if( expected.first ) [[likely]]
// yes: desired is the same except for one more updater
desired = { expected.first, expected.second + UPDATER_VALUE };
else
// no: nothing to do, return no managed object
return nullptr;
while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
// pointer to managed object
ctrl_t *obj = (ctrl_t *)desired.first;
// increment reference count
size_t rcBefore = obj->m_refCount.fetch_add( 1, memory_order_acquire );
assert(rcBefore != (size_t)-1);
size_t
// remember managed pointer, it must be the same with the next CMPXCHG
ptr = desired.first,
// remember managed sequence counter, it must be the same with the next CMPXCHG
seq = desired.second >> UPDATER_BITS;
// expect that the DCAS atomic still holds the value we last stored
expected = desired;
// make desired.second decrement the updater counter
auto updateCtr = [&]
{
assert((expected.second & UPDATER_MASK) >= UPDATER_VALUE);
desired.second = expected.second - UPDATER_VALUE;
};
updateCtr();
// retry while decrementing the updater counter fails
while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) ) [[unlikely]]
// pointer and sequence counter still the same ?
if( expected.first == ptr && expected.second >> UPDATER_BITS == seq ) [[likely]]
// yes: recompute the decremented updater count and retry
updateCtr();
else
// no: new managed object, nothing more to do
break;
return obj;
}
// Removes the atomic pointer, preparing it to be moved from.
// The reference count is carried over to the returned pointer.
template<typename T>
inline typename tshared_obj<T>::ctrl_t *tshared_obj<T>::remove() noexcept
{
using namespace std;
pair_t expected( m_dca.first().load( memory_order_relaxed ), m_dca.second().load( memory_order_relaxed ) ), desired;
desired.first = 0;
do [[unlikely]]
{
if( !expected.first ) [[unlikely]]
return nullptr;
desired.second = (expected.second + SEQUENCE_VALUE) & SEQUENCE_MASK;
} while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
return (ctrl_t *)expected.first;
}
// Replaces the atomic pointer and deletes the old object if there are no pending updaters.
// The new object's reference count must already include the reference being stored here.
// If the old object's reference count drops to zero, the object is destroyed.
template<typename T>
inline void tshared_obj<T>::store( ctrl_t *obj ) noexcept
{
using namespace std;
pair_t
// set the expected value to the current state of the DCA (maybe split and not consistent)
expected( m_dca.first().load( memory_order_relaxed ), m_dca.second().load( memory_order_relaxed ) ),
// the desired object pointer should be "obj"
desired( (size_t)obj, 0 );
// set new object pointer and counters
do
// are we still pointing to a different object than "obj" ?
if( expected.first != desired.first ) [[likely]]
// yes: bump the sequence counter and clear the updater count
desired.second = (expected.second + SEQUENCE_VALUE) & SEQUENCE_MASK;
else
{
// already storing "obj": drop the extra reference taken for it, if any
if( obj )
{
size_t before = obj->m_refCount.fetch_sub( 1, memory_order_relaxed );
assert(before > 1);
}
return;
}
while( !m_dca.compare_exchange( expected, desired, dcas_release() ) );
// pointer to object before
ctrl_t *oldObj = (ctrl_t *)expected.first;
// empty: nothing to delete
if( !oldObj ) [[unlikely]]
return;
// decrement the old object's reference counter (acq_rel so deletion is ordered)
size_t before = oldObj->m_refCount.fetch_sub( 1, std::memory_order_acq_rel );
assert(before);
// last reference and no updaters pending on the old pointer ?
if( before == 1 && !(expected.second & UPDATER_MASK) ) [[unlikely]]
// yes: delete object
delete oldObj;
}
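Usage is RCU-like: writers publish new states through a tshared_obj and readers take cheap snapshots into plain shared_obj instances. A minimal sketch of how this is meant to be used (the config type, thread counts, and iteration counts are made up for illustration, and the header name "shared_obj.h" is assumed):
#include <thread>
#include <vector>
#include "shared_obj.h" // the code above; header name assumed
struct config { int value; }; // hypothetical payload
tshared_obj<config> g_cfg; // shared between threads
void reader()
{
for( int i = 0; i != 1'000'000; ++i )
{
shared_obj<config> snap( g_cfg ); // lock-free snapshot via copy()
if( snap )
(void)snap->value; // work with a stable snapshot
}
}
void writer()
{
for( int i = 0; i != 1'000; ++i )
g_cfg = make_shared_obj<config>( i ); // publish a new state via store()
}
int main()
{
g_cfg = make_shared_obj<config>( 0 );
std::vector<std::thread> threads;
for( int i = 0; i != 4; ++i )
threads.emplace_back( reader );
threads.emplace_back( writer );
for( std::thread &t : threads )
t.join();
}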
This is the DW-CAS header dcas_atomic.h:
#pragma once
#include <cstdint>
#include <utility>
#include <atomic>
#include <type_traits>
#include <climits>
#include <bit>
#include <memory>
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 26495)
#endif
#if defined(__llvm__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Watomic-alignment"
#endif
constexpr int
DCAS_ACQ_REL = (int)std::memory_order::acq_rel,
DCAS_ACQUIRE = (int)std::memory_order::acquire,
DCAS_RELAXED = (int)std::memory_order::relaxed,
DCAS_RELEASE = (int)std::memory_order::release,
DCAS_SEQ_CST = (int)std::memory_order::seq_cst;
template<int Type>
using dcas_type = std::integral_constant<int, Type>;
using dcas_acq_rel = dcas_type<DCAS_ACQ_REL>;
using dcas_acquire = dcas_type<DCAS_ACQUIRE>;
using dcas_relaxed = dcas_type<DCAS_RELAXED>;
using dcas_release = dcas_type<DCAS_RELEASE>;
using dcas_seq_cst = dcas_type<DCAS_SEQ_CST>;
struct dcas_atomic
{
static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4, "must be 64 or 32 bit architecture");
struct pair_t { size_t first, second; };
dcas_atomic() = default;
dcas_atomic( pair_t const &desired );
std::atomic<size_t> &first() noexcept;
std::atomic<size_t> &second() noexcept;
operator pair_t() noexcept;
template<int Type>
bool compare_exchange( pair_t &expected, pair_t const &desired, dcas_type<Type> = dcas_seq_cst() ) noexcept;
template<int Type>
pair_t load( dcas_type<Type> = dcas_seq_cst() ) const noexcept;
template<int Type>
void store( pair_t const &niu, dcas_type<Type> = dcas_seq_cst() ) noexcept;
private:
#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
#if SIZE_WIDTH == 64
using dword_t = unsigned __int128;
#elif SIZE_WIDTH == 32
using dword_t = unsigned long long;
#else
#error unknown architecture
#endif
#endif
union alignas(2 * sizeof(size_t)) U
{
U() {}
static_assert(sizeof(std::atomic<size_t>) == sizeof(size_t), "sizeof(atomic<size_t>) must be == sizeof(size_t)");
std::atomic<size_t> m_atomics[2];
#if defined(_MSC_VER)
#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
__int64 volatile m_firstAndSecond[2];
#elif defined(_M_IX86)
__int64 volatile m_firstAndSecond;
#else
#error unknown architecture
#endif
#elif defined(__GNUC__) || defined(__clang__)
dword_t volatile m_firstAndSecond;
#endif
} u;
};
inline dcas_atomic::dcas_atomic( pair_t const &desired )
{
u.m_atomics[0].store( desired.first, std::memory_order_relaxed );
u.m_atomics[1].store( desired.second, std::memory_order_relaxed );
}
inline std::atomic<size_t> &dcas_atomic::first() noexcept
{
return u.m_atomics[0];
}
inline std::atomic<size_t> &dcas_atomic::second() noexcept
{
return u.m_atomics[1];
}
inline dcas_atomic::operator pair_t() noexcept
{
return pair_t( first().load( std::memory_order_relaxed ), second().load( std::memory_order_relaxed ) );
}
template<int Type>
inline bool dcas_atomic::compare_exchange( pair_t &expected, pair_t const &desired, dcas_type<Type> ) noexcept
{
using namespace std;
static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
#if defined(_MSC_VER)
#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
__int64 expectedA[2];
expectedA[0] = (__int64)expected.first;
expectedA[1] = (__int64)expected.second;
char fRet;
#if defined(_M_X64)
fRet = _InterlockedCompareExchange128( u.m_firstAndSecond, desired.second, desired.first, expectedA );
#else
if constexpr( Type == DCAS_ACQ_REL || Type == DCAS_SEQ_CST )
fRet = _InterlockedCompareExchange128( u.m_firstAndSecond, desired.second, desired.first, expectedA );
else if constexpr( Type == DCAS_ACQUIRE )
fRet = _InterlockedCompareExchange128_acq( u.m_firstAndSecond, desired.second, desired.first, expectedA );
else if constexpr( Type == DCAS_RELAXED )
fRet = _InterlockedCompareExchange128_nf( u.m_firstAndSecond, desired.second, desired.first, expectedA );
else
fRet = _InterlockedCompareExchange128_rel( u.m_firstAndSecond, desired.second, desired.first, expectedA );
#endif
if( fRet )
return true;
expected.first = expectedA[0];
expected.second = expectedA[1];
return false;
#elif defined(_M_IX86)
auto compose = []( pair_t const &p ) -> uint64_t { return p.first | (uint64_t)p.second << 32; };
uint64_t
cDesired = compose( desired ),
cExpected = compose( expected ),
cResult = _InterlockedCompareExchange64( &u.m_firstAndSecond, cDesired, cExpected );
if( cResult == cExpected ) [[likely]]
return true;
expected.first = (uint32_t)cResult;
expected.second = (uint32_t)(cResult >> 32);
return false;
#else
#error unsupported Windows platform
#endif
#elif defined(__GNUC__) || defined(__clang__)
constexpr auto
pair_t::*FIRST = std::endian::native == std::endian::little ? &pair_t::first : &pair_t::second,
pair_t::*SECOND = std::endian::native == std::endian::little ? &pair_t::second : &pair_t::first;
auto compose = []( pair_t const &p ) -> dword_t { return (dword_t)(p.*FIRST) | (dword_t)(p.*SECOND) << SIZE_WIDTH; };
dword_t
dwExpected = compose( expected ),
dwDesired = compose( desired );
bool ret;
if constexpr( Type == DCAS_ACQ_REL )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED );
else if constexpr( Type == DCAS_ACQUIRE )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED );
else if constexpr( Type == DCAS_RELAXED )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED );
else if constexpr( Type == DCAS_RELEASE )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED );
else
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED );
if( ret ) [[likely]]
return true;
expected.*FIRST = (size_t)dwExpected;
expected.*SECOND = (size_t)(dwExpected >> SIZE_WIDTH);
return false;
#else
#error unsupported platform
#endif
}
template<int Type>
inline typename dcas_atomic::pair_t dcas_atomic::load( dcas_type<Type> ) const noexcept
{
using namespace std;
static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
pair_t cmp( 0, 0 );
const_cast<dcas_atomic *>(this)->compare_exchange( cmp, cmp, dcas_type<Type>() );
return cmp;
}
template<int Type>
inline void dcas_atomic::store( pair_t const &niu, dcas_type<Type> ) noexcept
{
using namespace std;
static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
pair_t cmp( *this );
while( !this->compare_exchange( cmp, niu, dcas_type<Type>() ) );
}
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#if defined(__llvm__)
#pragma clang diagnostic pop
#endif
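For reference, a minimal standalone sketch of using dcas_atomic directly (a toy counter-plus-generation pair, not part of the implementation above):
#include <cstdio>
#include "dcas_atomic.h" // the header above
int main()
{
dcas_atomic dca( dcas_atomic::pair_t( 0, 0 ) );
// read the current pair, then try to publish { value + 1, generation + 1 }
dcas_atomic::pair_t expected = dca.load( dcas_relaxed() ), desired;
do
desired = { expected.first + 1, expected.second + 1 };
while( !dca.compare_exchange( expected, desired, dcas_release() ) );
std::printf( "%zu %zu\n", dca.first().load(), dca.second().load() );
}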
This solution is slightly faster than MSVC's futex-based solution and several times faster than the current libstdc++ and libc++ implementations.
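Copy throughput can be measured roughly like this (a sketch of the kind of benchmark, not the exact harness; the header name and iteration count are assumptions):
#include <atomic>
#include <chrono>
#include <cstdio>
#include <memory>
#include "shared_obj.h" // the code above; header name assumed
int main()
{
using namespace std::chrono;
constexpr int N = 10'000'000;
// baseline: the standard library's atomic<shared_ptr<T>>
std::atomic<std::shared_ptr<int>> asp( std::make_shared<int>( 0 ) );
auto start = high_resolution_clock::now();
for( int i = 0; i != N; ++i )
std::shared_ptr<int> snap = asp.load();
double nsStd = duration_cast<duration<double, std::nano>>( high_resolution_clock::now() - start ).count() / N;
// this implementation: snapshot a tshared_obj into a shared_obj
tshared_obj<int> tso;
tso = make_shared_obj<int>( 0 );
start = high_resolution_clock::now();
for( int i = 0; i != N; ++i )
shared_obj<int> snap( tso ); // two DW-CAS + one fetch_add per copy
double nsDca = duration_cast<duration<double, std::nano>>( high_resolution_clock::now() - start ).count() / N;
std::printf( "std: %.1f ns/copy, DW-CAS: %.1f ns/copy\n", nsStd, nsDca );
}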