The way std::variant
dispatches to different visitor methods when std::visit
is called is pretty reasonable when the variant alternatives are completely different types. Essentially a visitor-specific vtable
is built at compile-time and after some error checking1 the appropriate visitor function is looked by indexing the table based the current index()
which resolves to something like an indirect jump on most platforms.
If the alternatives share a common base class, however, calling a (non-virtual) member function or accessing state on the base class with a visitor is conceptually much simpler: you are always calling the same method and usually using the same pointer2 to the base class.
Still, the implementation ends up just as slow. For example:
#include <variant>
struct Base {
int m_base;
int getBaseMember() { return m_base; }
};
struct Foo : public Base {
int m_foo;
};
struct Bar : public Base {
int m_bar;
};
using Foobar = std::variant<Foo,Bar>;
int getBaseMemVariant(Foobar& v) {
return std::visit([](auto&& e){ return e.getBaseMember(); }, v);
}
The generated code on x86 for the most recent version of gcc
and clang
is similar3 (clang shown):
getBaseMemVariant(std::__1::variant<Foo, Bar>&): # @getBaseMemVariant(std::__1::variant<Foo, Bar>&)
sub rsp, 24
mov rax, rdi
mov ecx, dword ptr [rax + 8]
mov edx, 4294967295
cmp rcx, rdx
je .LBB0_2
lea rdx, [rsp + 8]
mov qword ptr [rsp + 16], rdx
lea rdi, [rsp + 16]
mov rsi, rax
call qword ptr [8*rcx + decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix]
add rsp, 24
ret
.LBB0_2:
mov edi, 8
call __cxa_allocate_exception
mov qword ptr [rax], vtable for std::bad_variant_access+16
mov esi, typeinfo for std::bad_variant_access
mov edx, std::exception::~exception()
mov rdi, rax
call __cxa_throw
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
mov eax, dword ptr [rsi]
ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
mov eax, dword ptr [rsi]
ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix:
.quad decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
.quad decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
The call qword ptr [8*rcx + ...
is the actual indirect call to a function pointed to by the vtable (the vtable itself appears at the bottom of the listing). The code before that is first checking the "is empty" state, and then setting up the visit
call (I'm not sure what the weirdness with rdi
is, I guess it's setting up a pointer to the visitor as the first argument or something).
The actual methods which are pointer to by the vtable and executed by the call
are very simple, a single mov
to read the member. Critically, both are indentical:
mov eax, dword ptr [rsi]
ret
So we have a huge mess. To execute that single mov
we have a dozen setup instructions, and more importantly an indirect branch: which if targeting a series of Foobar
variant
objects with different contained alternatives will mispredict very badly. Finally, the indirect call seems like an insurmountable barrier to further optimization: here will look at a simple call without any surrounding context, but in real use this might be optimized into a larger function with significant opportunities for further optimization - but I think the indirect call will block it.
You can play with the code yourself on godbolt.
Versus a Union
The slowness isn't inherent: here's a very simple "discriminated union" struct
that combines the two classes in a union
along with an isFoo
discriminator that keeps track of what class is contained:
struct FoobarUnion {
bool isFoo;
union {
Foo foo;
Bar bar;
};
Base *asBase() {return isFoo ? (Base *)&foo : &bar; };
};
int getBaseMemUnion(FoobarUnion& v) {
return v.asBase()->getBaseMember();
}
The corresponding getBaseMemUnion
function compiles down to a single mov
instruction on both gcc and clang:
getBaseMemUnion(FoobarUnion&): # @getBaseMemUnion(FoobarUnion&)
mov eax, dword ptr [rdi + 4]
ret
Granted, the discriminated union doesn't have to check the "is valueless" error condition, but that's not the main reason for variant
slowness and in any case such a condition is impossible with Foo
and Bar
since none of their constructors throw4. Even if you wanted to support such a state, the resulting function with the union
is still very efficient - only a small check is added, but the behavior of calling the base class is the same.
Is there anything I can do to this use of variant
efficient in this case of calling a common base class function, or is the promise of a zero-cost abstraction just not panning out here?
I'm open to a different call pattern, compiler options, whatever.
1 In particular, checking whether the variant is valueless_by_exception
due to an earlier assignment that failed.
2 The pointer to the base class doesn't always have the same relationship to the most derived pointer for all alternatives, e.g., when multiple inheritance is involved.
3 Well gcc
is a bit worse since it seems to redundantly perform the "is valueless" check both upfront prior to the call to visit
and also in each auto-generated method pointed to by the vtable
. clang only does it upfront. Keep in mind when I say "gcc" I really mean "gcc with libstdc++" while "clang" really means "clang with libc++". Some differences, like the redundant index()
check in the generated visitor functions are likely due to library differences rather than compiler optimization differences.
4 If the valueless
state is problematic, one could also consider something like strict_variant
which never has an empty state but still uses local storage if the move constructor cannot throw.
variant
alternative. In particular at least one case is worse than shown above, since the indirect call in thevariant
inhibits vectorization which brings the slowdown close to 100x (this is an example of the "optimization barrier" mentioned above). – Ossetiastd::variant<Foo,Bar>
to ever be in an uninitialized valueless state. But that does seem to be the case. Only exceptions can "empty" a variant. (I don't remember the function being calledvalueless_by_exception
last time we were talking aboutstd::variant
for this sort of problem, but it's not new according to the rev history for it on cppreference, so I must have just missed that last time.) – Takeoverstd::variant
. Here's a brief overview with some pointers to further reading. It does seem like implementations could specialize this at compile-time based onis_nothrow_move_constructible
- if that were true for all types in thevariant
the valueless state should be impossible and the checks could be skipped (at compile-time). It's mostly a side-issue here I think: the real cost is the indirect call. @PeterCordes – Ossetiavariant
. – Takeoverindex()
value. – Ossetiagcc
, at least in part because the two functions at not identical forgcc
. I showed theclang
output above, but gcc actually generates a bit of extra code at the start of each generated visitor function which checks that the variant has the right alternative for this generated code and then throws otherwise. Here's a gist. Note thecmp BYTE PTR [rsi+0x8],0x1
as the first line, the other version compares with0x0
. – Ossetiavariant
was never empty (you are just screwed if it throws) the poor performance would persist. The poor performance is mostly a function of the dispatch mechanism, not of the check for valueless state (which is anyways "out of line" and hence probably largely hidden by OoO in many cases). – Ossetiastd::variant
. But I wonder if the compiler would have an easier time "devirtualizing" if the library source didn't have that check. As you say, probably not, because I guess it doesn't see through how the "vtable" was initialized by some other function, and that nothing else could have modified it. (If that's even the case the way it's implemented. I assume it'sprivate:
but I didn't check.) Or maybe it's not sure the vtable always was initialized at all with function pointers? – Takeover[](const Base& e){ return e.getBaseMember(); }
lambda instead of theauto&&
version should make life a bit easier for compilers. However, I think this is an optimization, that has to be performed on the library level – Gleichauto&&
lambda shown, a functor object that had explicitFoo&
andBar&
overrides, and a lambda explicitly taking a aBase&
as you suggest. All generate the same code. The real problem is that even if the same labda is used by each visitor, internally the vtable generation is creating unique functions, one forFoo
and one forBar
each of which calls the provided visitor: if the visitor internally takes aBase
or aT
or whatever doesn't change much: there are already two functions. – Ossetiavtable
methods that gcc does (since this makes folding impossible). To understand a bit more about what's possible it's worth looking at what happens if you have avariant
object of whose alternative is statically known. See here for example, where aFoobar(Bar{})
is created and the base method called.clang
is able to completely optimize this to a simplemov eax, 2
, so all the dispatch and any checking is completely eliminated. – Ossetiavtable
contains two identical functions, or even identical pointers to the same function, how can the compiler inline it? I suppose in the latter case it's possible if the compiler realizes all table entires point to the same value (it would also have to understand that the range ofindex()
is limited to the two entries).clang
apparently can can do it in a less general case where it determines a specific table entry is used. – Ossetiagcc
is able to optimize away the up-front "is valueless" check and also the dispatch, but just ends up with a staticcall 0x400xxx
which calls directly the generated visitor method for theBar
visitor. So gcc definitely knows the function to call and that it hasn't changed (it'sconstexpr
after all) but doesn't/can't inline the underlying call, perhaps because the vtable generation and the dispatch are too separate. Note also that clang and gcc are using different c++ libs here: clang barfs onvariant
with libstdc++. – Ossetiastd::variant
is often used to implement a kind of compile-time unobtrusive polymorphism. In many cases I do pass around a base class reference, but it is entirely unsuitable for storage, or places you need value semantics, and many other places (i.e,. many of the same reasons you'd usestd::variant
in the first place). Critically, the base class alone doesn't offer the ability to to dispatch to different implementations of a common method whose implementation differs (absentvirtual
). – Ossetiavariant
when dealing with value polymorphism between objects of distinct types, typically when I did not write for this purpose. A database field might be a variant ofint
,float
, andstring
, for example. If I have total control over the types, I don't needvariant
; I'll just usevirtual
functions. – Inspanvirtual
the majority of the time in this scenario (it solves this "base class" issue: leave it non-virtual
). In this case it wasn't suitable because virtual-polymorphism both tripled the size of the underlying object and prevents things like straightfoward value-semantic storage in containers (i.e,. you're using pointers everywhere and doing a lot of dynamic allocations, or using per-type containers for storage and a separate container of pointers which is your logical collection, etc). Functionally there is no problem, but this is a performance question :) – Ossetia