Is there any hope to call a common base class method on a std::variant efficiently?
Asked Answered
O

2

18

The way std::variant dispatches to different visitor methods when std::visit is called is pretty reasonable when the variant alternatives are completely different types. Essentially a visitor-specific vtable is built at compile-time and after some error checking1 the appropriate visitor function is looked up by indexing the table based on the current index(), which resolves to something like an indirect jump on most platforms.

If the alternatives share a common base class, however, calling a (non-virtual) member function or accessing state on the base class with a visitor is conceptually much simpler: you are always calling the same method and usually using the same pointer2 to the base class.

Still, the implementation ends up just as slow. For example:

#include <variant>

// Common base class shared by both variant alternatives. getBaseMember() is
// deliberately non-virtual: the point of the question is dispatching to it
// without any vtable pointer being embedded in the objects.
struct Base {
  int m_base;
  int getBaseMember() { return m_base; }
};

// First alternative: single inheritance, so a Foo* converts to Base* with no
// pointer adjustment (see footnote 2 about multiple inheritance).
struct Foo : public Base {
  int m_foo;
};

// Second alternative: same layout prefix as Foo (Base at offset 0), which is
// why the two generated visitor bodies shown below compile to identical code.
struct Bar : public Base {
  int m_bar;
};

using Foobar = std::variant<Foo,Bar>;

// Reads the Base member through std::visit. Although both alternatives invoke
// the exact same non-virtual method at the same offset, the library still
// dispatches through a per-visitor function-pointer table (see the assembly
// listing below), producing an indirect call instead of a single load.
int getBaseMemVariant(Foobar& v) {
  return std::visit([](auto&& e){ return e.getBaseMember(); }, v);
}

The generated code on x86 for the most recent version of gcc and clang is similar3 (clang shown):

getBaseMemVariant(std::__1::variant<Foo, Bar>&): # @getBaseMemVariant(std::__1::variant<Foo, Bar>&)
        sub     rsp, 24
        mov     rax, rdi
        mov     ecx, dword ptr [rax + 8]
        mov     edx, 4294967295
        cmp     rcx, rdx
        je      .LBB0_2
        lea     rdx, [rsp + 8]
        mov     qword ptr [rsp + 16], rdx
        lea     rdi, [rsp + 16]
        mov     rsi, rax
        call    qword ptr [8*rcx + decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix]
        add     rsp, 24
        ret
.LBB0_2:
        mov     edi, 8
        call    __cxa_allocate_exception
        mov     qword ptr [rax], vtable for std::bad_variant_access+16
        mov     esi, typeinfo for std::bad_variant_access
        mov     edx, std::exception::~exception()
        mov     rdi, rax
        call    __cxa_throw
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
        mov     eax, dword ptr [rsi]
        ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
        mov     eax, dword ptr [rsi]
        ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix:
        .quad   decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
        .quad   decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)

The call qword ptr [8*rcx + ... is the actual indirect call to a function pointed to by the vtable (the vtable itself appears at the bottom of the listing). The code before that is first checking the "is empty" state, and then setting up the visit call (I'm not sure what the weirdness with rdi is, I guess it's setting up a pointer to the visitor as the first argument or something).

The actual methods which are pointed to by the vtable and executed by the call are very simple, a single mov to read the member. Critically, both are identical:

    mov     eax, dword ptr [rsi]
    ret

So we have a huge mess. To execute that single mov we have a dozen setup instructions, and more importantly an indirect branch: which, if targeting a series of Foobar variant objects with different contained alternatives, will mispredict very badly. Finally, the indirect call seems like an insurmountable barrier to further optimization: here we look at a simple call without any surrounding context, but in real use this might be optimized into a larger function with significant opportunities for further optimization - but I think the indirect call will block it.

You can play with the code yourself on godbolt.

Versus a Union

The slowness isn't inherent: here's a very simple "discriminated union" struct that combines the two classes in a union along with an isFoo discriminator that keeps track of what class is contained:

// Hand-rolled discriminated union used as the performance baseline: the same
// two alternatives as Foobar, but with a manual bool discriminator instead of
// std::variant's index, and no valueless state to check.
struct FoobarUnion {
  bool isFoo;
  union {
    Foo foo;
    Bar bar;
  };
  // Both branches yield the same Base* (Base sits at offset 0 in Foo and Bar),
  // so the compiler collapses the ternary entirely — the assembly below is one
  // mov. (The C-style cast is quoted as-is; static_cast would be idiomatic.)
  Base *asBase() {return isFoo ? (Base *)&foo : &bar; };
};

// Equivalent of getBaseMemVariant for the hand-rolled union: compiles down to
// a single mov on both gcc and clang, demonstrating the slowness is not
// inherent to the discriminated-union data layout itself.
int getBaseMemUnion(FoobarUnion& v) {
  return v.asBase()->getBaseMember();
}

The corresponding getBaseMemUnion function compiles down to a single mov instruction on both gcc and clang:

getBaseMemUnion(FoobarUnion&):      # @getBaseMemUnion(FoobarUnion&)
        mov     eax, dword ptr [rdi + 4]
        ret

Granted, the discriminated union doesn't have to check the "is valueless" error condition, but that's not the main reason for variant slowness and in any case such a condition is impossible with Foo and Bar since none of their constructors throw4. Even if you wanted to support such a state, the resulting function with the union is still very efficient - only a small check is added, but the behavior of calling the base class is the same.

Is there anything I can do to make this use of variant efficient in this case of calling a common base class function, or is the promise of a zero-cost abstraction just not panning out here?

I'm open to a different call pattern, compiler options, whatever.


1 In particular, checking whether the variant is valueless_by_exception due to an earlier assignment that failed.

2 The pointer to the base class doesn't always have the same relationship to the most derived pointer for all alternatives, e.g., when multiple inheritance is involved.

3 Well gcc is a bit worse since it seems to redundantly perform the "is valueless" check both upfront prior to the call to visit and also in each auto-generated method pointed to by the vtable. clang only does it upfront. Keep in mind when I say "gcc" I really mean "gcc with libstdc++" while "clang" really means "clang with libc++". Some differences, like the redundant index() check in the generated visitor functions are likely due to library differences rather than compiler optimization differences.

4 If the valueless state is problematic, one could also consider something like strict_variant which never has an empty state but still uses local storage if the move constructor cannot throw.

Ossetia answered 20/11, 2017 at 0:29 Comment(25)
Just to fend off the inevitable "did you profile!!!?" or "CPUs are so fast, does this even matter!?" or "is this the real bottleneck in your application?" comments: yes, I profiled, and yes it is an order of magnitude slower than the non-variant alternative. In particular at least one case is worse than shown above, since the indirect call in the variant inhibits vectorization which brings the slowdown close to 100x (this is an example of the "optimization barrier" mentioned above).Ossetia
I hadn't realized that non-throwing constructors for the contained types meant it was impossible for std::variant<Foo,Bar> to ever be in an uninitialized valueless state. But that does seem to be the case. Only exceptions can "empty" a variant. (I don't remember the function being called valueless_by_exception last time we were talking about std::variant for this sort of problem, but it's not new according to the rev history for it on cppreference, so I must have just missed that last time.)Takeover
Yes, this particular behavior was apparently the biggest sticking point when it came to standardizing std::variant. Here's a brief overview with some pointers to further reading. It does seem like implementations could specialize this at compile-time based on is_nothrow_move_constructible - if that were true for all types in the variant the valueless state should be impossible and the checks could be skipped (at compile-time). It's mostly a side-issue here I think: the real cost is the indirect call. @PeterCordesOssetia
I wonder if aggressive comdat folding would make this optimization easier to happen. Can you try this with MSVC or the Gold linker with LTO?Gromwell
Interesting. That claim of It offers the basic exception guarantee at no relevant performance loss in the blog you linked is probably considering much larger types, and maybe overlooking the polymorphism without indirection use-case. Although I agree it should be possible, and appears to be a missed-optimization somewhere in the compilers, the libraries, or both, that it fails to devirtualize. Anyway, this seems like a good reason for not providing a way to "unset" a variant.Takeover
@Yakk - COMDAT folding crossed my mind too: but it certainly can't generate better code for the jump or solve most of the problems. It could, however, at least fold the two generated visitor functions into one and then end up populating the vtable with the same (folded) value which would solve the branch prediction issue at least (since the target is now always the same). LTO could work magic, of course. I have my doubts though: it seems that if the compiler could optimize this it could have already done so here without LTO (everything needed is visible in the compilation unit).Ossetia
With an array of identical const values, the array lookup itself could be elided, which could then cause knock on optimizations. To optimize without this, you need to understand that distinct functions are identical in implementation, the array lookup is only for execution, and then elide the array lookup. One is a general optimization, the other has to be specifically crafted to handle array lookups of identical implementation functions. Which is why I thought it would have a hope of working.Gromwell
@Yakk - yes, I was thinking along the same lines, but even with LTO I think the COMDAT folding which results in array of identical values happens during the "link" phase which logically occurs after the compile phase. I don't think the linker triggers another compilation after folding so that the compiler can take advantage of the new information (but I could be wrong). I'm still working on my example. I did find that when the compiler knows the type of the contained alternative it can optimize the indirect call - but that's an expected result of CSE on the index() value.Ossetia
@Yakk - I played around with ICF folding and LTO, but there is no folding at least for gcc, at least in part because the two functions are not identical for gcc. I showed the clang output above, but gcc actually generates a bit of extra code at the start of each generated visitor function which checks that the variant has the right alternative for this generated code and then throws otherwise. Here's a gist. Note the cmp BYTE PTR [rsi+0x8],0x1 as the first line, the other version compares with 0x0.Ossetia
So it couldn't be folded away in any case. I don't know why the check is there: this is an internal function and the dispatch is already ensuring the right thing happens so it should be guaranteed that this check always passes. The clang generated functions have no such cruft (but I don't have clang installed).Ossetia
@PeterCordes - I don't think the "unset" or "valueless" behavior is very relevant here. It does result in a small amount of additional boilerplate in the disassembly I showed, but I'm quite sure that even if you removed that functionality so the variant was never empty (you are just screwed if it throws) the poor performance would persist. The poor performance is mostly a function of the dispatch mechanism, not of the check for valueless state (which is anyways "out of line" and hence probably largely hidden by OoO in many cases).Ossetia
@BeeOnRope: yeah, sorry for getting off topic into the design of std::variant. But I wonder if the compiler would have an easier time "devirtualizing" if the library source didn't have that check. As you say, probably not, because I guess it doesn't see through how the "vtable" was initialized by some other function, and that nothing else could have modified it. (If that's even the case the way it's implemented. I assume it's private: but I didn't check.) Or maybe it's not sure the vtable always was initialized at all with function pointers?Takeover
It doesn't help with the implementations I checked, but I believe passing a [](const Base& e){ return e.getBaseMember(); } lambda instead of the auto&& version should make life a bit easier for compilers. However, I think this is an optimization, that has to be performed on the library levelGleich
@Gleich yeah, although I didn't mention it above, I actually tried three variants of visitors: the auto&& lambda shown, a functor object that had explicit Foo& and Bar& overrides, and a lambda explicitly taking a Base& as you suggest. All generate the same code. The real problem is that even if the same lambda is used by each visitor, internally the vtable generation is creating unique functions, one for Foo and one for Bar each of which calls the provided visitor: if the visitor internally takes a Base or a T or whatever doesn't change much: there are already two functions.Ossetia
@BeeOnRope: My hope was that - depending on the inlining order - the compiler would recognize at some point that the same function is called on two branches. However, that would require top-down inlining through the internal vtable (replace the table lookup with a switch statement -> inline all the functions at each lable -> detect that the code in both branches is the same).Gleich
@PeterCordes - right, it could help - particularly for the second redundant check inside generated vtable methods that gcc does (since this makes folding impossible). To understand a bit more about what's possible it's worth looking at what happens if you have a variant object of whose alternative is statically known. See here for example, where a Foobar(Bar{}) is created and the base method called. clang is able to completely optimize this to a simple mov eax, 2, so all the dispatch and any checking is completely eliminated.Ossetia
@Gleich - a core problem seems to be that I don't think that a compiler will ever be able to inline across the indirect jump via a lookup table. So even if your vtable contains two identical functions, or even identical pointers to the same function, how can the compiler inline it? I suppose in the latter case it's possible if the compiler realizes all table entries point to the same value (it would also have to understand that the range of index() is limited to the two entries). clang apparently can do it in a less general case where it determines a specific table entry is used.Ossetia
@PeterCordes - but in the "statically known alternative" case gcc is able to optimize away the up-front "is valueless" check and also the dispatch, but just ends up with a static call 0x400xxx which calls directly the generated visitor method for the Bar visitor. So gcc definitely knows the function to call and that it hasn't changed (it's constexpr after all) but doesn't/can't inline the underlying call, perhaps because the vtable generation and the dispatch are too separate. Note also that clang and gcc are using different c++ libs here: clang barfs on variant with libstdc++.Ossetia
@Ossetia So the first step would be teaching compilers to optimize this, where the only defined behavior is to do nothing. If they cannot optimize that down to nothing, then they have no hope at the variant vtable stuff. Note that compilers can handle that without the array bit and inline over function pointers.Gromwell
@BeeOnRope: "in this important case of calling a common base class function" What makes this an "important case"? If you're using a variant, you don't need common base classes. And if you're not, then why aren't you passing around the base class reference instead of the variant?Inspan
@NicolBolas - it's important to me :), and I imagine common base classes are not uncommon, given that std::variant is often used to implement a kind of compile-time unobtrusive polymorphism. In many cases I do pass around a base class reference, but it is entirely unsuitable for storage, or places you need value semantics, and many other places (i.e., many of the same reasons you'd use std::variant in the first place). Critically, the base class alone doesn't offer the ability to dispatch to different implementations of a common method whose implementation differs (absent virtual).Ossetia
@BeeOnRope: "many of the same reasons you'd use std::variant in the first place" I use variant when dealing with value polymorphism between objects of distinct types, typically when I did not write for this purpose. A database field might be a variant of int, float, and string, for example. If I have total control over the types, I don't need variant; I'll just use virtual functions.Inspan
@NicolBolas - indeed, and I use virtual the majority of the time in this scenario (it solves this "base class" issue: leave it non-virtual). In this case it wasn't suitable because virtual-polymorphism both tripled the size of the underlying object and prevents things like straightforward value-semantic storage in containers (i.e., you're using pointers everywhere and doing a lot of dynamic allocations, or using per-type containers for storage and a separate container of pointers which is your logical collection, etc). Functionally there is no problem, but this is a performance question :)Ossetia
@BeeOnRope: I found this question because I have the same problem: I want a container of objects with a common base class... But these objects are small, so I really want their storage directly in the container, not in a million little pieces on the heap. It seems like there is a gap in library+compiler support to handle this sort of thing efficiently.Luddite
It appears that many years later, GCC 12.1 can now make the optimization. Even Clang 17 won't with stdlib=libc++ but Clang 15 will with the default library. Both versions came out the same year so I suspect someone intentionally decided to fix it for both (but I haven't taken the time to look that far into it).Saury
I
12

For what it's worth, a totally handrolled visit with a switch does pretty well:

// use a code generator to write out all of these
// Hand-rolled visit for a variant of exactly 2 alternatives (the
// integral_constant<size_t, 2> tag selects this overload). Using a switch on
// index() instead of a function-pointer table lets the compiler see that both
// arms call the same visitor and merge them — the table-based std::visit hides
// that behind an indirect call.
template <typename F, typename V>
auto custom_visit(F f, V&& v, std::integral_constant<size_t, 2> )
{
    switch (v.index()) {
    case 0: return f(std::get<0>(std::forward<V>(v)));
    case 1: return f(std::get<1>(std::forward<V>(v)));
#ifdef VALUELESS
    // Optional valueless_by_exception handling; the throw is outlined into a
    // cold, non-inline lambda so it stays off the hot path.
    case std::variant_npos: {
        []() [[gnu::cold, gnu::noinline]] {
            throw std::bad_variant_access();
        }();
    }
#endif
    }
    // index() can only be 0, 1, or (with VALUELESS) npos — tell the optimizer
    // no other path exists so no fallthrough code is generated.
    __builtin_unreachable();

}

// Public entry point: forwards to the size-tagged overload above, using
// variant_size as the dispatch tag (only the 2-alternative case is written
// out here; a code generator would emit the other sizes).
template <typename F, typename V>
auto custom_visit(F f, V&& v) {
    return custom_visit(f, std::forward<V>(v),
        std::variant_size<std::decay_t<V>>{});
}

Which you'd use like:

// Same operation as getBaseMemVariant, but via custom_visit: the visitor just
// extracts a Base*, and the member call happens outside the visit — compiles
// to a single mov when VALUELESS is not defined (see listings below).
int getBaseMemVariant2(Foobar& v) {
  return custom_visit([](Base& b){ return &b; }, v)->getBaseMember();
}

With VALUELESS, this emits:

getBaseMemVariant2(std::variant<Foo, Bar>&):
    movzx   eax, BYTE PTR [rdi+8]
    cmp     al, -1
    je      .L27
    cmp     al, 1
    ja      .L28
    mov     eax, DWORD PTR [rdi]
    ret
.L27:
    sub     rsp, 8
    call    auto custom_visit<getBaseMemVariant2(std::variant<Foo, Bar>&)::{lambda(Base&)#1}, std::variant<Foo, Bar>&>(getBaseMemVariant2(std::variant<Foo, Bar>&)::{lambda(Base&)#1}, std::variant<Foo, Bar>&, std::integral_constant<unsigned long, 2ul>)::{lambda()#1}::operator()() const [clone .isra.1]

Which is pretty good. Without VALUELESS, this emits :

getBaseMemVariant2(std::variant<Foo, Bar>&):
    mov     eax, DWORD PTR [rdi]
    ret

as desired.

I don't really know what conclusion, if any, to draw from this. Clearly, there's hope?

Insufficiency answered 20/11, 2017 at 17:42 Comment(0)
N
3

I am not competent enough to perform an assembly-level analysis, but I have written a small wrapper around std::variant explicitly for handling variants where all alternatives inherit from a common base class.

Inside, I am essentially obtaining a pointer to where the variant stores its contents and then use this as a normal pointer to a base class. So once my new variant is created, I expect the actual function call to have about the same overhead as a regular virtual function call on a base class pointer.

pv::polymorphic_value< Base, Foo, Bar > variant;
variant->getBaseMember();

The library is freely available under https://github.com/Krzmbrzl/polymorphic_variant

Normand answered 5/9, 2022 at 16:39 Comment(0)

© 2022 - 2025 — McMap. All rights reserved.