I have the following C++17 code that I compile with VS 2019 (version 16.8.6) in x64 mode:
struct __declspec(align(16)) Vec2f { float v[2]; };
struct __declspec(align(16)) Vec4f { float v[4]; };
static constexpr std::uint64_t N = 100'000'000ull;
const Vec2f p{};
Vec4f acc{};
// Using virtual method:
for (std::uint64_t i = 0; i < N; ++i)
    acc += foo->eval(p);
// Using function pointer:
for (std::uint64_t i = 0; i < N; ++i)
    acc += eval_fn(p);
In the first loop, foo is a std::shared_ptr<Foo> and eval() is a virtual method:
__declspec(noinline) virtual Vec4f eval(const Vec2f& p) const noexcept
{
    return { p.v[0], p.v[1], p.v[0], p.v[1] };
}
In the second loop, eval_fn is a pointer to the function below:
__declspec(noinline) Vec4f eval_fn_impl(const Vec2f& p) noexcept
{
    return { p.v[0], p.v[1], p.v[0], p.v[1] };
}
Finally, I have two implementations of operator+= for Vec4f:
One implemented using an explicit loop:
Vec4f& operator+=(Vec4f& lhs, const Vec4f& rhs) noexcept
{
    for (std::uint32_t i = 0; i < 4; ++i)
        lhs.v[i] += rhs.v[i];
    return lhs;
}
And one implemented with an SSE intrinsic:
Vec4f& operator+=(Vec4f& lhs, const Vec4f& rhs) noexcept
{
    _mm_store_ps(lhs.v, _mm_add_ps(_mm_load_ps(lhs.v), _mm_load_ps(rhs.v)));
    return lhs;
}
You can find the full (standalone, Windows-only) code of the test below.
Here is the generated code for the two loops, along with the runtime in milliseconds (for 100M iterations) when executed on an AMD Threadripper 3970X CPU (Zen 2 architecture):
With the SSE intrinsic implementation of operator+=(Vec4f&, const Vec4f&):

// Using virtual method: 649 ms
$LL4@main:
    mov   rax, QWORD PTR [rdi]      // fetch vtable base pointer (rdi = foo)
    lea   r8, QWORD PTR p$[rsp]     // r8 = &p
    lea   rdx, QWORD PTR $T3[rsp]   // not sure what $T3 is (some kind of temporary, but why?)
    mov   rcx, rdi                  // rcx = this
    call  QWORD PTR [rax]           // foo->eval(p)
    addps xmm6, XMMWORD PTR [rax]
    sub   rbp, 1
    jne   SHORT $LL4@main

// Using function pointer: 602 ms
$LL7@main:
    lea   rdx, QWORD PTR p$[rsp]    // rdx = &p
    lea   rcx, QWORD PTR $T2[rsp]   // same question as above
    call  rbx                       // eval_fn(p)
    addps xmm6, XMMWORD PTR [rax]
    sub   rsi, 1
    jne   SHORT $LL7@main
With the explicit loop implementation of operator+=(Vec4f&, const Vec4f&):

// Using virtual method: 167 ms [3.5x to 4x FASTER!]
$LL4@main:
    mov   rax, QWORD PTR [rdi]
    lea   r8, QWORD PTR p$[rsp]
    lea   rdx, QWORD PTR $T5[rsp]
    mov   rcx, rdi
    call  QWORD PTR [rax]
    addss xmm9, DWORD PTR [rax]
    addss xmm8, DWORD PTR [rax+4]
    addss xmm7, DWORD PTR [rax+8]
    addss xmm6, DWORD PTR [rax+12]
    sub   rbp, 1
    jne   SHORT $LL4@main

// Using function pointer: 600 ms
$LL7@main:
    lea   rdx, QWORD PTR p$[rsp]
    lea   rcx, QWORD PTR $T4[rsp]
    call  rbx
    addps xmm6, XMMWORD PTR [rax]
    sub   rsi, 1
    jne   SHORT $LL7@main
(On the AMD Zen 2 architecture, as far as I can tell, the addss and addps instructions have a latency of 3 cycles, and up to two such instructions can execute simultaneously.)
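(Those latency/throughput figures could be sanity-checked independently of the virtual-call question by timing a serial addps dependency chain against two independent chains. The sketch below is hypothetical and not part of the measured program; the function name addps_latency_probe is invented, and it reuses the read_timer() / get_timer_freq() helpers and includes from the full listing at the end. If 3-cycle latency and 2-per-cycle throughput hold, both loops should take roughly the same time even though the second performs twice as many additions, because each chain is latency-bound rather than throughput-bound.)

// Hypothetical probe, not part of the benchmark above. Relies on read_timer()
// and get_timer_freq() from the full listing below.
__declspec(noinline) void addps_latency_probe()
{
    static constexpr std::uint64_t iters = 100'000'000ull;
    const __m128 one = _mm_set1_ps(1.0f);
    const auto freq = get_timer_freq();

    // One serial chain: every addps waits for the previous result (latency-bound).
    __m128 chain = _mm_setzero_ps();
    const auto t0 = read_timer();
    for (std::uint64_t i = 0; i < iters; ++i)
        chain = _mm_add_ps(chain, one);
    const auto t1 = read_timer();

    // Two independent chains: twice the additions per iteration, but the two
    // addps of one iteration do not depend on each other.
    __m128 a = _mm_setzero_ps();
    __m128 b = _mm_setzero_ps();
    const auto t2 = read_timer();
    for (std::uint64_t i = 0; i < iters; ++i)
    {
        a = _mm_add_ps(a, one);
        b = _mm_add_ps(b, one);
    }
    const auto t3 = read_timer();

    // Keep the results alive so the compiler cannot discard the loops.
    volatile float sink = _mm_cvtss_f32(_mm_add_ps(chain, _mm_add_ps(a, b)));
    (void)sink;

    std::printf("1 chain: %llu ms, 2 chains: %llu ms\n",
                1000 * (t1 - t0) / freq, 1000 * (t3 - t2) / freq);
}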
The case that puzzles me is the one using a virtual method together with the explicit loop implementation of operator+=:
Why is it 3.5x to 4x faster than the other three variants?
What are the relevant architectural effects at play here? Fewer dependencies between registers across subsequent iterations of the loop? Or some kind of bad luck regarding caching?
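(One way to separate these hypotheses would be to change only how the callee writes its result, along the lines of the store-forwarding idea raised in the comment at the end. The sketch below is hypothetical and not part of the measured code; the name eval_fn_impl_widestore is invented, and it assumes the compiler applies NRVO so the single 16-byte store lands directly in the hidden return slot.)

// Hypothetical variant of eval_fn_impl: produce the result with one 16-byte
// store instead of four scalar float stores. If the slow variants are limited
// by failed store-to-load forwarding (a 16-byte addps load fed by four 4-byte
// stores), this version should behave differently; if the cause is register
// dependencies or caching, it should not.
__declspec(noinline) Vec4f eval_fn_impl_widestore(const Vec2f& p) noexcept
{
    Vec4f r;
    _mm_store_ps(r.v, _mm_setr_ps(p.v[0], p.v[1], p.v[0], p.v[1]));
    return r; // NRVO assumed: the store targets the caller-provided return slot
}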
Full source code:
#include <Windows.h>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <xmmintrin.h>
struct __declspec(align(16)) Vec2f
{
    float v[2];
};

struct __declspec(align(16)) Vec4f
{
    float v[4];
};

Vec4f& operator+=(Vec4f& lhs, const Vec4f& rhs) noexcept
{
#if 0
    _mm_store_ps(lhs.v, _mm_add_ps(_mm_load_ps(lhs.v), _mm_load_ps(rhs.v)));
#else
    for (std::uint32_t i = 0; i < 4; ++i)
        lhs.v[i] += rhs.v[i];
#endif
    return lhs;
}

std::uint64_t get_timer_freq()
{
    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    return static_cast<std::uint64_t>(frequency.QuadPart);
}

std::uint64_t read_timer()
{
    LARGE_INTEGER count;
    QueryPerformanceCounter(&count);
    return static_cast<std::uint64_t>(count.QuadPart);
}

struct Foo
{
    __declspec(noinline) virtual Vec4f eval(const Vec2f& p) const noexcept
    {
        return { p.v[0], p.v[1], p.v[0], p.v[1] };
    }
};

using SampleFn = Vec4f (*)(const Vec2f&);

__declspec(noinline) Vec4f eval_fn_impl(const Vec2f& p) noexcept
{
    return { p.v[0], p.v[1], p.v[0], p.v[1] };
}

__declspec(noinline) SampleFn make_eval_fn()
{
    return &eval_fn_impl;
}

int main()
{
    static constexpr std::uint64_t N = 100'000'000ull;

    const auto timer_freq = get_timer_freq();

    const Vec2f p{};
    Vec4f acc{};

    {
        const auto foo = std::make_shared<Foo>();
        const auto start_time = read_timer();
        for (std::uint64_t i = 0; i < N; ++i)
            acc += foo->eval(p);
        std::printf("foo->eval: %llu ms\n", 1000 * (read_timer() - start_time) / timer_freq);
    }

    {
        const auto eval_fn = make_eval_fn();
        const auto start_time = read_timer();
        for (std::uint64_t i = 0; i < N; ++i)
            acc += eval_fn(p);
        std::printf("eval_fn: %llu ms\n", 1000 * (read_timer() - start_time) / timer_freq);
    }

    return acc.v[0] + acc.v[1] + acc.v[2] + acc.v[3] > 0.0f ? 1 : 0;
}
Comment from Henryk: What about __vectorcall, which passes / returns __m128 in XMM regs? (If that would even help, given that Vec4f isn't treated the same as __m128.) I wonder if store-forwarding is the problem if the callee does scalar stores. (Update: that's what harold is suggesting.)
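(Following up on that comment, a register-return experiment could look like the sketch below. It is hypothetical, not from the question: the _xmm names are invented, and it simply mirrors eval_fn_impl while returning a raw __m128 under __vectorcall, so the result stays in XMM0 and never goes through memory at all.)

// Hypothetical __vectorcall variant suggested by the comment above: the result
// is built and returned in XMM0, so the caller never reloads it from the stack.
__declspec(noinline) __m128 __vectorcall eval_fn_impl_xmm(const Vec2f& p) noexcept
{
    return _mm_setr_ps(p.v[0], p.v[1], p.v[0], p.v[1]);
}

// Caller side, accumulating in a register:
//   __m128 acc_xmm = _mm_setzero_ps();
//   for (std::uint64_t i = 0; i < N; ++i)
//       acc_xmm = _mm_add_ps(acc_xmm, eval_fn_impl_xmm(p));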