Given a multi-dimensional array with shape [A][B][C][D]
that is stored as a 1-dim array with length [A*B*C*D],
I want to use template meta-programming to simplify the index computation. The index (a,b,c,d)
should be at position
a*B*C*D + b*C*D + c*D + d
I currently use
#include <iostream>
#include <cstdlib>
#include <array>
/// Functor that multiplies the trailing entries of an extent array.
///
/// prod_func<start, AXES>()(arr) yields
///     arr[start] * arr[start + 1] * ... * arr[AXES - 1].
/// Every instantiation contributes one factor and hands the remainder to
/// prod_func<start + 1, AXES>; the chain stops at the specialization below.
template<size_t start, size_t AXES>
struct prod_func
{
    constexpr size_t operator()(const std::array<const size_t, AXES> arr) const
    {
        return prod_func<start + 1, AXES>{}(arr) * arr[start];
    }
};

/// Terminating case (start == AXES): no entries are left, so the result is
/// the multiplicative identity.
template<size_t AXES>
struct prod_func<AXES, AXES>
{
    // The argument is accepted only so the call syntax matches the primary
    // template; its value is never read.
    constexpr size_t operator()(const std::array<const size_t, AXES>) const
    {
        return 1;
    }
};
/// Converts a multi-dimensional coordinate into an offset into a flat buffer.
///
/// The object stores the extent of each of the AXES axes. Calling it with
/// coordinates (i0, i1, ..., ik) returns
///     i0 * (product of the extents after i0's axis) + ... + ik
/// where the last coordinate has stride 1. For four axes with extents
/// {A, B, C, D} this means idx(a, b, c, d) == a*B*C*D + b*C*D + c*D + d.
/// Passing fewer than AXES coordinates addresses the trailing axes only.
///
/// No bounds checking is done on the coordinates.
template<int AXES>
class index
{
    const std::array<const size_t, AXES> shapes;
public:
    /// Stores the per-axis extents. constexpr so that an index built from
    /// compile-time extents can itself be used in constant expressions.
    constexpr index(std::array<const size_t, AXES> s) : shapes(s) {}

    /// Recursive step: scales the leading coordinate by the product of the
    /// extents that follow its axis and adds the offset of the remaining
    /// coordinates.
    ///
    /// Coordinates are taken as size_t so that values wider than int are not
    /// truncated before the (size_t) arithmetic; int arguments convert to the
    /// same value they previously converted to inside the multiplication.
    template <typename... Dims>
    constexpr size_t operator()(size_t off, Dims... dims) const {
        // With more arguments than this, AXES - sizeof...(Dims) would wrap
        // around as an unsigned value and prod_func would recurse until the
        // compiler's instantiation limit is hit; fail with a clear message.
        static_assert(sizeof...(Dims) <= static_cast<size_t>(AXES),
                      "index::operator(): too many coordinates for this number of axes");
        return off * prod_func<static_cast<size_t>(AXES) - sizeof...(Dims), AXES>()(shapes)
             + operator()(dims...);
    }

    /// Base case: the last coordinate has stride 1.
    constexpr size_t operator()(size_t t) const {
        return t;
    }
};
// Driver that evaluates the same flat offset twice - once through the index
// class and once with the hand-written formula - and prints both results.
// The asm("nop") statements only mark the regions of interest so they can be
// located in the generated assembly.
int main()
{
    // Axis extents.
    size_t A = 2;
    size_t B = 3;
    size_t C = 6;
    size_t D = 7;
    auto idx = index<4>({A, B, C, D});

    // Coordinates are read from stdin so the compiler cannot treat them as
    // constants.
    int a = 1;
    int b = 1;
    int c = 1;
    int d = 1;
    std::cin >> a >> b >> c >> d;

    asm ("nop");
    const size_t via_class = idx(a, b, c, d);
    asm ("nop");
    std::cout << via_class << std::endl;

    asm ("nop");
    const size_t by_hand = (a*B*C*D + b*C*D + c*D + d);
    asm ("nop");
    std::cout << by_hand << std::endl;

    return 0;
}
The cin reads are just there to ensure run-time values. Inspecting the assembly produced by g++ -O2 -S ../main.cpp -std=c++11
gives
imull $105, 8(%rsp), %edx
imull $35, 12(%rsp), %eax
movl $_ZSt4cout, %edi
addl %edx, %eax
movl 16(%rsp), %edx
leal (%rax,%rdx,8), %esi
subl %edx, %esi
addl 20(%rsp), %esi
for the (a*B*C*D + b*C*D + c*D + d)
part. This is what I was expecting from the compiler. But for the index-class it produces some more operations:
movslq 8(%rsp), %rax
movl $_ZSt4cout, %edi
leaq (%rax,%rax,2), %rdx
leaq (%rax,%rdx,4), %rdx
leaq (%rax,%rdx,8), %rcx
movslq 12(%rsp), %rax
leaq (%rax,%rax,4), %rdx
leaq (%rcx,%rdx,8), %rax
subq %rdx, %rax
movslq 20(%rsp), %rdx
addq %rdx, %rax
movslq 16(%rsp), %rdx
leaq (%rax,%rdx,8), %rsi
subq %rdx, %rsi
and does not fold the product into the single constant B*C*D=105.
Is there any way to get similar assembly? I would like to wrap some CUDA code, so it really needs to be identical code (in C++11). To be clear, only the number of axes is known at compile-time.
Or any other ways to write this?
Edit: Although I am now convinced that it has the same efficiency, I would still like to get the same assembly: https://godbolt.org/g/RHwBV6
std::arrays? – Kampong
index<4>({A,B,C,D})(a,b,c,d) or index<4>(A,B,C,D)(a,b,c,d). Not sure how nested std::arrays can help. – Thromboplastic
lea instructions actually a multiply by 105 (it calculates 1+8*(1+4*3)=105). But if A-D are not compile-time constants, you'll get different assembly anyway. You can simulate that by initializing A-D from volatile variables, like this: volatile size_t vA = 3; A = vA; – Pyrology
const size_t I do not get the same assembly. – Thromboplastic
index constructor constexpr? – Upstart