Fast memory transpose with SSE, AVX, and OpenMP

Asked 5/6, 2013 at 13:22 Answered 7/9, 2017 at 20:7

I need a fast memory transpose algorithm for my Gaussian convolution function in C/C++. What I do now is

convolute_1D
transpose
convolute_1D
transpose

It turns out that with this method the filter size has to be large (or larger than I expected) or the transpose takes longer than the convolution (e.g. for a 1920x1080 matrix the convolution takes the same time as the transpose for a filter size of 35). The current transpose algorithm I am using uses loop blocking/tiling along with SSE and OpenMP. I have tried a version using AVX but it's no faster. Any suggestions on how I can speed this up?

inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
    __m128 row1 = _mm_load_ps(&A[0*lda]);
    __m128 row2 = _mm_load_ps(&A[1*lda]);
    __m128 row3 = _mm_load_ps(&A[2*lda]);
    __m128 row4 = _mm_load_ps(&A[3*lda]);
     _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
     _mm_store_ps(&B[0*ldb], row1);
     _mm_store_ps(&B[1*ldb], row2);
     _mm_store_ps(&B[2*ldb], row3);
     _mm_store_ps(&B[3*ldb], row4);
}
//block_size = 16 works best
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
    #pragma omp parallel for
    for(int i=0; i<n; i+=block_size) {
        for(int j=0; j<m; j+=block_size) {
            int max_i2 = i+block_size < n ? i + block_size : n;
            int max_j2 = j+block_size < m ? j + block_size : m;
            for(int i2=i; i2<max_i2; i2+=4) {
                for(int j2=j; j2<max_j2; j2+=4) {
                    transpose4x4_SSE(&A[i2*lda +j2], &B[j2*ldb + i2], lda, ldb);
                }
            }
        }
    }
}

Transpose 8x8 float matrix using AVX. It's no faster than four 4x4 transposes.

inline void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
    __t0 = _mm256_unpacklo_ps(row0, row1);
    __t1 = _mm256_unpackhi_ps(row0, row1);
    __t2 = _mm256_unpacklo_ps(row2, row3);
    __t3 = _mm256_unpackhi_ps(row2, row3);
    __t4 = _mm256_unpacklo_ps(row4, row5);
    __t5 = _mm256_unpackhi_ps(row4, row5);
    __t6 = _mm256_unpacklo_ps(row6, row7);
    __t7 = _mm256_unpackhi_ps(row6, row7);
    __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}

inline void transpose8x8_avx(float *A, float *B, const int lda, const int ldb) {
    __m256 row0 = _mm256_load_ps(&A[0*lda]);
    __m256 row1 = _mm256_load_ps(&A[1*lda]);
    __m256 row2 = _mm256_load_ps(&A[2*lda]);
    __m256 row3 = _mm256_load_ps(&A[3*lda]);
    __m256 row4 = _mm256_load_ps(&A[4*lda]);
    __m256 row5 = _mm256_load_ps(&A[5*lda]);
    __m256 row6 = _mm256_load_ps(&A[6*lda]);
    __m256 row7 = _mm256_load_ps(&A[7*lda]);
    transpose8_ps(row0, row1, row2, row3, row4, row5, row6, row7);
    _mm256_store_ps(&B[0*ldb], row0);
    _mm256_store_ps(&B[1*ldb], row1);
    _mm256_store_ps(&B[2*ldb], row2);
    _mm256_store_ps(&B[3*ldb], row3);
    _mm256_store_ps(&B[4*ldb], row4);
    _mm256_store_ps(&B[5*ldb], row5);
    _mm256_store_ps(&B[6*ldb], row6);
    _mm256_store_ps(&B[7*ldb], row7);

}

Qualls answered 5/6, 2013 at 13:22 Comment(0)

I'd guess that your best bet would be to try and combine the convolution and the transpose - i.e. write out the results of the convolve transposed as you go. You're almost certainly memory bandwidth limited on the transpose so reducing the number of instructions used for the transpose isn't really going to help (hence the lack of improvement from using AVX). Reducing the number of passes over your data is going to give you the best performance improvements.

Cardwell answered 5/6, 2013 at 15:41 Comment(7)

Good points. The reason I'm doing the transpose in the first place is because reading data the is not contiguous causes a lot of cache hits. So it's a fight between cache hits and the time to do the transpose. I'm not sure writing out the results of the transpose will help in the convolution though. Probably I have to come up with a different algorithm for smaller filter sizes. – Qualls 5/6, 2013 at 16:32

I'll do some tests on smaller matrix sizes which fit in the L2 or L3 cache and get back to you. You might be right about the reason AVX does not appear to better in this example. – Qualls 5/6, 2013 at 16:45

I tried the transpose on 64x64, 192x192, 896x896, and 5008x5008. Those should correspond to my l1, l2, l3 and main memory regions. AVX is only marginally better than SSE for 64x64 (L1 cache). I turned OpenMP off for the tests. – Qualls 5/6, 2013 at 17:36

You're doing a 2D Gaussian convolution by doing 2 1D passes (separable Guassian blur)? Why is the data not contiguous when you read it in on the first pass? Seems like you should be able to read contiguous data for the horizontal blur pass and write out the result transposed ready to read in again contiguous on the second vertical pass and write back out transposed back to the original layout. – Cardwell 5/6, 2013 at 22:44

It also might be worth looking at how this is implemented fast on a GPU. You can do the complete 2D blur in blocks in one go - read your entire 2D kernel input region in roughly L1 cache sized blocks then do your two separable 1D passes on the block and write out to a separate output image. – Cardwell 5/6, 2013 at 22:49

I can look into writing the transpose right after the horizontal pass rather than as a separate function. It's sorta the same number of instructions. I agree looking at how it's done the GPU is a good idea. Actually, even better might be to look at the Intel OpenCL example which should be optimized for the CPU. – Qualls 6/6, 2013 at 5:50

mattnewport, How would you unpack the data form the SSE variable into the "Temp" image in Transpose fashion efficiently and fast? Thank You. – Lalo 14/10, 2016 at 15:27

FWIW, on a 3 years old Core i7 M laptop CPU, this naive 4x4 transpose was barely slower than your SSE version, while almost 40% faster on a newer Intel Xeon E5-2630 v2 @ 2.60GHz desktop CPU.

inline void transpose4x4_naive(float *A, float *B, const int lda, const int ldb) {
    const float r0[] = { A[0], A[1], A[2], A[3] }; // memcpy instead?
    A += lda;
    const float r1[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r2[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r3[] = { A[0], A[1], A[2], A[3] };

    B[0] = r0[0];
    B[1] = r1[0];
    B[2] = r2[0];
    B[3] = r3[0];
    B += ldb;
    B[0] = r0[1];
    B[1] = r1[1];
    B[2] = r2[1];
    B[3] = r3[1];
    B += ldb;
    B[0] = r0[2];
    B[1] = r1[2];
    B[2] = r2[2];
    B[3] = r3[2];
    B += ldb;
    B[0] = r0[3];
    B[1] = r1[3];
    B[2] = r2[3];
    B[3] = r3[3];
}

Strangely enough, the older laptop CPU is faster than the dual E5-2630 v2 desktop with twice the core, but that's a different story :)

Otherwise, you might also be interested in ~~http://research.colfaxinternational.com/file.axd?file=2013%2F8%2FColfax_Transposition-7110P.pdf~~ http://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/ (requires login now...)

Sheatfish answered 8/9, 2014 at 11:49 Comment(3)

Well, that link's dead. – Santamaria 12/2, 2016 at 18:40

I think I've found the new link, and updated it above, but one needs to login now. – Sheatfish 12/2, 2016 at 19:4

Haswell only has one shuffle unit, but Nehalem through IvB have two, for a shuffle throughput of one per 0.5c. (agner.org/optimize). E5-xxxx v2 is IvyBridge, though, so maybe that's not it. Unless you were running this with multiple threads, IDK why you even mention the core count when comparing the laptop to the Xeon. – Buss 13/2, 2016 at 19:39

Consider this 4x4 transpose.

struct MATRIX {
    union {
        float  f[4][4];
        __m128 m[4];
        __m256 n[2];
    };
};
MATRIX myTranspose(MATRIX in) {

    // This takes 15 assembler instructions (compile not inline), 
    // and is faster than XMTranspose
    // Comes in like this  1  2  3  4  5  6  7  8
    //                     9 10 11 12 13 14 15 16
    //
    // Want the result     1  5  9 13  2  6 10 14
    //                     3  7 11 15  4  8 12 16

    __m256 t0, t1, t2, t3, t4, t5, n0, n1;
    MATRIX result;

    n0 = in.n[0];                                               // n0 =  1,  2,  3,  4,  5,  6,  7,  8
    n1 = in.n[1];                                               // n1 =  9, 10, 11, 12, 13, 14, 15, 16
    t0 = _mm256_unpacklo_ps(n0, n1);                            // t0 =  1,  9,  2, 10,  5, 13,  6, 14
    t1 = _mm256_unpackhi_ps(n0, n1);                            // t1 =  3, 11,  4, 12,  7, 15,  8, 16

    t2 = _mm256_permute2f128_ps(t0, t1, 0x20);                  // t2 =  1,  9,  2, 10,  3, 11,  4, 12 
    t3 = _mm256_permute2f128_ps(t0, t1, 0x31);                  // t3 =  5, 13,  6, 14,  7, 15,  8, 16

    t4 = _mm256_unpacklo_ps(t2, t3);                            // t2 =  1,  5,  9, 13,  3,  7, 11, 15
    t5 = _mm256_unpackhi_ps(t2, t3);                            // t3 =  2,  6, 10, 14,  4,  8, 12, 16

    result.n[0] = _mm256_permute2f128_ps(t4, t5, 0x20);         // t6 =  1,  5,  9, 13,  2,  6, 10, 14
    result.n[1] = _mm256_permute2f128_ps(t4, t5, 0x31);         // t7 =  3,  7, 11, 15,  4,  8, 12, 16
    return result;
}

Lavalava answered 7/9, 2017 at 20:7 Comment(1)

__m128 xmm[4]; __m256 ymm[2] would be more obvious names. But that union is a pretty ugly hack that can easily compile to inefficient code if you misuse it. – Buss 10/4, 2018 at 8:30

Recommended topics

Hot tags