#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

// Pack 8-bit ASCII down to 7 bits per character: every 64 input bytes become
// 56 output bytes. vld4 does most of the transpose, uzp1/uzp2 finish it, then
// each row is shifted right (vshr) and merged with shift-left-insert (vsli)
// so the result can be stored 7 bytes per lane.
void ascii_pack_neon(uint8_t *pBin, uint8_t *pAscii, intptr_t len)
{
    assert(len >= 64);
    assert((len & 63) == 0);

    uint8x8x4_t ina, inb, outa;
    uint8x8x3_t outb;
    uint8x8_t row1, row2, row3, row4, row5, row6, row7;

    do {
        len -= 64;
        ina = vld4_u8(pAscii); pAscii += 32;
        inb = vld4_u8(pAscii); pAscii += 32;

        // finish transposing
        outa.val[0] = vuzp1_u8(ina.val[0], inb.val[0]);
        row1 = vuzp1_u8(ina.val[1], inb.val[1]);
        row2 = vuzp1_u8(ina.val[2], inb.val[2]);
        row3 = vuzp1_u8(ina.val[3], inb.val[3]);
        row4 = vuzp2_u8(ina.val[0], inb.val[0]);
        row5 = vuzp2_u8(ina.val[1], inb.val[1]);
        row6 = vuzp2_u8(ina.val[2], inb.val[2]);
        row7 = vuzp2_u8(ina.val[3], inb.val[3]);

        outa.val[1] = vshr_n_u8(row1, 1);
        outa.val[2] = vshr_n_u8(row2, 2);
        outa.val[3] = vshr_n_u8(row3, 3);
        outb.val[0] = vshr_n_u8(row4, 4);
        outb.val[1] = vshr_n_u8(row5, 5);
        outb.val[2] = vshr_n_u8(row6, 6);

        outa.val[0] = vsli_n_u8(outa.val[0], row1, 7);
        outa.val[1] = vsli_n_u8(outa.val[1], row2, 6);
        outa.val[2] = vsli_n_u8(outa.val[2], row3, 5);
        outa.val[3] = vsli_n_u8(outa.val[3], row4, 4);
        outb.val[0] = vsli_n_u8(outb.val[0], row5, 3);
        outb.val[1] = vsli_n_u8(outb.val[1], row6, 2);
        outb.val[2] = vsli_n_u8(outb.val[2], row7, 1);

        vst4_lane_u8(pBin, outa, 0); pBin += 4;
        vst3_lane_u8(pBin, outb, 0); pBin += 3;
        vst4_lane_u8(pBin, outa, 1); pBin += 4;
        vst3_lane_u8(pBin, outb, 1); pBin += 3;
        vst4_lane_u8(pBin, outa, 2); pBin += 4;
        vst3_lane_u8(pBin, outb, 2); pBin += 3;
        vst4_lane_u8(pBin, outa, 3); pBin += 4;
        vst3_lane_u8(pBin, outb, 3); pBin += 3;
        vst4_lane_u8(pBin, outa, 4); pBin += 4;
        vst3_lane_u8(pBin, outb, 4); pBin += 3;
        vst4_lane_u8(pBin, outa, 5); pBin += 4;
        vst3_lane_u8(pBin, outb, 5); pBin += 3;
        vst4_lane_u8(pBin, outa, 6); pBin += 4;
        vst3_lane_u8(pBin, outb, 6); pBin += 3;
        vst4_lane_u8(pBin, outa, 7); pBin += 4;
        vst3_lane_u8(pBin, outb, 7); pBin += 3;
    } while (len);
}
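For reference, and to make the output format explicit (this is my own reading of the code, not part of the original answer): every 8 input characters shrink to 7 output bytes by concatenating the low 7 bits of each character, least-significant character first. A plain scalar version of that layout, handy for checking the NEON routine, might look like this (it assumes the same headers as above and ignores a trailing group of fewer than 8 characters):

void ascii_pack_scalar(uint8_t *pBin, const uint8_t *pAscii, intptr_t len)
{
    while (len >= 8) {
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)             // gather 8 x 7 bits...
            v |= (uint64_t)(pAscii[i] & 0x7f) << (7 * i);
        for (int i = 0; i < 7; i++)             // ...and emit them as 7 bytes
            pBin[i] = (uint8_t)(v >> (8 * i));
        pAscii += 8;
        pBin += 7;
        len -= 8;
    }
}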
Below is the conventional version without transposing, which is much longer than the previous one:
// pack 8 ASCII bytes held in one 64-bit lane into its low 56 bits
static inline uint64x1_t pack8(uint64x1_t in)
{
    const uint64x1_t mask1 = vdup_n_u64(0x007f007f007f007f);
    const uint64x1_t mask2 = vdup_n_u64(0x00003fff00003fff);
    const uint64x1_t mask4 = vdup_n_u64(0x000000000fffffff);

    in = vbsl_u64(mask1, in, vshr_n_u64(in, 1));
    in = vbsl_u64(mask2, in, vshr_n_u64(in, 2));
    in = vbsl_u64(mask4, in, vshr_n_u64(in, 4));
    return in;
}
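As an aside (again my own sketch, not from the answer): the three vbsl steps above are just bit-selects between the value and a shifted copy of itself, so the scalar equivalent is:

static inline uint64_t pack8_scalar(uint64_t in)
{
    // Where a mask bit is set, keep the original bit; elsewhere take the bit
    // from the shifted copy. After three steps the low 56 bits hold the
    // 8 packed characters.
    in = (in & 0x007f007f007f007fULL) | ((in >> 1) & ~0x007f007f007f007fULL);
    in = (in & 0x00003fff00003fffULL) | ((in >> 2) & ~0x00003fff00003fffULL);
    in = (in & 0x000000000fffffffULL) | ((in >> 4) & ~0x000000000fffffffULL);
    return in;
}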
void ascii_pack_neon_conventional(uint8_t *pBin, uint8_t *pAscii, intptr_t len)
{
    // assert(len >= 64);
    // assert((len & 63) == 0);
    uint64x1x4_t ina, inb, outa;
    uint64x1x3_t outb;
    uint64x1_t row1, row2, row3, row4, row5, row6, row7;

    do {
        len -= 64;
        ina = vld1_u64_x4((uint64_t *)pAscii); pAscii += 32;
        inb = vld1_u64_x4((uint64_t *)pAscii); pAscii += 32;

        outa.val[0] = pack8(ina.val[0]);
        row1 = pack8(ina.val[1]);
        row2 = pack8(ina.val[2]);
        row3 = pack8(ina.val[3]);
        row4 = pack8(inb.val[0]);
        row5 = pack8(inb.val[1]);
        row6 = pack8(inb.val[2]);
        row7 = pack8(inb.val[3]);

        outa.val[1] = vshr_n_u64(row1, 8);
        outa.val[2] = vshr_n_u64(row2, 16);
        outa.val[3] = vshr_n_u64(row3, 24);
        outb.val[0] = vshr_n_u64(row4, 32);
        outb.val[1] = vshr_n_u64(row5, 40);
        outb.val[2] = vshr_n_u64(row6, 48);

        outa.val[0] = vsli_n_u64(outa.val[0], row1, 56);
        outa.val[1] = vsli_n_u64(outa.val[1], row2, 48);
        outa.val[2] = vsli_n_u64(outa.val[2], row3, 40);
        outa.val[3] = vsli_n_u64(outa.val[3], row4, 32);
        outb.val[0] = vsli_n_u64(outb.val[0], row5, 24);
        outb.val[1] = vsli_n_u64(outb.val[1], row6, 16);
        outb.val[2] = vsli_n_u64(outb.val[2], row7, 8);

        vst1_u64_x4((uint64_t *)pBin, outa); pBin += 32;
        vst1_u64_x3((uint64_t *)pBin, outb); pBin += 24;
    } while (len);
}
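Both intrinsics versions assume len is a nonzero multiple of 64, so a real caller needs a small wrapper. A hedged sketch (ascii_pack is my own name, and it reuses the hypothetical ascii_pack_scalar from above for the remainder):

void ascii_pack(uint8_t *pBin, uint8_t *pAscii, intptr_t len)
{
    intptr_t bulk = len & ~(intptr_t)63;        // largest multiple of 64
    if (bulk)
        ascii_pack_neon(pBin, pAscii, bulk);
    // 0..63 leftover characters: pack full groups of 8 with the scalar
    // reference; a final partial group is simply left unpacked.
    ascii_pack_scalar(pBin + bulk / 8 * 7, pAscii + bulk, len - bulk);
}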
It seems that GCC is the culprit here: godbolt link (transposing). And GCC keeps being a disaster even in the conventional version. Conclusion: ditch GCC. Use Clang instead, or better yet, write in assembly:
.arch armv8-a
.global ascii_pack_asm_transpose, ascii_pack_asm_conventional
.text
pBin .req x0
pAscii .req x1
len .req w2
.balign 64
.func
ascii_pack_asm_transpose:
1:
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [pAscii], #32
ld4 {v20.8b, v21.8b, v22.8b, v23.8b}, [pAscii], #32
subs len, len, #64
uzp1 v0.8b, v16.8b, v20.8b
uzp1 v24.8b, v17.8b, v21.8b
uzp1 v25.8b, v18.8b, v22.8b
uzp1 v26.8b, v19.8b, v23.8b
uzp2 v27.8b, v16.8b, v20.8b
uzp2 v28.8b, v17.8b, v21.8b
uzp2 v29.8b, v18.8b, v22.8b
uzp2 v30.8b, v19.8b, v23.8b
ushr v1.8b, v24.8b, #1
ushr v2.8b, v25.8b, #2
ushr v3.8b, v26.8b, #3
ushr v4.8b, v27.8b, #4
ushr v5.8b, v28.8b, #5
ushr v6.8b, v29.8b, #6
sli v0.8b, v24.8b, #7
sli v1.8b, v25.8b, #6
sli v2.8b, v26.8b, #5
sli v3.8b, v27.8b, #4
sli v4.8b, v28.8b, #3
sli v5.8b, v29.8b, #2
sli v6.8b, v30.8b, #1
st4 {v0.b, v1.b, v2.b, v3.b}[0], [pBin], #4
st3 {v4.b, v5.b, v6.b}[0], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[1], [pBin], #4
st3 {v4.b, v5.b, v6.b}[1], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[2], [pBin], #4
st3 {v4.b, v5.b, v6.b}[2], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[3], [pBin], #4
st3 {v4.b, v5.b, v6.b}[3], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[4], [pBin], #4
st3 {v4.b, v5.b, v6.b}[4], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[5], [pBin], #4
st3 {v4.b, v5.b, v6.b}[5], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[6], [pBin], #4
st3 {v4.b, v5.b, v6.b}[6], [pBin], #3
st4 {v0.b, v1.b, v2.b, v3.b}[7], [pBin], #4
st3 {v4.b, v5.b, v6.b}[7], [pBin], #3
b.gt 1b
.balign 16
ret
.endfunc
/////////////////////////////////////////////////////////////
.balign 64
.func
ascii_pack_asm_conventional:
adr x3, 2f
sub pAscii, pAscii, #16
sub pBin, pBin, #8
movi v0.4h, #0x007f // mask1
ldp d1, d2, [x3] // mask2, mask4
b 1f
.balign 16
2:
.long 0x00003fff, 0x00003fff
.long 0x0fffffff, 0x00000000
.balign 64
1:
ldp d16, d17, [pAscii, #16]
ldp d18, d19, [pAscii, #32]
ldp d20, d21, [pAscii, #48]
ldp d22, d23, [pAscii, #64]!
subs len, len, #64
ushr d24, d16, #1
ushr d25, d17, #1
ushr d26, d18, #1
ushr d27, d19, #1
ushr d28, d20, #1
ushr d29, d21, #1
ushr d30, d22, #1
ushr d31, d23, #1
bif v16.8b, v24.8b, v0.8b
bif v17.8b, v25.8b, v0.8b
bif v18.8b, v26.8b, v0.8b
bif v19.8b, v27.8b, v0.8b
bif v20.8b, v28.8b, v0.8b
bif v21.8b, v29.8b, v0.8b
bif v22.8b, v30.8b, v0.8b
bif v23.8b, v31.8b, v0.8b
ushr d24, d16, #2
ushr d25, d17, #2
ushr d26, d18, #2
ushr d27, d19, #2
ushr d28, d20, #2
ushr d29, d21, #2
ushr d30, d22, #2
ushr d31, d23, #2
bif v16.8b, v24.8b, v1.8b
bif v17.8b, v25.8b, v1.8b
bif v18.8b, v26.8b, v1.8b
bif v19.8b, v27.8b, v1.8b
bif v20.8b, v28.8b, v1.8b
bif v21.8b, v29.8b, v1.8b
bif v22.8b, v30.8b, v1.8b
bif v23.8b, v31.8b, v1.8b
ushr d24, d16, #4
ushr d25, d17, #4
ushr d26, d18, #4
ushr d27, d19, #4
ushr d28, d20, #4
ushr d29, d21, #4
ushr d30, d22, #4
ushr d31, d23, #4
bif v16.8b, v24.8b, v2.8b
bif v17.8b, v25.8b, v2.8b
bif v18.8b, v26.8b, v2.8b
bif v19.8b, v27.8b, v2.8b
bif v20.8b, v28.8b, v2.8b
bif v21.8b, v29.8b, v2.8b
bif v22.8b, v30.8b, v2.8b
bif v23.8b, v31.8b, v2.8b
ushr d24, d17, #8
ushr d25, d18, #16
ushr d26, d19, #24
ushr d27, d20, #32
ushr d28, d21, #40
ushr d29, d22, #48
sli d16, d17, #56
sli d24, d18, #48
sli d25, d19, #40
sli d26, d20, #32
sli d27, d21, #24
sli d28, d22, #16
sli d29, d23, #8
stp d16, d24, [pBin, #8]
stp d25, d26, [pBin, #24]
stp d27, d28, [pBin, #40]
str d29, [pBin, #56]!
b.gt 1b
.balign 16
ret
.endfunc
.end
Now you can see clearly that the transposing version is vastly superior, provided the chip doesn't mind unaligned stores much (most ARMv8-A ones don't).

You may ask why I don't use quad registers instead of double ones: on ARMv8, most instructions on quad registers have half the throughput of their double-register counterparts, so there is hardly any gain, if any, while being less flexible. This might be different on more advanced cores.
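For completeness (my own sketch, not from the answer): with the standard AAPCS64 calling convention the assembly routines line up with the same argument order as the intrinsics versions, so they can be declared and called from C roughly like this. Note that the assembly only reads the low 32 bits of len (it lives in w2).

#include <stdint.h>

// Hypothetical C-side declarations for the assembly routines above;
// len must be a nonzero multiple of 64.
extern void ascii_pack_asm_transpose(uint8_t *pBin, uint8_t *pAscii, intptr_t len);
extern void ascii_pack_asm_conventional(uint8_t *pBin, uint8_t *pAscii, intptr_t len);

void pack_demo(void)
{
    static uint8_t ascii[4096];             // 4096 input characters...
    static uint8_t packed[4096 / 8 * 7];    // ...pack into 3584 bytes
    ascii_pack_asm_transpose(packed, ascii, sizeof ascii);
}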
Comments:

The memcpy store is probably best done with an 8-byte memcpy so it can just be one unaligned str instruction. The next 8-byte store will overlap with it by 1, and that's fine. Adjust the loop condition accordingly to not write past the end, although it looks like you already check a conservative condition. Oh, I see, you don't even pack the tail since it would save less than 1 byte. Makes sense. – Stinkpot

_mm256_maddubs_epi16 and _mm256_shuffle_epi8? I'd expect the shift/OR to be not bad, although perhaps AArch64 SIMD has some tricks available that can do even better. – Stinkpot

sri (shift right and insert) is indeed useful. Your problem might be pretty similar since you don't need to move bits across wider element boundaries until the end. – Stinkpot

v = vsriq_n_u16(v, v, 1); v = vsriq_n_u32(v, v, 2); v = vsriq_n_u64(v, v, 4); might do the trick for the first 3 steps, if I'm understanding the docs right about which bits it keeps from the non-shifted operand. I'm not sure I am. – Stinkpot

v>>8 then sli by #7, 2 shifts per step if doing it that way. So that's not ideal. – Stinkpot

USHL (developer.arm.com/documentation/ddi0596/2020-12/…): per-element variable-count shifts can shift left or right depending on the sign of the shift count. So the first step can left-shift the even elements by 1, joining into 14-bit groups in the middle of u16 elements. The next step can shift left+right into the middle of u32, etc. Then one final right shift of a full u64, and a byte shuffle. Also interesting was uhadd, but that would take an AND, uhadd(v.4s, v.4s & 0x00ff00ff..), to right-shift the high halves by not self-adding. – Stinkpot

… (vshr), and left shift insert (vsli) next rows each. You will have a transposed 8x7 matrix that you can store lane by lane (vst4_lane / vst3_lane). – Deciduous
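Following up on the first comment (my own sketch, not from the thread): each 7-byte group can be written as a single 8-byte store whose top byte is scratch and gets overwritten by the next group, with only the last group falling back to a 7-byte copy so nothing lands past the end of the buffer. This assumes a little-endian target, which covers practically all AArch64 systems.

#include <stdint.h>
#include <string.h>

void ascii_pack_overlap(uint8_t *pBin, const uint8_t *pAscii, intptr_t len)
{
    while (len >= 8) {
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)
            v |= (uint64_t)(pAscii[i] & 0x7f) << (7 * i);
        if (len > 8)
            memcpy(pBin, &v, 8);   // one unaligned 8-byte str; byte 7 is
                                   // scratch and is overwritten next round
        else
            memcpy(pBin, &v, 7);   // last group: stay inside the buffer
        pAscii += 8;
        pBin += 7;
        len -= 8;
    }
}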