64 bit: https://rust.godbolt.org/z/rsxh8P8Er
32 bit: https://rust.godbolt.org/z/3P3ejsnh1
I have a little experience with Rust and Assembly but I added some tests.
#[cfg(target_feature = "avx2")]
pub mod avx2 {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
fn first_nonzero_tiny(arr: &[u32]) -> Option<usize> {
arr.iter().position(|&x| x != 0)
}
fn find_u32_zeros_8elems(arr: &[u32], offset: isize) -> i32 {
unsafe {
let ymm0 = _mm256_setzero_si256();
let mut ymm1 = _mm256_loadu_si256(arr.as_ptr().offset(offset) as *const __m256i);
ymm1 = _mm256_cmpeq_epi32(ymm1, ymm0);
let ymm2 = _mm256_castsi256_ps(ymm1);
_mm256_movemask_ps(ymm2)
}
}
pub fn first_nonzero(arr: &[u32]) -> Option<usize> {
let size = arr.len();
if size < 8 {
return first_nonzero_tiny(arr);
}
let mut i: usize = 0;
let simd_size = size / 8 * 8;
while i < simd_size {
let mask: i32 = find_u32_zeros_8elems(&arr, i as isize);
//println!("mask = {}", mask);
if mask != 255 {
return Some((mask.trailing_ones() as usize) + i);
}
i += 8;
//println!("i = {}", i);
}
let last_chunk = size - 8;
let mask: i32 = find_u32_zeros_8elems(&arr, last_chunk as isize);
if mask != 255 {
return Some((mask.trailing_ones() as usize) + last_chunk);
}
None
}
}
use avx2::first_nonzero;
pub fn main() {
let v = [0];
let test1 = first_nonzero(&v);
assert_eq!(test1, None);
let v = [2];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(0));
let v = [1, 0, 0, 0, 0, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(0));
let v = [0, 1, 0, 0, 0, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(1));
let v = [0, 0, 1, 0, 0, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(2));
let v = [0, 0, 0, 1, 0, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(3));
let v = [0, 0, 0, 0, 1, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(4));
let v = [0, 0, 0, 0, 0, 1, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(5));
let v = [0, 0, 0, 0, 0, 1, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(5));
let v = [0, 0, 0, 0, 0, 0, 1, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(6));
let v = [0, 0, 0, 0, 0, 0, 0, 1, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(7));
let v = [0, 0, 0, 0, 0, 0, 0, 0, 1];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(8));
let v = [0, 0, 0, 0, 0, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, None);
let v = [0, 0, 0, 0, 0, 0, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, None);
let v = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(16));
let v = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(15));
let v = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 3, 4, 5];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(14));
let v = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(17));
let v = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49];
let test1 = first_nonzero(&v);
assert_eq!(test1, Some(18));
}
let left
skips the zeros 4 by 4, by interpreting adjacent 4 words as a single u128. If we cannot skip zeros this way, we fall back to scanning one by one. – Dictatorialend
parameter because the slicearr[left..]
contains that part – Dictatorialmemchr()
. Other than that, in similar cases, use SIMD. – Heteroeciousmemx
crate appears to have a bug at the moment formemnechr
(at least for 0.1.18) – Dictatorial[simd]
tag) – Trilbiu32
inside a chunk. I just couldn't getrustc
to vectorize the middle part, which is going to make the most impact – Dictatorialpcmpeqd
/movmskps
anyway, so you already have the compare-result bitmap in an integer register just waiting for a bit-scan. – Luciau32
elements in an inner loop, but it's probably hard to get rustc to spit out a simple pcmpeqd / pmovmskb, rather than some silly horizontal reduction. – Lucia