How to set bits of a bit vector efficiently in CUDA?

The task is like How to set bits of a bit vector efficiently in parallel?, but for CUDA.

Consider a bit vector of N bits (N is large, e.g. 4G) and an array of M numbers (M is also large, e.g. 1G), each in the range 0..N-1, indicating which bit of the vector must be set to 1. The bit vector is just an array of integers, specifically uint32_t.
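
For clarity, the required semantics are simply the following (a minimal sequential reference; the function name is just for illustration):

//Sequential reference: for every entry of indices, set the corresponding bit of bitset.
void BitwiseSetReference(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  for (uint32_t i = 0; i < n_indices; i++) {
    const uint32_t index = indices[i];
    assert(index < n_bits);
    bitset[index >> 5] |= 1u << (index & 31); //word = index / 32, bit = index % 32
  }
}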

I've tried a naive implementation with atomicOr() on the global memory:

__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  const uint32_t n_threads = blockDim.x * gridDim.x;
  const uint32_t i_thread = threadIdx.x + blockDim.x * blockIdx.x;
  for(uint32_t i=i_thread; i<n_indices; i +=n_threads) {
    const uint32_t index = indices[i];
    assert(index < n_bits);
    const uint32_t i_word = index >> 5;
    const uint32_t i_bit = index & 31;
    atomicOr(bitset+i_word, 1u<<(i_bit));
  }
}

And it produces interesting results for 4G bits and 1G indices:

  • RTX3090: 0.0383266 sec. for sorted indices vs. 0.332674 sec. for unsorted (sorted is 8.68x faster)
  • RTX2080: 0.0564464 sec. for sorted indices vs. 1.23666 sec. for unsorted (sorted is 21.91x faster)

So it seems the devices coalesce/merge multiple atomicOr() operations within a warp, and perhaps the L1 cache kicks in, so that when indices conflict (the sorted case, where neighboring indices usually fall into the same 32-bit word), the atomic updates are in reality much faster than for non-conflicting, scattered indices (the unsorted case).
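
For reference, explicit warp-level aggregation could look roughly like this (a sketch I have not benchmarked; __match_any_sync() needs sm_70+, which both cards have, and it assumes a 1D block):

//Sketch: lanes of a warp that target the same 32-bit word combine their bits and
//elect one lane to issue a single atomicOr() for the whole group.
__device__ void WarpAggregatedOr(const uint32_t index, uint32_t *bitset)
{
  const uint32_t i_word = index >> 5;
  const uint32_t my_bit = 1u << (index & 31);
  const uint32_t lane = threadIdx.x & 31;                          //assumes a 1D block
  const uint32_t peers = __match_any_sync(__activemask(), i_word); //lanes hitting the same word
  const uint32_t leader = __ffs(peers) - 1;                        //lowest peer lane is the leader
  uint32_t combined = 0;
  for (uint32_t p = peers; p != 0; p &= p - 1) {                   //gather every peer's bit
    combined |= __shfl_sync(peers, my_bit, __ffs(p) - 1);
  }
  if (lane == leader) {
    atomicOr(bitset + i_word, combined);                           //one atomic per distinct word per warp
  }
}

Given the numbers above, the hardware/L2 may already be doing something equivalent, so this is mostly a comparison point.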

Can we further improve the sorted or unsorted case?

UPDATE: answering the comments, any solution is of interest, whether for the sorted or the unsorted case, with or without repetitions. Unsorted with repetitions is the most generic case, so it would be of the most interest.

UPDATE2: following the suggestion to vectorize memory accesses, I implemented this:

__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices, const uint32_t n_bits, uint32_t *bitset) {
  const uint32_t n_threads = blockDim.x * gridDim.x;
  const uint32_t i_thread = threadIdx.x + blockDim.x * blockIdx.x;
  const uint32_t n_vectors = n_indices / 4;
  for(uint32_t i=i_thread; i<n_vectors; i +=n_threads) {
    const uint4 v_index = reinterpret_cast<const uint4*>(indices)[i];
    assert(v_index.x < n_bits);
    assert(v_index.y < n_bits);
    assert(v_index.z < n_bits);
    assert(v_index.w < n_bits);
    uint4 vi_word, vi_bit;
    vi_word.x = v_index.x >> 5;
    vi_word.y = v_index.y >> 5;
    vi_word.z = v_index.z >> 5;
    vi_word.w = v_index.w >> 5;
    vi_bit.x = v_index.x & 31;
    vi_bit.y = v_index.y & 31;
    vi_bit.z = v_index.z & 31;
    vi_bit.w = v_index.w & 31;
    atomicOr(bitset+vi_word.x, 1u<<vi_bit.x);
    atomicOr(bitset+vi_word.y, 1u<<vi_bit.y);
    atomicOr(bitset+vi_word.z, 1u<<vi_bit.z);
    atomicOr(bitset+vi_word.w, 1u<<vi_bit.w);
  }
  if(i_thread < 4) {
    const uint32_t tail_start = n_vectors*4;
    const uint32_t tail_len = n_indices - tail_start;
    if(i_thread < tail_len) {
      const uint32_t index = indices[tail_start+i_thread];
      assert(index < n_bits);
      const uint32_t i_word = index >> 5;
      const uint32_t i_bit = index & 31;
      atomicOr(bitset+i_word, 1u<<i_bit);
    }
  }
}

But at least on RTX2080 it's slower (I don't have the eGPU with RTX3090 with me right now to test):

  • RTX2080: 0.0815998 sec. for sorted vs. 1.39829 sec. for unsorted (sorted is 17.14x faster)
Picofarad answered 27/7, 2022 at 7:12 Comment(11)
If the sorted case is common, you could try a warp-level vote to see if coalescing is worth it, then use warp-level reduction to make a single (or a few) atomic operations only.Therron
@Therron Isn't that done automatically nowadays? See update to this blog post.Benthos
@Benthos yes, the performance measurements suggest it's done automatically in the hardware.Picofarad
Jup, seems like it is. See also here: on-demand.gputechconf.com/gtc/2013/presentations/…Therron
This problem is better described as "sparse to dense set representation". Except... you need to tell us whether the M values can have repetitions or not; and whether they're sorted or not.Average
@einpoklum, I've edited the question to address your comment. Basically, any solution would be of interest, but unsorted with repetitions case is more generic.Picofarad
Your peak throughput is 4.5GiB / .038s = 118 GiB/s which says you are not bandwidth bound (~12% of peak on 3090). Possibly latency bound. Have you run it in nsight-compute to see what the reported bottleneck is? My hunch of the best next step would be to use vectorized loads for the indices, and do 4 atomicOr per thread-iteration rather than one. See developer.nvidia.com/blog/…Trappist
@harrism, thanks, I've added this experiment to the question. The program became slower...Picofarad
Suggest you profile it.Trappist
@SergeRogatch, maybe it makes sense to specify launch bounds for your kernel, especially the one with vectorized access. And try to play around with the max registers per thread (perhaps just having 16 regs per thread is more than enough here). Also, configure the cuda runtime to use 48k L1 cache and 16k registers or so per multiprocessor.Tidwell
If the data is not sorted, then you're doing scattered writes. These will never run at the full speed of the hardware, because when fully scattered each batch of writes only uses 1/32th of a cacheline, hence your code (at worst) will only run at ~3% of max. In your case you run at 12%, so there is some locality in your dataset. If your data were sorted, then you could cache the output in a shared buffer and only write that out when the buffer is full.Nessi

This is not a full answer, but I have too much code for a comment.

Your code is mainly limited by the scattered atomic writes.
There is a limit to the number of cache lines a block (actually an SM) within the GPU can write to per cycle. If you try to write to more cache lines, those writes are stalled until the earlier ones have settled. This is the performance difference you are seeing. atomicOr() does not get bundled per warp; only atomicAdd(x, 1) and atomicInc() get this special treatment, and only when they write to the same destination.

So you can hardly expect to max out the memory bus. You can only do so if you fully utilize a cache line on every write (i.e. coalesced writes only).

However, you may speed this up a bit by prefetching the data using memcpy_async.

You will need to prefetch enough data to overcome the latency.

//prefetch count cannot be greater than 8!
//needs #include <cuda/pipeline> (libcu++) and -std=c++20 for the templated lambda;
//the cp.async hardware path needs sm_80+, on older GPUs memcpy_async falls back to plain copies
template <int my_blockdim, int prefetchcount>
__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  constexpr uint32_t buffersize = my_blockdim * prefetchcount;
  __shared__ uint32_t s_indices[buffersize];
  auto pipeline = cuda::make_pipeline(); //pipeline with thread_scope_thread

  //every block handles its own contiguous section of the data
  const uint32_t per_block = (n_indices + gridDim.x - 1) / gridDim.x;
  const uint32_t start = min(n_indices, per_block * blockIdx.x);
  const uint32_t end = min(n_indices, start + per_block);
  const uint32_t count = end - start;

  //prefetch the element at block-relative offset i into its shared-memory slot
  const auto prefetch = [&](const uint32_t i) {
    pipeline.producer_acquire(); //no-op for thread_scope_thread
    cuda::memcpy_async(&s_indices[i % buffersize], &indices[start + i],
        cuda::aligned_size_t<sizeof(uint32_t)>(sizeof(uint32_t)), pipeline);
    pipeline.producer_commit();
  };

  //prime the pump: each thread queues up to prefetchcount async copies
  for (uint32_t k = 0; k < prefetchcount; k++) {
    const uint32_t i = threadIdx.x + my_blockdim * k;
    if (i < count) prefetch(i);
  }

  const auto dowork = [&]<bool in_tail>(const uint32_t lo, const uint32_t hi) {
    for (uint32_t i = lo + threadIdx.x; i < hi; i += my_blockdim) {
      pipeline.consumer_wait(); //wait for this thread's oldest committed batch
      const uint32_t index = s_indices[i % buffersize]; //fast because mod by a constant
      pipeline.consumer_release();
      if constexpr (!in_tail) {
        prefetch(i + buffersize); //refill the slot we just consumed
      }
      assert(index < n_bits);
      const uint32_t i_word = index >> 5;
      const uint32_t i_bit = index & 31;
      atomicOr(bitset + i_word, 1u << i_bit);
    }
  };

  if (count > buffersize) {
    dowork.template operator()<false>(0, count - buffersize);    //main part: keep prefetching
    dowork.template operator()<true>(count - buffersize, count); //tail: only consume
  } else {
    dowork.template operator()<true>(0, count); //everything was already primed
  }
}

You can unroll this as needed by doing multiple memcpy_async() per pipeline.producer_commit() and adjusting the rest as needed.
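
For example, the prefetch lambda from the kernel above could become something like this (sketch only; `unroll` would be an extra template parameter, and the bounds checks near the end of the block's range are omitted):

  const auto prefetch = [&](const uint32_t i) {
    pipeline.producer_acquire();
    #pragma unroll
    for (int k = 0; k < unroll; k++) { //`unroll` consecutive per-thread elements per stage
      cuda::memcpy_async(&s_indices[(i + k * my_blockdim) % buffersize],
          &indices[start + i + k * my_blockdim],
          cuda::aligned_size_t<sizeof(uint32_t)>(sizeof(uint32_t)), pipeline);
    }
    pipeline.producer_commit(); //one commit covers all unroll copies
  };

The consuming loop in dowork would then step i by my_blockdim * unroll and process unroll elements per consumer_wait().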

Other suggestions
If your data is sorted, or if your output (mostly) falls into a narrow band, then you can stage the atomicOr() results in a shared-memory buffer. I already anticipated this a bit by 'banding' the data in the above code (each block handles its own contiguous section). For sorted data, write into a shared buffer; as soon as the destinations fall outside the buffer, write the buffer out to global memory (only the non-zero values), meanwhile perhaps do some more prefetching, and then reuse the buffer.
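
A rough sketch of that idea for sorted input (untested; each block owning a contiguous chunk, the WINDOW_WORDS size and the direct-global fallback for far-away indices are choices I made for illustration):

//Sketch only: assumes sorted indices, a 1D grid/block, and WINDOW_WORDS >= blockDim.x.
template <int WINDOW_WORDS>  //e.g. 2048 words = 64K bits = 8 KiB of shared memory
__global__ void BitwiseSetSorted(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  __shared__ uint32_t s_window[WINDOW_WORDS];
  const uint32_t per_block = (n_indices + gridDim.x - 1) / gridDim.x;
  const uint32_t begin = min(n_indices, per_block * blockIdx.x);
  const uint32_t end = min(n_indices, begin + per_block);
  if (begin >= end) return;

  uint32_t base = indices[begin] >> 5; //the window covers words [base, base + WINDOW_WORDS)

  //write out the non-zero words of the window and re-center it; called uniformly by the block
  const auto flush = [&](const uint32_t next_base) {
    __syncthreads();
    for (uint32_t w = threadIdx.x; w < WINDOW_WORDS; w += blockDim.x) {
      if (s_window[w] != 0) {
        atomicOr(bitset + base + w, s_window[w]); //only non-zero words, so we never write past the end
      }
      s_window[w] = 0;
    }
    base = next_base;
    __syncthreads();
  };

  for (uint32_t w = threadIdx.x; w < WINDOW_WORDS; w += blockDim.x) s_window[w] = 0;
  __syncthreads();

  for (uint32_t tile = begin; tile < end; tile += blockDim.x) {
    //the flush decision is uniform across the block because it only depends on indices[tile]
    if ((indices[tile] >> 5) >= base + WINDOW_WORDS) flush(indices[tile] >> 5);
    const uint32_t i = tile + threadIdx.x;
    if (i < end) {
      const uint32_t index = indices[i];
      const uint32_t word = index >> 5;
      const uint32_t bit = 1u << (index & 31);
      if (word < base + WINDOW_WORDS) atomicOr(&s_window[word - base], bit); //cheap shared atomic
      else atomicOr(bitset + word, bit); //very sparse stretch: bypass the window
    }
  }
  flush(base); //drain the last window
}

The shared-memory atomics are cheap, and for dense sorted data almost all updates land inside the window, so the global traffic drops to roughly one write per touched word per block.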

Nessi answered 6/5, 2024 at 11:17 Comment(0)
