As @DietrichEpp suggests, you'll have to make two loads; and as @tera suggests, you can combine these two loads generically for cheap even when the misalignment is not known in advance (i.e. when the initial address of uint8Array
is arbitrary) using the prmt
PTX instruction.
I'll offer a solution based on @tera's which will let you do:
value = read_unaligned(&uint8Array[offset]);
safely and (relatively) efficiently. Also, it will only have one inline PTX assembly instruction, and an "unsafe" variant if you need it:
#include <cstdint>
#include <cuda_runtime_api.h>
__device__ __forceinline__ uint32_t prmt_forward_4_extract(
uint32_t first_word,
uint32_t second_word,
uint32_t control_bits)
{
uint32_t result;
asm("prmt.b32.f4e %0, %1, %2, %3;"
: "=r"(result)
: "r"(first_word), "r"(second_word), "r"(control_bits) );
return result;
}
/*
* This unsafe, faster variant may read past the 32-bit naturally-aligned
* word containing the last relevant byte
*/
__device__ inline uint32_t read_unaligned_unsafe(const uint32_t* __restrict__ ptr)
{
/*
* Clear the bottom 2 bits of the address, making the result aligned
* for the purposes of reading a 32-bit (= 4-byte) value
*/
auto aligned_ptr = (uint32_t*) ((uint64_t) ptr & ~((uint64_t) 0x3));
auto first_value = *aligned_ptr;
auto second_value = *(aligned_ptr + 1);
auto lower_word_of_ptr = (uint32_t)((uint64_t)(ptr));
return prmt_forward_4_extract(first_value, second_value, lower_word_of_ptr);
}
__device__ inline uint32_t read_unaligned(const uint32_t* __restrict__ ptr)
{
auto ptr_is_already_aligned = ((uint64_t)(ptr) & 0x3 == 0);
if (ptr_is_already_aligned) { return *ptr; }
return read_unaligned_unsafe(ptr);
}