CUDA shared memory wrapped in templated class, points to same memory

I am trying to allocate shared memory in a CUDA kernel within a templated class:

template<typename T, int Size>
struct SharedArray {

    __device__ T* operator()(){
        __shared__ T x[Size];
        return x;
    }
};

This works as long as no shared memory of the same type and size is requested twice. But when I request shared memory with the same type and size two times, the second array points to the same memory as the first:

__global__
void test() {

    // Shared array
    SharedArray<int, 5> sharedArray;
    int* x0 = sharedArray();
    int* y0 = sharedArray();
    x0[0] = 1;
    y0[0] = 0;
    printf("%i %i\n\n", x0[0], y0[0]);
    // Prints:
    // 0 0
}

One solution is to add an ID to each use of the shared memory class, like:

template<int ID, typename T, int Size>
struct StaticSharedArrayWithID {

    __device__ static T* shared(){
        __shared__ T x[Size];
        return x;
    }   
};

But then I have to provide some counter, which makes for a very ugly user interface:

__global__
void test() {

    int* x1 = StaticSharedArrayWithID<__COUNTER__, int, 5>::shared();
    int* y1 = StaticSharedArrayWithID<__COUNTER__, int, 5>::shared();
    x1[0] = 1;
    y1[0] = 0;
    printf("%i %i\n\n", x1[0], y1[0]);
    // Prints:
    // 1 0
}

Does anyone have an idea how to get rid of the __COUNTER__ macro in the user interface? It is fine if it is hidden.

Trondheim asked 26/10, 2015 at 12:13 Comment(7)
while not solving your above problem, you might be interested in having a look at bulk::malloc. It is described in this presentation.Justinajustine
you could simply use a wrapper macro which uses __COUNTER__ internally, is this something you are looking for?Justinajustine
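For reference, such a wrapper macro might look roughly like this (just a sketch; SHARED_ARRAY is a made-up name):

// Hypothetical wrapper macro: every expansion evaluates __COUNTER__ anew,
// so each use produces a distinct StaticSharedArrayWithID instantiation.
#define SHARED_ARRAY(T, Size) \
    StaticSharedArrayWithID<__COUNTER__, T, Size>::shared()

__global__
void test() {
    int* x1 = SHARED_ARRAY(int, 5); // first array
    int* y1 = SHARED_ARRAY(int, 5); // second, distinct array
    x1[0] = 1;
    y1[0] = 0;
    printf("%i %i\n\n", x1[0], y1[0]); // prints: 1 0
}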
@Justinajustine I am trying to not introduce a new macro, when it would be possible.Trondheim
May I ask you for your use case of this implementation? From the examples I cannot see a benefit over just writing __shared__ int x[5];.Harvell
The use case is the kernel abstraction library alpaka, which provides shared memory also for kernels that are executed on CPUs or other accelerators. Thus there is some level of abstraction necessary.Trondheim
I can't see the deleted answer, but as far as I recall it was about creating a __shared__ member variable, which is forbidden in CUDA? (see #12707978 )Scoot
I wonder if the code examples in the question are well-defined. I mean declaring a shared variable in a function and returning a pointer to the variable which goes out of scope. Is this documented somewhere? Is it worth to make this a separate question?Harvell

The reason for this is that __shared__ variables are static by default: every invocation of the same function refers to the same variable. The original reason for this behavior is that the compiler cannot deduce when the memory can be reclaimed; making the variable static keeps it alive for as long as the kernel runs.

A side effect is that if the same function is called in two places in the program, you get the same memory back. In fact, that is what you expect anyway when multiple CUDA threads call your function at the same spot, don't you?
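Spelled out as a minimal sketch (get_shared and demo are made-up names):

// The __shared__ array inside a __device__ function behaves like a
// function-local static: one allocation per kernel, shared by all call sites.
__device__ int* get_shared() {
    __shared__ int x[5];
    return x;
}

__global__ void demo() {
    int* a = get_shared();
    int* b = get_shared();
    // a and b point to the same array, which is why the example in the
    // question prints "0 0"
}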

There is no clean way to allocate shared memory dynamically. In my projects I did it through my own shared memory manager (ugly pointer arithmetic ahead, beware!):

typedef unsigned char byte;

/*
  Simple shared memory manager.
    With any luck if invoked with constant parameters this will not take up any register whatsoever
    Must be called uniformly by whole block which is going to use these
    sSize - amount of preallocated memory
*/
template <size_t sSize>
class SharedMemoryManager {
private:
    byte* shArray;
    byte* head;
public:

    __device__ SharedMemoryManager() {
        __shared__ byte arr[sSize];
        shArray=arr;
        head=arr;
    }

    __device__  void reset() {
        head=shArray;
    }

    __device__  byte* getHead() {return head;}
    __device__  void setHead(byte* newHead) {head=newHead;}

    template <typename T>
    __device__ T* alloc(size_t count) {
      size_t addr = (size_t)head;
      size_t alignment = __alignof(T); //assuming alignment is a power of 2
      addr = ((addr-1) | (alignment-1)) + 1; //round up to match the alignment requirement
      T* var = (T*)addr;                      //the allocation starts at the aligned address
      head = (byte*)(addr + sizeof(T)*count); //bump the head past the new allocation
      return var;
    }

    template <typename T>
    __device__ T& alloc() {
      return *alloc<T>(1);
    }

};

You can use getHead/setHead to reclaim shared memory when you know it can be reclaimed, but only in a stack manner.
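A sketch of that stack-style reclamation, reusing the manager above (the kernel name is made up):

__global__
void stackReclaimExample() {
    SharedMemoryManager<1024> shMem;

    int* persistent = shMem.alloc<int>(32);   // stays valid for the whole kernel

    byte* mark = shMem.getHead();             // remember the current top of the "stack"
    float* scratch = shMem.alloc<float>(64);  // temporary working space
    // ... use persistent and scratch ...
    __syncthreads();                          // make sure everyone is done with scratch
    shMem.setHead(mark);                      // pop scratch; persistent stays valid
}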

This approach should be easy to abstract over non-shared memory when CUDA is not your target.

Then you should be able to write:

__global__
void test() {
   SharedMemoryManager<1024> shMem;

   int& xValue = shMem.alloc<int>();
   int& yValue = shMem.alloc<int>();
   int* xArray = shMem.alloc<int>(5);
   int* yArray = shMem.alloc<int>(5);

   xArray[0] = 1;
   yArray[0] = 0;
   printf("%i %i\n\n", xArray[0], yArray[0]);
   __syncthreads();
   shMem.reset(); //memory reclaimed
   ... 
}
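On the note about abstracting this over non-shared memory: a host-side analogue could keep the same alloc() interface while backing it with an ordinary buffer (just a sketch; HostMemoryManager is a made-up name):

#include <cstddef> // for size_t

template <size_t sSize>
class HostMemoryManager {
    unsigned char arr[sSize];
    unsigned char* head = arr;
public:
    void reset() { head = arr; }

    template <typename T>
    T* alloc(size_t count) {
        size_t addr = (size_t)head;
        size_t alignment = alignof(T);         // assuming a power of 2
        addr = ((addr-1) | (alignment-1)) + 1; // round up to the alignment
        T* var = (T*)addr;
        head = (unsigned char*)(addr + sizeof(T)*count);
        return var;
    }

    template <typename T>
    T& alloc() { return *alloc<T>(1); }
};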
Filia answered 27/10, 2015 at 19:9 Comment(2)
I guess one disadvantage here is that the shared memory address could be treated as a generic memory address, because the origin of the pointer is not known to the caller (it cannot be easily propagated by the compiler).Branham
Since CUDA inlines (almost) everything (kernels really do not like holding a stack, because you need one for every thread), the compiler is able to figure out the origin of the pointer. However, nowadays the memory space (global vs. shared) is encoded in the pointer address itself too, so the compiler does not need to know this anymore. This feature was introduced with the Kepler architecture, I believe.Filia
