Negative array indexing in shared memory based 1d stencil CUDA implementation

Asked 29/10, 2014 at 7:42 Answered 8/1, 2019 at 9:4

I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:

__global__ void stencil_1d(int *in, int *out) {

    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];

    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

....

To add a bit of context. We have an array called in which as length say N. We then have another array out which has length N+(2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in, into array out but to place the array in in position 3 from the beginning of array out i.e out = [RADIUS][in][RADIUS], see slide for graphical representation.

The confusion comes in on the following line:

 temp[lindex - RADIUS] = in[gindex - RADIUS];

If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.

Neurologist answered 29/10, 2014 at 7:42 Comment(0)

The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.

To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.

(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).

#include <thrust/device_vector.h>

#define RADIUS      3
#define BLOCKSIZE   32

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {

    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];

    unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;

    unsigned int lindexx = threadIdx.x + RADIUS;

    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
    }

    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }

    // --- Store the result
    out[gindexx] = result;
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N        = 55 + 2 * RADIUS;

    const unsigned int constant = 4;

    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);

    moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    thrust::host_vector<unsigned int> h_out = d_out;

    for (int i=0; i<N; i++)
        printf("Element i = %i; h_out = %i\n", i, h_out[i]);

    return 0;

}

Oversee answered 29/10, 2014 at 11:4 Comment(0)

You are assuming that in array points to the first position of the memory that has been allocated for this array. However, if you see slide 47, the in array has a halo (orange boxes) of three elements before and after of the data (represented as green cubes).

My assumption is (I have not done the workshop) that the input array is first initialized with an halo and then the pointer is moved in the kernel call. Something like:

stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);

So, in the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the array.

Elodiaelodie answered 29/10, 2014 at 8:53 Comment(0)

The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.

To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.

(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).

#include <thrust/device_vector.h>

#define RADIUS      3
#define BLOCKSIZE   32

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {

    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];

    unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;

    unsigned int lindexx = threadIdx.x + RADIUS;

    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
    }

    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }

    // --- Store the result
    out[gindexx] = result;
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N        = 55 + 2 * RADIUS;

    const unsigned int constant = 4;

    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);

    moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    thrust::host_vector<unsigned int> h_out = d_out;

    for (int i=0; i<N; i++)
        printf("Element i = %i; h_out = %i\n", i, h_out[i]);

    return 0;

}

Oversee answered 29/10, 2014 at 11:4 Comment(0)

There are already good answers, but to focus on the actual point that caused the confusion:

In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.

For example, consider a pointer like this:

int* data= ... // Points to some memory

When you then write a statement like

data[3] = 42;

you are just accessing a memory location that is "three entries behind the original data pointer". So you could also have written

int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]

and consequently,

dataWithOffset[-3] = 123; // This will write into data[0]

_{In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)}

Feticide answered 29/10, 2014 at 11:43 Comment(0)

I can compile @JackOLantern's code, but there is an warning: "pointless comparison of unsigned integer with zero":

And when run, it will abort like:

I have modified the code to the following and the warning disappeared and it can get right result:

#include <thrust/device_vector.h>

#define RADIUS      3
#define BLOCKSIZE   32

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {

    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];

    int gindexx = threadIdx.x + blockIdx.x * blockDim.x;

    int lindexx = threadIdx.x + RADIUS;

    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
    }

    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }

    // --- Store the result
    out[gindexx] = result;
}

/********/
/* MAIN */
/********/
int main() {

    const int N        = 55 + 2 * RADIUS;

    const unsigned int constant = 4;

    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);

    moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    thrust::host_vector<unsigned int> h_out = d_out;

    for (int i=0; i<N; i++)
        printf("Element i = %i; h_out = %i\n", i, h_out[i]);

    return 0;

}

The result is like this:

Lodged answered 8/1, 2019 at 9:4 Comment(0)

Hot tags

Godot Unity Godot Help Programming Godot 4.X GUI GDScript 3D 2D Physics CSharp Godot 3.X VR XR Projects C++

Recommended topics

Hot tags