cuda shared memory - inconsistent results

#include <cstdlib> #include <iostream> #include <cuda.h> #include <cuda_runtime_api.h> #include <helper_cuda.h> #include <host_config.h> #define THREADS_PER_BLOCK 256 #define CUDA_ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); } using namespace std; inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } struct double3c { double x; double y; double z; __host__ __device__ double3c() : x(0), y(0), z(0) {} __host__ __device__ double3c(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {} __host__ __device__ double3c& operator+=(const double3c& rhs) { x += rhs.x; y += rhs.y; z += rhs.z;} __host__ __device__ double3c& operator/=(const double& rhs) { x /= rhs; y /= rhs; z /= rhs;} }; class VectorField { public: double3c *data; int size_x, size_y, size_z; bool is_copy; __host__ VectorField () {} __host__ VectorField (int x, int y, int z) { size_x = x; size_y = y; size_z = z; is_copy = false; CUDA_ERROR_CHECK (cudaMalloc(&data, x * y * z * sizeof(double3c))); } __host__ VectorField (const VectorField& other) { size_x = other.size_x; size_y = other.size_y; size_z = other.size_z; this->data = other.data; is_copy = true; } __host__ ~VectorField() { if (!is_copy) CUDA_ERROR_CHECK (cudaFree(data)); } }; __global__ void KernelCalculateMeanFieldBlock (VectorField m, double3c* result) { __shared__ double3c blockmean[THREADS_PER_BLOCK]; int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0); else blockmean[threadIdx.x] = double3c(0,0,0); __syncthreads(); for(int s = THREADS_PER_BLOCK / 2; s > 0; s /= 2) { if (threadIdx.x < s) blockmean[threadIdx.x] += blockmean[threadIdx.x + s]; __syncthreads(); } if(threadIdx.x == 0) result[blockIdx.x] = blockmean[0]; } double3c CalculateMeanField (VectorField& m) { int blocknum = (m.size_x * m.size_y * m.size_z - 1) / THREADS_PER_BLOCK + 1; double3c *mean = new double3c[blocknum](); double3c *cu_mean; CUDA_ERROR_CHECK (cudaMalloc(&cu_mean, sizeof(double3c) * blocknum)); CUDA_ERROR_CHECK (cudaMemset (cu_mean, 0, sizeof(double3c) * blocknum)); KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK>>> (m, cu_mean); CUDA_ERROR_CHECK (cudaPeekAtLastError()); CUDA_ERROR_CHECK (cudaDeviceSynchronize()); CUDA_ERROR_CHECK (cudaMemcpy(mean, cu_mean, sizeof(double3c) * blocknum, cudaMemcpyDeviceToHost)); CUDA_ERROR_CHECK (cudaFree(cu_mean)); for (int i = 1; i < blocknum; i++) {mean[0] += mean[i];} mean[0] /= m.size_x * m.size_y * m.size_z; double3c aux = mean[0]; delete[] mean; return aux; } int main() { VectorField m(100,100,100); double3c sum = CalculateMeanField (m); cout << sum.x << '\t' << sum.y << '\t' <<sum.z; return 0; }

When I compile the code you have now on linux, I get the following warning:

t614.cu(55): warning: __shared__ memory variable with non-empty constructor or destructor (potential race between threads)

This type of warning should not be ignored. It is associated with this line of code:

__shared__ double3c blockmean[THREADS_PER_BLOCK];

Since the initialization of these objects stored in shared memory (by the constructor) will happen in some arbitrary order, and you have no barrier between that and the subsequent code that will also set these values, unpredictable things (*) can happen.

If I insert a __syncthreads() in the code to isolate the constructor activity from the subsequent code, I get expected results:

__shared__ double3c blockmean[THREADS_PER_BLOCK];    
int index = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();  // add this line
if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
else blockmean[threadIdx.x] = double3c(0,0,0);
__syncthreads();

This still leaves us with the warning, however. A modification to fix this and make the warning go away would be to allocate the necessary __shared__ size dynamically. Change your shared memory declaration to this:

extern __shared__ double3c blockmean[];

and modify your kernel call:

KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK, THREADS_PER_BLOCK*sizeof(double3c)>>> (m, cu_mean);

This will eliminate the warning, produce the correct result, and avoid the unnecessary constructor traffic on the shared memory variable. (And the additional __syncthreads() described above is no longer necessary.)

*regarding "unpredictable things", if you look under the hood by inspecting either the generated SASS (cuobjdump -sass ...) or the PTX (**) (nvcc -ptx ...), you will see that each thread initializes the entire __shared__ array of objects to zero (the behavior of the default constructor). As a result of this, some of the threads (i.e. warps) can race ahead and begin populating the shared memory area according to this line:

if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);

Then, when other warps begin executing, those threads will clear out the entire shared memory array again. This racing behavior leads to unpredictable results.

** I don't normally suggest judging code behavior by inspecting the PTX, but in this case it is equally instructive. The final compile stages will not optimize away the constructor behavior.

Recommended topics

Hot tags