When you use the pitch2D
variant for the texture operation, the underlying allocation is supposed to be a proper pitched allocation. I think typically people would create this with cudaMallocPitch
. However the requirement stated is:
cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to cudaDeviceProp::texturePitchAlignment.
On my GPU, that last property is 32. I don't know about your GPU, but I bet that property is not 4 for your GPU. However you are specifying 4 here:
resDesc.res.pitch2D.pitchInBytes = 4;
Again, I think people would typically use a pitched allocation created with cudaMallocPitch
for this. However it does appear to be possible to me to pass an ordinary linear allocation if the row-to-row dimension (in bytes) is divisible by texturePitchAlignment
(32 in my case).
Another change I made is to use cudaCreateChannelDesc<>()
instead of manually setting the parameters like you did. This creates a different set of desc
parameters and seems to affect the result also. It should not be difficult to study the differences.
When I adjust your code to address those issues, I get results that seem sensible to me:
$ cat t30.cu
#include <stdio.h>
#include <stdint.h>
typedef uint8_t mt; // use an integer type
__global__ void kernel(cudaTextureObject_t tex)
{
int x = threadIdx.x;
int y = threadIdx.y;
mt val = tex2D<mt>(tex, x, y);
printf("%d, ", val);
}
int main(int argc, char **argv)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
cudaTextureObject_t tex;
const int num_rows = 4;
const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
const int ts = num_cols*num_rows;
const int ds = ts*sizeof(mt);
mt dataIn[ts];
for (int i = 0; i < ts; i++) dataIn[i] = i;
mt* dataDev = 0;
cudaMalloc((void**)&dataDev, ds);
cudaMemcpy(dataDev, dataIn, ds, cudaMemcpyHostToDevice);
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = dataDev;
resDesc.res.pitch2D.width = num_cols;
resDesc.res.pitch2D.height = num_rows;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
resDesc.res.pitch2D.pitchInBytes = num_cols*sizeof(mt);
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
dim3 threads(4, 4);
kernel<<<1, threads>>>(tex);
cudaDeviceSynchronize();
printf("\n");
return 0;
}
$ nvcc -o t30 t30.cu
$ cuda-memcheck ./t30
========= CUDA-MEMCHECK
texturePitchAlignment: 32
0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
========= ERROR SUMMARY: 0 errors
$
As asked in the comments, if I were going to do something similar to this but using cudaMallocPitch
and cudaMemcpy2D
, it could look something like this:
$ cat t1421.cu
#include <stdio.h>
#include <stdint.h>
typedef uint8_t mt; // use an integer type
__global__ void kernel(cudaTextureObject_t tex)
{
int x = threadIdx.x;
int y = threadIdx.y;
mt val = tex2D<mt>(tex, x, y);
printf("%d, ", val);
}
int main(int argc, char **argv)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
cudaTextureObject_t tex;
const int num_rows = 4;
const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
const int ts = num_cols*num_rows;
const int ds = ts*sizeof(mt);
mt dataIn[ts];
for (int i = 0; i < ts; i++) dataIn[i] = i;
mt* dataDev = 0;
size_t pitch;
cudaMallocPitch((void**)&dataDev, &pitch, num_cols*sizeof(mt), num_rows);
cudaMemcpy2D(dataDev, pitch, dataIn, num_cols*sizeof(mt), num_cols*sizeof(mt), num_rows, cudaMemcpyHostToDevice);
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = dataDev;
resDesc.res.pitch2D.width = num_cols;
resDesc.res.pitch2D.height = num_rows;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
resDesc.res.pitch2D.pitchInBytes = pitch;
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
dim3 threads(4, 4);
kernel<<<1, threads>>>(tex);
cudaDeviceSynchronize();
printf("\n");
return 0;
}
$ nvcc -o t1421 t1421.cu
$ cuda-memcheck ./t1421
========= CUDA-MEMCHECK
texturePitchAlignment: 32
0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
========= ERROR SUMMARY: 0 errors
$
Although what we have here are texture objects, its easy enough to demonstrate that similar issues occur with texture references. You cannot create an arbitrarily small 2D texture reference just as you cannot create an arbitrarily small 2D texture object. I'm not going to provide a demonstration of that also, as it would largely duplicate the above, and folks shouldn't be using texture references anymore for new development work - texture objects are the better approach.