CUDA: how to create 2D texture object?
Asked Answered
C

1

3

I'm trying to create 2D texture object, 4x4 uint8_t. Here is the code:

__global__ void kernel(cudaTextureObject_t tex)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    uint8_t val = tex2D<uint8_t>(tex, x, y);
    printf("%d, ", val);
    return;
}

int main(int argc, char **argv)
{
    cudaTextureObject_t tex;
    uint8_t dataIn[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    uint8_t* dataDev = 0;
    cudaMalloc((void**)&dataDev, 16);
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    resDesc.res.pitch2D.desc.x = 8;
    resDesc.res.pitch2D.desc.y = 8;
    resDesc.res.pitch2D.desc.f = cudaChannelFormatKindUnsigned;
    resDesc.res.pitch2D.width = 4;
    resDesc.res.pitch2D.height = 4;
    resDesc.res.pitch2D.pitchInBytes = 4;
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    cudaMemcpy(dataDev, &dataIn[0], 16, cudaMemcpyHostToDevice);
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    cudaDeviceSynchronize();
    return 0;
}

I expect that the result will be something like this:

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,

i.e. all values of the texture object (order doesn't matter).

But the actual result is:

0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6,     

What am I doing wrong?

Coelostat answered 8/1, 2019 at 19:47 Comment(0)
F
12

When you use the pitch2D variant for the texture operation, the underlying allocation is supposed to be a proper pitched allocation. I think typically people would create this with cudaMallocPitch. However the requirement stated is:

cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to cudaDeviceProp::texturePitchAlignment.

On my GPU, that last property is 32. I don't know about your GPU, but I bet that property is not 4 for your GPU. However you are specifying 4 here:

resDesc.res.pitch2D.pitchInBytes = 4;

Again, I think people would typically use a pitched allocation created with cudaMallocPitch for this. However it does appear to be possible to me to pass an ordinary linear allocation if the row-to-row dimension (in bytes) is divisible by texturePitchAlignment (32 in my case).

Another change I made is to use cudaCreateChannelDesc<>() instead of manually setting the parameters like you did. This creates a different set of desc parameters and seems to affect the result also. It should not be difficult to study the differences.

When I adjust your code to address those issues, I get results that seem sensible to me:

$ cat t30.cu
#include <stdio.h>
#include <stdint.h>

typedef uint8_t mt;  // use an integer type

__global__ void kernel(cudaTextureObject_t tex)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    mt val = tex2D<mt>(tex, x, y);
    printf("%d, ", val);
}

int main(int argc, char **argv)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
    cudaTextureObject_t tex;
    const int num_rows = 4;
    const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
    const int ts = num_cols*num_rows;
    const int ds = ts*sizeof(mt);
    mt dataIn[ts];
    for (int i = 0; i < ts; i++) dataIn[i] = i;
    mt* dataDev = 0;
    cudaMalloc((void**)&dataDev, ds);
    cudaMemcpy(dataDev, dataIn, ds, cudaMemcpyHostToDevice);
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    resDesc.res.pitch2D.width = num_cols;
    resDesc.res.pitch2D.height = num_rows;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
    resDesc.res.pitch2D.pitchInBytes = num_cols*sizeof(mt);
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    cudaDeviceSynchronize();
    printf("\n");
    return 0;
}
$ nvcc -o t30 t30.cu
$ cuda-memcheck ./t30
========= CUDA-MEMCHECK
texturePitchAlignment: 32
0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
========= ERROR SUMMARY: 0 errors
$

As asked in the comments, if I were going to do something similar to this but using cudaMallocPitch and cudaMemcpy2D, it could look something like this:

$ cat t1421.cu
#include <stdio.h>
#include <stdint.h>

typedef uint8_t mt;  // use an integer type

__global__ void kernel(cudaTextureObject_t tex)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    mt val = tex2D<mt>(tex, x, y);
    printf("%d, ", val);
}

int main(int argc, char **argv)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
    cudaTextureObject_t tex;
    const int num_rows = 4;
    const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
    const int ts = num_cols*num_rows;
    const int ds = ts*sizeof(mt);
    mt dataIn[ts];
    for (int i = 0; i < ts; i++) dataIn[i] = i;
    mt* dataDev = 0;
    size_t pitch;
    cudaMallocPitch((void**)&dataDev, &pitch,  num_cols*sizeof(mt), num_rows);
    cudaMemcpy2D(dataDev, pitch, dataIn, num_cols*sizeof(mt), num_cols*sizeof(mt), num_rows, cudaMemcpyHostToDevice);
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    resDesc.res.pitch2D.width = num_cols;
    resDesc.res.pitch2D.height = num_rows;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
    resDesc.res.pitch2D.pitchInBytes = pitch;
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    cudaDeviceSynchronize();
    printf("\n");
    return 0;
}
$ nvcc -o t1421 t1421.cu
$ cuda-memcheck ./t1421
========= CUDA-MEMCHECK
texturePitchAlignment: 32
0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
========= ERROR SUMMARY: 0 errors
$

Although what we have here are texture objects, its easy enough to demonstrate that similar issues occur with texture references. You cannot create an arbitrarily small 2D texture reference just as you cannot create an arbitrarily small 2D texture object. I'm not going to provide a demonstration of that also, as it would largely duplicate the above, and folks shouldn't be using texture references anymore for new development work - texture objects are the better approach.

Fist answered 8/1, 2019 at 21:43 Comment(11)
Thanks for the detailed answer. I really didn't understand how to use pitch parameter and your answer was very helpfull.Coelostat
Robert, can you please show how to use cudaMallocPitch and cudaMemcpy2D instead of cudaMalloc and cudaMemcpy in this code to get absolutely the same result?Coelostat
I can't do something identical with cudaMallocPitch because it will allocate to a wider allocation pitch (probably 512 bytes instead of 32). I can do something similar, but not identical. I have updated my answer with an example.Fist
@RobertCrovella, I don't understand why there isn't simple explanation about Texture memory in cuda(even in books they did not write about that), could you please suggest a good tutorial on this?Orfurd
chapter 7 of CUDA by Example, Sanders and KandrotFist
The CUDA handbook also covers texturing in part 2.Fist
@RobertCrovella,Thank you very muchOrfurd
@RobertCrovella: Both the books (CUDA Handbook and CUDA by Example) do not provide any information about "Texture Objects". But they provide ample information to get some idea about the Texture Memory. Is there any good/detailed resource to better understand the new approach of using "Texture Objects"?Extraneous
There is this blog.Fist
Shouldn't dataIn be initialized as mt dataIn[ts]; (instead of mt dataIn[ds];)?Firn
Yes, I've made the change, thanks.Fist

© 2022 - 2024 — McMap. All rights reserved.