Memory allocation on GPU for dynamic array of structs
Asked Answered
H

2

3

I have problem with passing array of struct to gpu kernel. I based on this topic - cudaMemcpy segmentation fault and I wrote sth like this:

#include <stdio.h>
#include <stdlib.h>

struct Test {
    char *array;
};

__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    for(int i = 0; i < n; i++)
        test[i].array = (char*)malloc(size * sizeof(char));

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);
    for(int i=0; i < n; i++) {
        cudaMalloc((void**)&(test[i].array), size * sizeof(char));
        cudaMemcpy(&(dev_test[i].array), &(test[i].array), size * sizeof(char), cudaMemcpyHostToDevice);
    }

    kernel<<<1, 1>>>(dev_test);
    cudaDeviceSynchronize();

    //  memory free
    return 0;
}

There is no error, but displayed values in kernel are incorrect. What I'm doing wrong? Thank in advance for any help.

Hedwig answered 6/5, 2015 at 16:55 Comment(4)
Why is it cudaMalloc((void**)&(test[i].array), size * sizeof(char)); and not cudaMalloc((void**)&(dev_test[i].array), size * sizeof(char)); ? Also, it should be cudaMemcpy(dev_test[i].array, test[i].array, size * sizeof(char), cudaMemcpyHostToDevice);.Annabelle
@francis, it doesn't works (Segmentation fault (core dumped)). On gpu we can't allocate memory in standard way.Hedwig
Additional friendly advice : do not pick code from a question except if you have understood the problem faced by the asker...Sorry if my suggestion didn't work. My suggestion was to allocate memory for dev_test[i].array, not for test[i].array which is already allocated on the CPU by test[i].array = (char*)malloc(size * sizeof(char));.Annabelle
@francis, ok no problem. Yes test[i].array is already allocated but only on CPU, no on GPU. We can't allocate memory for dev_test[i].array, because this memory is only visible from device. At least I understand it so.Hedwig
W
11
  1. This is allocating a new pointer to host memory:

     test[i].array = (char*)malloc(size * sizeof(char));
    
  2. This is copying data to that region in host memory:

     memcpy(test[i].array, temp, size * sizeof(char));
    
  3. This is overwriting the previously allocated pointer to host memory (from step 1 above) with a new pointer to device memory:

     cudaMalloc((void**)&(test[i].array), size * sizeof(char));
    

After step 3, the data you set up in step 2 is entirely lost, and no longer accessible in any fashion. Referring to steps 3 and 4 in the question/answer you linked:

3.Create a separate int pointer on the host, let's call it myhostptr

4.cudaMalloc int storage on the device for myhostptr

You haven't done this. You did not create a separate pointer. You reused (erased, overwrote) an existing pointer, which was pointing to data you cared about on the host. This question/answer, also linked from the answer you linked, gives almost exactly the steps you need to follow, in code.

Here's a modified version of your code, which properly implements the missing steps 3 and 4 (and 5) that you didn't implement correctly according to the question/answer you linked: (refer to comments delineating steps 3,4,5)

$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>

struct Test {
    char *array;
};

__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    for(int i = 0; i < n; i++)
        test[i].array = (char*)malloc(size * sizeof(char));

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);

    // Step 3:
    char *temp_data[n];
    // Step 4:
    for (int i=0; i < n; i++)
      cudaMalloc(&(temp_data[i]), size*sizeof(char));
    // Step 5:
    for (int i=0; i < n; i++)
      cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);
    // now copy the embedded data:
    for (int i=0; i < n; i++)
      cudaMemcpy(temp_data[i], test[i].array, size*sizeof(char), cudaMemcpyHostToDevice);

    kernel<<<1, 1>>>(dev_test);
    cudaDeviceSynchronize();

    //  memory free
    return 0;
}

$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$

Since the above methodology can be challenging for beginners, the usual advice is not to do it, but instead flatten your data structures. Flatten generally means to rearrange the data storage so as to remove the embedded pointers that have to be separately allocated.

A trivial example of flattening this data structure would be to use this instead:

struct Test {
    char array[5];
};

It's recognized of course that this particular approach would not serve every purpose, but it should illustrate the general idea/intent. With that modification, as an example, the code becomes much simpler:

$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>

struct Test {
    char array[5];
};

__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);

    kernel<<<1, 1>>>(dev_test);
    cudaDeviceSynchronize();

    //  memory free
    return 0;
}
$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$
Warrigal answered 6/5, 2015 at 18:11 Comment(2)
thanks a lot. What do you mean by "flatten your data structures"?Hedwig
updated my answer to respond to this question. However if you search on the CUDA tag you will find many references and examples for "flattening".Warrigal
M
1

Thanks @Robert Crovella for the answer. The above answer was very useful to me. I have updated the code from using charcter array to using integer array of structures for reference. In the kernel, values are updated and returned back to host.

  #include <stdio.h>
  #include <stdlib.h>

  struct Test {
  int *array;
  };

  __global__ void kernel(Test *dev_test) {
  //    for(int i=0; i < 4; i++)
  //      for(int j=0; j < 5; j++) {
  //        printf("Kernel[X][i]: %d \n", dev_test[i].array[j]);
  //    }
  int i =threadIdx.x + blockIdx.x*blockDim.x;
  int j = threadIdx.y + blockIdx.y*blockDim.y;

  if(i<4 && j<5)
      dev_test[i].array[j] *= 2; 
  }

  int main(void) {

  int n = 4, size = 5;
  Test *dev_test, *test;

  test = (Test*)malloc(sizeof(Test)*n);
  for(int i = 0; i < n; i++)
      test[i].array = (int*)malloc(size * sizeof(int));

  for(int i=0;i<n;i++)
      for(int j=0; j<size; j++)
          test[i].array[j] = i*n+j;

  cudaMalloc((void**)&dev_test, n * sizeof(Test));
  cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);

  // Step 3:
  int *temp_data[n];
  // Step 4:
  for (int i=0; i < n; i++)
      cudaMalloc(&(temp_data[i]), size*sizeof(int));
  // Step 5:
  for (int i=0; i < n; i++)
      cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(int *), cudaMemcpyHostToDevice);
  // now copy the embedded data:
  for (int i=0; i < n; i++)
      cudaMemcpy(temp_data[i], test[i].array, size*sizeof(int), cudaMemcpyHostToDevice);

  dim3 threads(4,5);
  kernel<<<1, threads>>>(dev_test);
  cudaDeviceSynchronize();

  for(int i=0;i<n;i++)
      cudaMemcpy(test[i].array, temp_data[i], size*sizeof(int), cudaMemcpyDeviceToHost);

  for(int i=0;i <n;i++){
      for(int j=0;j<size;j++)
          printf("%d ",test[i].array[j]);
  printf("\n");
  }
  //  memory free
  return 0;
  }
Manpower answered 28/3, 2023 at 13:24 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.