I am trying to call cudaMemsetAsync
from a kernel (so-called "dynamic parallelism"). But no matter what value I use, it always sets the memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
// Number of int elements in the host array and in the device buffer used by this test.
const int size = 5;
// Test kernel: fills the `size` ints at `c` from device code by calling the
// device-runtime cudaMemsetAsync (dynamic parallelism; build with -rdc=true
// and link -lcudadevrt).
//
// Launch layout: intended for a single thread (<<<1, 1>>>); every launched
// thread would issue the same memset.
//
// NOTE: per the CUDA runtime documentation, cudaMemset/cudaMemsetAsync treat
// `value` as a byte value, so only the low 8 bits of 0x7FFFFFFF (0xFF) are
// used and each int is expected to read back as 0xFFFFFFFF (-1), never as
// 0x7FFFFFFF.
__global__ void kernel(int *c)
{
    // Use sizeof(int) rather than a hard-coded 4, matching the host-side calls.
    cudaError_t status = cudaMemsetAsync(c, 0x7FFFFFFF, size * sizeof(int), NULL);

    // Surface a failure of the device-side call instead of silently dropping it.
    if (status != cudaSuccess)
    {
        printf("device cudaMemsetAsync failed: %d\n", (int)status);
    }
}
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the test: copies five 12s to the device, launches `kernel`
// with one thread, copies the buffer back, and prints the status of
// cudaDeviceReset followed by the five resulting values.
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main()
{
    cudaError_t cudaStatus;
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    // Each step runs only if all previous steps succeeded.
    bool ok = cudaOk(cudaSetDevice(0), "cudaSetDevice");
    ok = ok && cudaOk(cudaMalloc((void**)&dev_c, size * sizeof(int)), "cudaMalloc");
    ok = ok && cudaOk(cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice),
                      "cudaMemcpy (host to device)");

    if (ok)
    {
        kernel <<< 1, 1 >>>(dev_c);
        // A launch does not return a status: configuration errors show up in
        // cudaGetLastError(), execution errors at the next synchronization.
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "kernel execution");
    }

    ok = ok && cudaOk(cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost),
                      "cudaMemcpy (device to host)");

    // cudaFree accepts a null pointer, so this is safe even if cudaMalloc never ran.
    ok = cudaOk(cudaFree(dev_c), "cudaFree") && ok;

    cudaStatus = cudaDeviceReset();
    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);

    return (ok && cudaStatus == cudaSuccess) ? 0 : 1;
}
When I run it, I get output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
Creating library a.lib and object a.exp
When I call the memset function, I pass the value 0x7FFFFFFF.
I'm expecting non-zero numbers, but it always shows zero.
Is this a bug, or did I do something wrong? I'm using CUDA 8.0.
[…] to me. I have filed a bug internally at NVIDIA. If I learn anything significant, I will update. Until then, my suggestion would be to use another method to initialize memory in this fashion, such as a code loop. – Smokejumper
[…] memset() function, and call it instead of using the API function. Alternatively, write your own kernel and launch that using dynamic parallelism. I would very much prefer avoiding the kernel launch if I were you. – Twinge