CUDA Dynamic Parallelism, bad performance
We are having performance issues when using CUDA Dynamic Parallelism (CDP). At the moment, CDP is performing at least 3X slower than a traditional approach. We made the simplest reproducible code that shows the issue: incrementing every element of an array by +1, i.e.,

a[0,0,0,0,0,0,0,.....,0] --> kernel +1 --> a[1,1,1,1,1,1,1,1,1]

The point of this simple example is just to see whether CDP can perform as well as the other approaches, or whether there are serious overheads.

The code is here:

#include <stdio.h>
#include <cuda.h>
#define BLOCKSIZE 512

__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);


// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
    cudaStream_t s1, s2;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid == 0){
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);

        kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
        kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
    }

    a[tid] += 1;
}


__global__ void kernel_simple(int *a, int n, int N, int offset){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int pos = tid + offset;
    if(pos < N){
        a[pos] += 1;
    }
}

int main(int argc, char **argv){
    if(argc != 3){
        fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional)  \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
        exit(EXIT_FAILURE);
    }
    int N = atoi(argv[1])*3;
    int method = atoi(argv[2]);
    // init array as 0
    int *ah, *ad;
    printf("genarray of 3*N = %i.......", N); fflush(stdout);
    ah = (int*)malloc(sizeof(int)*N);
    for(int i=0; i<N; ++i){
        ah[i] = 0;
    }
    printf("done\n"); fflush(stdout);

    // malloc and copy array to gpu
    printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
    cudaMalloc(&ad, sizeof(int)*N);
    cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
    printf("done\n"); fflush(stdout);

    // kernel launch (timed)
    cudaStream_t s1, s2, s3;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
    cudaEvent_t start, stop;
    float rtime = 0.0f;
    cudaEventCreate(&start); 
    cudaEventCreate(&stop);
    printf("Kernel...........................", N); fflush(stdout);
    if(method == 0){
        // CLASSIC KERNEL LAUNCH
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block >>> (ad, N, N, 0);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else if(method == 1){
        // DYNAMIC PARALLELISM
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else{
        // THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
        kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
        kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    printf("done\n"); fflush(stdout);


    printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
    cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
    printf("done\n"); fflush(stdout);

    printf("checking result.................."); fflush(stdout);
    for(int i=0; i<N; ++i){
        if(ah[i] != 1){
            fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("done\n"); fflush(stdout);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&rtime, start, stop);
    printf("rtime: %f ms\n", rtime); fflush(stdout);
    return EXIT_SUCCESS;
}

Can be compiled with

nvcc -arch=sm_35 -rdc=true -lineinfo -lcudadevrt -use_fast_math main.cu -o prog

This example can compute the result with 3 methods:

  1. Simple Kernel: Just a single classic kernel +1 pass on the array.
  2. Dynamic Parallelism: from main(), call a parent kernel which does +1 on the range [0, N/3) and also launches two child kernels. The first child does +1 on the range [N/3, 2*N/3), the second child on the range [2*N/3, N). The children are launched in different streams so they can run concurrently.
  3. Three Streams from Host: This one just launches three non-blocking streams from main(), one for each third of the array.

I get the following profiler timelines for method 0 (Simple Kernel), method 1 (Dynamic Parallelism), and method 2 (Three Streams from Host):

[profiler screenshots: Simple Kernel, Dynamic Parallelism, Three Streams from Host]

The running times are as follows:

➜  simple-cdp git:(master) ✗ ./prog 16777216 0
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.140928 ms
➜  simple-cdp git:(master) ✗ ./prog 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 5.790048 ms
➜  simple-cdp git:(master) ✗ ./prog 16777216 2
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.011936 ms

The main problem, visible from the profiler screenshots, is that with the Dynamic Parallelism method the parent kernel takes an excessive amount of time to finish after the two child kernels have completed, which is what makes it 3X to 4X slower. Even in the worst case, if all three kernels (parent and two children) ran serially, it should take much less: each kernel has N/3 of the work, so the whole parent kernel should take approximately as long as three child kernels, which is much less than what is measured. Is there a way to solve this problem?

EDIT: The serialization of the child kernels, as well as of the kernels in method 2, has been explained by Robert Crovella in the comments (many thanks). The fact that the kernels ran serially does not invalidate the problem described above (at least not for now).

Orontes asked 19/7, 2017 at 21:10 Comment(5)
Regarding serialization: it is due to the size of the kernels. A kernel launch that fully occupies the GPU prevents subsequent kernels from executing until it finishes. Actually witnessing concurrent kernel execution is hard to do in practice. Study the relevant CUDA sample code and you will see that its kernel is carefully crafted to make limited use of GPU resources to enable concurrency. If you want to see kernel concurrency, run the CUDA sample code and learn how to design a code like it (a minimal sketch of such a launch pattern follows after these comments).Avar
I see. For this example the design hardly made sense for concurrency, but I understand what you explained, and therefore I would not consider it a problem, as the GPU is working at or near full occupancy. In my real example the recursion keeps going as a binary tree, so eventually I do produce small kernels that would benefit from concurrency. I would then have to focus on the second problem, which is the cause of the 3X or more slowdown. Might it be related to the same cause, i.e., the parent occupying the GPU? But the work is N/3 for all three kernels, so there is no reason for it to take more time in terms of work.Orontes
I'm not suggesting I understand the report about dynamic parallelism extending the duration of the parent kernel that long. I can't explain that at the moment. The serialization is not surprising to me, but it seems to be (from my perspective) the lesser of the two issues. In other words, I agree. But I haven't investigated that other issue yet. First step is to try to reproduce it and also study your code a bit.Avar
Thanks, it would be very useful if you get the chance to reproduce the issue and post back your findings.Orontes
It seems that the cudaStreamCreateWithFlags(...) call is what adds the extra time. Launching with no streams at all (and no stream creation) makes the CDP version run practically as fast as the others, at least in non-profiled time. The downside is that this eliminates any chance of concurrent child kernels, which hurts an eventual recursive algorithm that at some point produces small kernels. Now, if we launch the child kernels from different thread blocks, would that allow concurrency when utilization is low for one kernel?Orontes
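For reference, here is a minimal sketch of the launch pattern described in the first comment: kernels small enough that a single launch cannot occupy the whole GPU, issued into separate non-blocking streams. This is an illustrative example in the spirit of the CUDA concurrentKernels sample, not the sample itself; the kernel and variable names are made up.

#include <cstdio>
#include <cuda_runtime.h>

// Deliberately tiny, long-running kernel: one small block per launch, so a
// single launch cannot fill the GPU and several launches have a chance to overlap.
__global__ void tiny_kernel(float *out, int iters)
{
    float v = 0.0f;
    for (int i = 0; i < iters; ++i)
        v = sinf(v) + 1.0f;
    out[threadIdx.x] = v;
}

int main()
{
    const int nStreams = 4;
    const int nThreads = 64;
    cudaStream_t streams[nStreams];
    float *buf;
    cudaMalloc(&buf, nStreams * nThreads * sizeof(float));

    for (int i = 0; i < nStreams; ++i)
        cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);

    // One block of 64 threads per kernel: low resource usage per launch is
    // what gives the scheduler a chance to run them concurrently.
    for (int i = 0; i < nStreams; ++i)
        tiny_kernel<<<1, nThreads, 0, streams[i]>>>(buf + i * nThreads, 1 << 20);

    cudaDeviceSynchronize();

    for (int i = 0; i < nStreams; ++i)
        cudaStreamDestroy(streams[i]);
    cudaFree(buf);
    printf("done\n");
    return 0;
}

Whether the launches actually overlap on a given run still depends on the GPU and its current load; the profiler timeline is the way to confirm it.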
Calls into the device runtime are "expensive", just like calls into the host runtime are expensive. In this case, it seems that you are calling into the device runtime to create streams for every thread, even though this code only requires them for thread 0.

By modifying your code so that only thread 0 requests the stream creation, we can produce timing parity between the case where the child kernels are launched into separate streams and the case where they are not:
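The essential change, shown in isolation before the full listing (this is just the relevant part of kernel_parent from the code below; BLOCKSIZE and kernel_simple are as defined there):

__global__ void kernel_parent(int *a, int n, int N){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid == 0){
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        // Streams are created here by thread 0 only, instead of by every
        // thread of the parent grid as in the original code.
        cudaStream_t s1, s2;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
        kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
        kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
    }

    a[tid] += 1;
}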

$ cat t370.cu
#include <stdio.h>
#define BLOCKSIZE 512

__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);


// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid == 0){
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
#ifdef USE_STREAMS
        cudaStream_t s1, s2;
        cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
        kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
        kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
#else
        kernel_simple<<< grid, block >>> (a, n, N, n);
        kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#endif
// these next 2 lines add noticeably to the overall timing
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
    }

    a[tid] += 1;
}


__global__ void kernel_simple(int *a, int n, int N, int offset){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int pos = tid + offset;
    if(pos < N){
        a[pos] += 1;
    }
}

int main(int argc, char **argv){
    if(argc != 3){
        fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional)  \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
        exit(EXIT_FAILURE);
    }
    int N = atoi(argv[1])*3;
    int method = atoi(argv[2]);
    // init array as 0
    int *ah, *ad;
    printf("genarray of 3*N = %i.......", N); fflush(stdout);
    ah = (int*)malloc(sizeof(int)*N);
    for(int i=0; i<N; ++i){
        ah[i] = 0;
    }
    printf("done\n"); fflush(stdout);

    // malloc and copy array to gpu
    printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
    cudaMalloc(&ad, sizeof(int)*N);
    cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
    printf("done\n"); fflush(stdout);

    // kernel launch (timed)
    cudaStream_t s1, s2, s3;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
    cudaEvent_t start, stop;
    float rtime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    printf("Kernel...........................", N); fflush(stdout);
    if(method == 0){
        // CLASSIC KERNEL LAUNCH
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block >>> (ad, N, N, 0);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else if(method == 1){
        // DYNAMIC PARALLELISM
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    else{
        // THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        cudaEventRecord(start, 0);
        kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
        kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
        kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
        cudaDeviceSynchronize();
        cudaEventRecord(stop, 0);
    }
    printf("done\n"); fflush(stdout);


    printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
    cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
    printf("done\n"); fflush(stdout);

    printf("checking result.................."); fflush(stdout);
    for(int i=0; i<N; ++i){
        if(ah[i] != 1){
            fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
            exit(EXIT_FAILURE);
        }
    }
    printf("done\n"); fflush(stdout);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&rtime, start, stop);
    printf("rtime: %f ms\n", rtime); fflush(stdout);
    return EXIT_SUCCESS;
}
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.925632 ms
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu -DUSE_STREAMS
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.673568 ms
$

Although not included in the test output above, according to my testing, this also brings the CUDA dynamic parallelism (CDP) case (1) into "approximate parity" with the non-CDP cases (0, 2). Note that we can shave about 1 ms (!) off the above time by forgoing the call to cudaGetLastError() in the parent kernel (which I added to your code).
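One way to keep the device-side error check for debugging without paying for it in timed runs is to compile it in only on demand. A sketch of that (the DEBUG_CDP macro name is made up here; kernel_simple and BLOCKSIZE are as in the listing above):

__global__ void kernel_parent(int *a, int n, int N){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid == 0){
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
        kernel_simple<<< grid, block >>> (a, n, N, n);
        kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#ifdef DEBUG_CDP
        // Device-side launch check, compiled in with: nvcc -DDEBUG_CDP ...
        // In the measurements above, this call cost roughly 1 ms.
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
#endif
    }

    a[tid] += 1;
}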

Avar answered 22/7, 2017 at 1:54 Comment(4)
Many thanks. The time went down to be competitive with the other methods; I also removed the error checking to get a faster time. Now, should we accept this difference in time, i.e., 3.18 ms (CDP) vs 2.15 ms (others), as part of the CDP overhead?Orontes
When I tested on a GTX 960 using CUDA 9 EA or CUDA 7.5, the difference in time between the 3 cases was smaller than you are indicating, once I removed the extra error check that I mentioned. I witnessed 0: 4.8 ms, 1: 5.3 ms, 2: 4.7 ms (no difference in performance between CUDA 7.5 and CUDA 9). On CUDA 8 with a Pascal Titan X, I witnessed 0: 1.08 ms, 1: 1.35 ms, 2: 1.08 ms. Yes, I expect child kernel launches to have some overhead, and for a problem as trivially simple as this one, it is clearly more efficient not to use CDP. (All my tests were on Linux.)Avar
I see. Regarding the example, yes, it was too simple. I will make a recursive CDP version of this problem and see whether there is too much overhead or not, but that is out of scope for this question; if so, I will post a separate question. Thanks again.Orontes
I noticed that for a small element count the overhead is quite large (100 µs vs 26 µs on my P5000 card).Gravitation
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

using thrust::host_vector;
using thrust::device_vector;

#define BLOCKSIZE 512

__global__ void child(int* a)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        a[0]++;
}

__global__ void parent(int* a)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        child<<<gridDim, blockDim>>>(a);
}

#define NBLOCKS 1024
#define NTHREADS 1024
#define BENCHCOUNT 1000

template<typename Lambda>
void runBench(Lambda arg, int* rp, const char* name)
{
    // "preheat" the GPU
    for (int i = 0; i < 100; i++)
        child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp);

    cudaEvent_t start, stop;
    float rtime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    for (int i = 0; i < BENCHCOUNT; i++)
        arg();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&rtime, start, stop);

    printf("=== %s ===\n", name);
    printf("time: %f ms\n", rtime/BENCHCOUNT); fflush(stdout);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaDeviceSynchronize();
}

int main(int argc, char **argv)
{
    host_vector<int> hv(1);
    hv[0] = 0xAABBCCDD;
    device_vector<int> dv(1);
    dv = hv;
    int* rp = thrust::raw_pointer_cast(&dv[0]);

    auto benchFun = [&](void) {
        child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp); };
    runBench(benchFun, rp, "Single kernel launch");

    auto benchFun2 = [&](void) {
        for (int j = 0; j < 2; j++)
            child<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp);
    };
    runBench(benchFun2, rp, "2x sequential kernel launch");

    auto benchFunDP = [&](void) {
        parent<<<dim3(NBLOCKS,1,1), dim3(NTHREADS,1,1)>>>(rp); };
    runBench(benchFunDP, rp, "Nested kernel launch");
}

To build/run:

  • Copy/paste code above to dpar.cu
  • nvcc -arch=sm_52 -rdc=true -std=c++11 -lcudadevrt -o dpar dpar.cu
  • ./dpar

On my P5000 laptop it prints:

=== Single kernel launch ===
time: 0.014297 ms
=== 2x sequential kernel launch ===
time: 0.030468 ms
=== Nested kernel launch ===
time: 0.083820 ms

So the overhead is quite large; in my case it looks like about 43 microseconds.

Gravitation answered 30/8, 2017 at 21:2 Comment(2)
Thanks. It seems that DP is competitive only when compared against a batch of kernel calls but not against one kernel call.Orontes
Tried with a Titan X (Pascal, not the Xp) and got: Single kernel launch: 0.007592 ms; 2x sequential kernel launch: 0.016331 ms; Nested kernel launch: 0.047563 ms.Orontes
