I have gone through this site. From it I understood that pinned memory allocated with cudaMallocHost gives better performance than memory allocated with cudaMalloc. I then wrote two simple programs and measured their execution times as follows.
Using cudaMallocHost:
#include <stdio.h>
#include <cuda.h>
// Device kernel: squares the first N floats of `a` in place.
// Each thread handles at most one element, selected by its flattened 1D
// position in the launch; threads positioned at or beyond N do nothing.
__global__ void square_array(float *a, int N)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= N) {
        return;  // surplus thread in the last block
    }
    const float value = a[gid];
    a[gid] = value * value;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host entry point (cudaMallocHost variant).
// Fills a host buffer obtained from cudaMallocHost with 0..N-1, copies it to
// the device, squares every element with square_array, copies the result
// back, prints one "index value" line per element and finally prints the
// clock() interval for the whole run.
// Returns 0 on success and 1 if any allocation, copy or launch failed (in
// which case the results are not printed).
int main(void)
{
    // NOTE(review): the interval starts before any CUDA call and ends after
    // the N result lines have been printed, so it covers far more than the
    // two memory copies.
    const clock_t start = clock();

    const int N = 100000;                   // Number of elements in arrays
    const size_t size = N * sizeof(float);  // Buffer size in bytes
    float *a_h = NULL;                      // Host array
    float *a_d = NULL;                      // Device array

    bool ok = cuda_ok(cudaMallocHost((void **)&a_h, size), "cudaMallocHost") &&
              cuda_ok(cudaMalloc((void **)&a_d, size), "cudaMalloc");

    if (ok) {
        // Initialize host array and copy it to the CUDA device
        for (int i = 0; i < N; i++) {
            a_h[i] = (float)i;
        }
        ok = cuda_ok(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                     "host-to-device copy");
    }

    if (ok) {
        // One thread per element. The block size is a multiple of the warp
        // size so no warp is launched mostly empty; the grid size is rounded
        // up and the kernel's bounds check discards the surplus threads.
        const int block_size = 256;
        const int n_blocks = (N + block_size - 1) / block_size;
        square_array<<<n_blocks, block_size>>>(a_d, N);
        // A launch does not return a status; query it explicitly.
        ok = cuda_ok(cudaGetLastError(), "square_array launch");
    }

    if (ok) {
        // Retrieve result from device and store it in host array
        ok = cuda_ok(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost),
                     "device-to-host copy");
    }

    if (ok) {
        // Print results
        for (int i = 0; i < N; i++) {
            printf("%d %f\n", i, a_h[i]);
        }
    }

    // Cleanup (also reached on the failure paths)
    if (a_h != NULL) {
        cudaFreeHost(a_h);
    }
    if (a_d != NULL) {
        cudaFree(a_d);
    }

    const clock_t finish = clock() - start;
    const double interval = finish / (double)CLOCKS_PER_SEC;
    printf("%f seconds elapsed", interval);
    return ok ? 0 : 1;
}
Using malloc:
#include <stdio.h>
#include <cuda.h>
// Device kernel: replaces each of the first N floats of `a` with its square.
// A thread derives one element position from its block and thread indices
// and leaves memory untouched when that position is not below N.
__global__ void square_array(float *a, int N)
{
    const int pos = threadIdx.x + blockIdx.x * blockDim.x;
    if (pos < N) {
        float *slot = a + pos;
        *slot = (*slot) * (*slot);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool check_cuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host entry point (malloc variant).
// Fills a host buffer obtained from malloc with 0..N-1, copies it to the
// device, squares every element with square_array, copies the result back,
// prints one "index value" line per element and finally prints the clock()
// interval for the whole run.
// Returns 0 on success and 1 if any allocation, copy or launch failed (in
// which case the results are not printed).
int main(void)
{
    // NOTE(review): the interval starts before any CUDA call and ends after
    // the N result lines have been printed, so it covers far more than the
    // two memory copies.
    const clock_t start = clock();

    const int N = 100000;                   // Number of elements in arrays
    const size_t size = N * sizeof(float);  // Buffer size in bytes
    float *a_h = (float *)malloc(size);     // Host array
    float *a_d = NULL;                      // Device array

    bool ok = (a_h != NULL);
    if (!ok) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
    }

    if (ok) {
        ok = check_cuda(cudaMalloc((void **)&a_d, size), "cudaMalloc");
    }

    if (ok) {
        // Initialize host array and copy it to the CUDA device
        for (int i = 0; i < N; i++) {
            a_h[i] = (float)i;
        }
        ok = check_cuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                        "host-to-device copy");
    }

    if (ok) {
        // One thread per element. The block size is a multiple of the warp
        // size so no warp is launched mostly empty; the grid size is rounded
        // up and the kernel's bounds check discards the surplus threads.
        const int block_size = 256;
        const int n_blocks = (N + block_size - 1) / block_size;
        square_array<<<n_blocks, block_size>>>(a_d, N);
        // A launch does not return a status; query it explicitly.
        ok = check_cuda(cudaGetLastError(), "square_array launch");
    }

    if (ok) {
        // Retrieve result from device and store it in host array
        ok = check_cuda(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost),
                        "device-to-host copy");
    }

    if (ok) {
        // Print results
        for (int i = 0; i < N; i++) {
            printf("%d %f\n", i, a_h[i]);
        }
    }

    // Cleanup (also reached on the failure paths)
    free(a_h);  // free(NULL) is a no-op
    if (a_d != NULL) {
        cudaFree(a_d);
    }

    const clock_t finish = clock() - start;
    const double interval = finish / (double)CLOCKS_PER_SEC;
    printf("%f seconds elapsed", interval);
    return ok ? 0 : 1;
}
When I ran both programs, the execution times were almost the same. Is there anything wrong with the implementation? What exactly is the difference in execution between cudaMalloc and cudaMallocHost?
Also, the execution time decreases with each run.