You can cudaMalloc the largest buffer that fits on your device. Then copy your input data from host to device in chunks of at most that size: process each chunk, copy back the results, and continue with the next chunk.
// Stream a host buffer that is larger than the device buffer through the GPU
// in fixed-size pieces: copy one chunk to the device, run the kernel on it,
// then advance to the next chunk until nothing is left.

// Your input data on host
int hostBufNum = 5600000;
int* hostBuf = ...;

// Assume this is largest device buffer you can allocate
int devBufNum = 1000000;
int* devBuf;
cudaMalloc( &devBuf, sizeof( int ) * devBufNum );

// hostChunk points at the first element not yet sent to the device;
// hostLeft counts how many elements remain from there to the end of hostBuf.
int* hostChunk = hostBuf;
int hostLeft = hostBufNum;
int chunkNum = 0;

// A plain while loop (instead of do/while) skips the copy and the kernel
// launch entirely when the input is empty.
while( hostLeft > 0 )
{
    // Recompute the chunk size on every pass. The final chunk is usually
    // smaller than the device buffer (here: 5 full chunks, then 600000
    // elements); copying a full devBufNum elements for it would read past
    // the end of hostBuf.
    chunkNum = ( hostLeft < devBufNum ) ? hostLeft : devBufNum;

    cudaMemcpy( devBuf, hostChunk, chunkNum * sizeof( int ), cudaMemcpyHostToDevice );
    doSomethingKernel<<< >>>( devBuf, chunkNum );

    // NOTE(review): if doSomethingKernel leaves its results in devBuf, copy
    // them back to the host here (cudaMemcpy with cudaMemcpyDeviceToHost),
    // because the next pass overwrites devBuf with the following chunk.

    hostChunk += chunkNum;
    hostLeft -= chunkNum;
}