With the introduction of the cuFFT callback feature, the normalization required by the inverse FFT performed by the cuFFT can be embedded directly within the cufftExecC2C
call by defining the normalization operation as a __device__
function.
Besides the cuFFT User Guide, for the cuFFT callback features, see
CUDA Pro Tip: Use cuFFT Callbacks for Custom Data Processing
Below is an example of implementing the IFFT normalization by cuFFT callback.
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <cufftXt.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See https://mcmap.net/q/1468318/-cufft-error-handling
#ifdef _CUFFT_H_
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
float *scaling_factor = (float*)callerInfo;
float2 output;
output.x = cuCrealf(element);
output.y = cuCimagf(element);
output.x = output.x / scaling_factor[0];
output.y = output.y / scaling_factor[0];
((float2*)dataOut)[offset] = output;
}
__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
/********/
/* MAIN */
/********/
int main() {
const int N = 16;
cufftHandle plan;
float2 *h_input = (float2*)malloc(N*sizeof(float2));
float2 *h_output1 = (float2*)malloc(N*sizeof(float2));
float2 *h_output2 = (float2*)malloc(N*sizeof(float2));
float2 *d_input; gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
float2 *d_output1; gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
float2 *d_output2; gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
float *h_scaling_factor = (float*)malloc(sizeof(float));
h_scaling_factor[0] = 16.0f;
float *d_scaling_factor; gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
for (int i=0; i<N; i++) {
h_input[i].x = 1.0f;
h_input[i].y = 0.f;
}
gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));
cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
cufftCallbackStoreC h_storeCallbackPtr;
gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
cufftSafeCall(cufftDestroy(plan));
gpuErrchk(cudaFree(d_input));
gpuErrchk(cudaFree(d_output1));
gpuErrchk(cudaFree(d_output2));
return 0;
}
EDIT
The "moment" the callback operation is performed is specified by CUFFT_CB_ST_COMPLEX
in the call to cufftXtSetCallback
. Notice that you can then have load and store callbacks with the same cuFFT plan.
CUFFT_INVERSE
operation. – Unexpressed