I believe the answer given by Xiaojing An is a good solution but there is a minor issue with the negative zero which is mentioned by Robert Crovella in a comment. For example, if *addr = -1.0f and val = -0.0f then after running the atomicMaxFloat function addr will be set to -1.0f but it should be -0.0f, and the atomicMinFloat function will also be wrong in this case. This happens because the >= 0 check returns true for negative 0 but we need it to be false in this case. This case can be fixed by using the signbit function instead:
// Atomic min for floats, built on integer atomics by reinterpreting the
// IEEE-754 bit pattern (assumes neither *addr nor value is NaN).
// Non-negative floats order the same as their bit patterns read as signed
// ints; negative floats order in REVERSE of their bit patterns read as
// unsigned ints, hence the atomicMax on the negative branch.
// signbit() (rather than `value >= 0`) sends -0.0f down the negative branch,
// which is what fixes the negative-zero case described above.
// Returns the previous value of *addr, per the usual CUDA atomic contract.
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
float old;
old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
return old;
}
// Atomic max for floats; mirror image of atomicMinFloat above (same
// bit-pattern trick, same no-NaN assumption).
// Non-negative values use a signed-int atomicMax; negative values use an
// unsigned atomicMin because negative floats order in reverse of their
// unsigned bit patterns. signbit() ensures -0.0f takes the negative branch,
// so max(-0.0f, x) behaves correctly. Returns the previous value of *addr.
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
float old;
old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
return old;
}
Note — I would have posted this as a comment on the answer from Xiaojing An, but I don't have enough reputation.
Of course, it's unclear what will happen with NaNs or infinities in this function, but I think it can be used without worrying about that, assuming you don't need to handle those cases — the negative zero is probably the only really worrying case. It also depends on your willingness to accept this kind of hackery, where we are making assumptions about the way the floating-point values are represented in binary; many people may prefer never to go down this kind of route.
Here's a small test program:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
//these versions fail some of the tests involving negative 0
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
float old;
old = value >= 0 ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
return old;
}
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
float old;
old = value >= 0 ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
__uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
return old;
}
*/
// Atomic min for floats via integer atomics on the IEEE-754 bit pattern
// (assumes no NaNs). signbit() routes -0.0f to the negative branch, which is
// the fix for the negative-zero bug in the commented-out version above.
// Returns the previous contents of *addr.
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    if (signbit(value)) {
        // Negative floats sort in reverse of their unsigned bit patterns,
        // so the float min is the unsigned-integer max.
        unsigned int prev = atomicMax((unsigned int*)addr, __float_as_uint(value));
        return __uint_as_float(prev);
    }
    // Non-negative floats sort like their bit patterns read as signed ints.
    int prev = atomicMin((int*)addr, __float_as_int(value));
    return __int_as_float(prev);
}
// Atomic max for floats; mirror image of atomicMinFloat (same bit-pattern
// trick, same no-NaN assumption). signbit() sends -0.0f down the negative
// branch so max involving negative zero resolves correctly.
// Returns the previous contents of *addr.
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    if (signbit(value)) {
        // Negative floats sort in reverse of their unsigned bit patterns,
        // so the float max is the unsigned-integer min.
        unsigned int prev = atomicMin((unsigned int*)addr, __float_as_uint(value));
        return __uint_as_float(prev);
    }
    // Non-negative floats sort like their bit patterns read as signed ints.
    int prev = atomicMax((int*)addr, __float_as_int(value));
    return __int_as_float(prev);
}
// One thread per test slot: thread i folds testValues[i] into both the
// running max at testMaxData[i] and the running min at testMinData[i].
// Launch with any 1-D grid covering numTests threads; excess threads idle.
__global__ void testKernel(float* testMaxData,
                           float* testMinData,
                           const float* testValues,
                           int numTests)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < numTests) {
        const float v = testValues[tid];
        atomicMaxFloat(&testMaxData[tid], v);
        atomicMinFloat(&testMinData[tid], v);
    }
}
// Prints a diagnostic and aborts when a CUDA runtime call fails.
// Aborting (instead of only printing, as before) matters because CUDA errors
// are sticky: once one call fails, every subsequent call fails too, producing
// a cascade of misleading messages and a bogus "all tests passed" run.
void checkCudaErr(cudaError_t cudaStatus)
{
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime error: %s\n", cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }
}
// Host reference for atomicMaxFloat: ordinary float max, except that zeros
// are totally ordered with -0.0f < +0.0f (matching the integer-bit-pattern
// ordering the device functions implement). C's fmax() leaves the result of
// fmax(+0.0, -0.0) unspecified, so it cannot serve as the reference here.
static float refMax(float a, float b)
{
    if (a == b) return signbit(a) ? b : a; // 0/-0 tie: prefer +0.0f
    return a > b ? a : b;
}

// Host reference for atomicMinFloat; zero tie resolves to -0.0f.
static float refMin(float a, float b)
{
    if (a == b) return signbit(a) ? a : b; // 0/-0 tie: prefer -0.0f
    return a < b ? a : b;
}

// Bitwise float equality. operator== treats -0.0f and +0.0f as equal, so the
// original check `results != expected` could NEVER catch a sign-of-zero
// mismatch -- the very bug this test program exists to demonstrate.
static int sameBits(float a, float b)
{
    return memcmp(&a, &b, sizeof(float)) == 0;
}

// Exercises atomicMinFloat/atomicMaxFloat on all ordered pairs drawn from a
// small set of values including both zeros. Exit status is 0 on success,
// nonzero if any pair produced the wrong (bitwise) result.
int main()
{
    const int numValues = 6;
    const int numTests = numValues * numValues;
    float testData[numValues] = { 0.0f, -0.0f, 1.0f, -1.0f, 200.0f, -200.0f };
    float testValuesMinMaxHost[numTests];
    float testValuesHost[numTests];
    // Test min(a,b) and max(a,b) for every (a,b) pair from testData:
    // slot numValues*i+j starts at testData[i] and folds in testData[j].
    for (int i = 0; i < numValues; ++i)
    {
        for (int j = 0; j < numValues; ++j)
        {
            testValuesMinMaxHost[numValues * i + j] = testData[i];
            testValuesHost[numValues * i + j] = testData[j];
        }
    }
    float* devTestMax = 0;
    float* devTestMin = 0;
    float* devTestValues = 0;
    checkCudaErr(cudaSetDevice(0));
    checkCudaErr(cudaMalloc((void**)&devTestMax, numTests * sizeof(float)));
    checkCudaErr(cudaMalloc((void**)&devTestMin, numTests * sizeof(float)));
    checkCudaErr(cudaMalloc((void**)&devTestValues, numTests * sizeof(float)));
    checkCudaErr(cudaMemcpy(devTestMax, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(devTestMin, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(devTestValues, testValuesHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    int blockSize = 128;
    testKernel << < (numTests + (blockSize - 1)) / blockSize, blockSize >> > (devTestMax, devTestMin, devTestValues, numTests);
    checkCudaErr(cudaGetLastError());      // launch-configuration errors
    checkCudaErr(cudaDeviceSynchronize()); // in-kernel execution errors
    float resultsMin[numTests];
    float resultsMax[numTests];
    checkCudaErr(cudaMemcpy(resultsMin, devTestMin, numTests * sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaErr(cudaMemcpy(resultsMax, devTestMax, numTests * sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaErr(cudaFree(devTestMax));
    checkCudaErr(cudaFree(devTestMin));
    checkCudaErr(cudaFree(devTestValues));
    int fail = 0;
    for (int i = 0; i < numTests; ++i)
    {
        // Compare bit patterns, not values, so +0.0f vs -0.0f counts as a
        // failure (the original != comparison silently passed those cases).
        float expectedMax = refMax(testValuesMinMaxHost[i], testValuesHost[i]);
        if (!sameBits(resultsMax[i], expectedMax))
        {
            printf("fail, expected %f, got %f from max(%f, %f)\n",
                expectedMax,
                resultsMax[i],
                testValuesMinMaxHost[i],
                testValuesHost[i]);
            fail = 1;
        }
        float expectedMin = refMin(testValuesMinMaxHost[i], testValuesHost[i]);
        if (!sameBits(resultsMin[i], expectedMin))
        {
            printf("fail, expected %f, got %f from min(%f, %f)\n",
                expectedMin,
                resultsMin[i],
                testValuesMinMaxHost[i],
                testValuesHost[i]);
            fail = 1;
        }
    }
    if (fail == 0)
    {
        printf("all tests passed\n");
    }
    // Report failure through the exit status (the original always returned 0).
    return fail;
}