How do I use atomicMax on floating-point values in CUDA?

I have used atomicMax() to find the maximum value in a CUDA kernel:

__global__ void global_max(float* values, float* gl_max)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    float val = values[i];

    atomicMax(gl_max, val);
}

It is throwing the following error:

error: no instance of overloaded function "atomicMax" matches the argument list

The argument types are: (float *, float).

Preoccupied answered 1/7, 2013 at 7:11 Comment(1)
It's not supported, but you can create your own. – Sausage

The short answer is that you can't. As you can see from the atomic function documentation, only integer arguments are supported for atomicMax, and 64-bit integer arguments are only supported on devices of compute capability 3.5 and higher.

Patriotism answered 1/7, 2013 at 7:29 Comment(0)

atomicMax is not available for float types, but you can implement it via atomicCAS:

__device__ static float atomicMax(float* address, float val)
{
    // Reinterpret the float location so the integer atomicCAS can be used.
    int* address_as_i = (int*) address;
    int old = *address_as_i, assumed;
    do {
        assumed = old;
        // Attempt to store max(val, current value); if another thread
        // modified *address in the meantime, atomicCAS fails and we retry.
        old = ::atomicCAS(address_as_i, assumed,
            __float_as_int(::fmaxf(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);  // previous value, like the built-in atomics
}
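
With this overload in scope, the question's kernel compiles and works. A minimal usage sketch (the n parameter and bounds check are additions; *gl_max is assumed to be initialized on the host, e.g. to -FLT_MAX, before the launch):

__global__ void global_max(float* values, float* gl_max, int n)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)  // guard against out-of-range threads
        atomicMax(gl_max, values[i]);  // resolves to the float overload above
}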
Outsize answered 1/7, 2013 at 9:25 Comment(3)
To get a float atomicMin version, just replace fmaxf with fminf. – Augmented
I'm not sure this is a good solution: the arguments of atomicCAS are not evaluated in an "atomic" way, so you may have a race condition when evaluating ::fmaxf(val, ...). I tried this implementation and it produced erroneous outputs, I suspect due to this "non-atomic" ::fmaxf. Xiaojing An's solution seems to work better. – Graeae
@Graeae Why would there be a race condition for fmaxf? Both arguments are local variables, and the while loop only exits once the value at the address is the maximum of what was stored there and val. That said, the other solution may be a better choice, as it uses a single atomic instruction and is most likely faster. – Tub

Based on the CUDA Toolkit Documentation v9.2.148, there are no atomicMax/atomicMin operations for float. But we can implement them by mixing atomicMax and atomicMin with signed and unsigned integer casts!

This is a float atomic min:

__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    // Non-negative floats order like signed ints; negative floats order in
    // reverse as unsigned ints, so a float min over negatives is an unsigned max.
    float old;
    old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

This is a float atomic max:

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    // The mirror image of atomicMinFloat above.
    float old;
    old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}
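
To see why the casts are safe, here is a minimal host-side sketch (an addition, not part of the original answer), assuming IEEE-754 single precision; asInt/asUint are stand-ins for __float_as_int/__float_as_uint:

#include <cstdio>
#include <cstring>

// Host-side stand-ins for __float_as_int / __float_as_uint.
static int asInt(float f) { int i; std::memcpy(&i, &f, sizeof i); return i; }
static unsigned asUint(float f) { unsigned u; std::memcpy(&u, &f, sizeof u); return u; }

int main() {
    // Non-negative floats: signed-int bit patterns sort like the floats,
    // and every negative float's pattern (sign bit set) sorts below them.
    printf("%d\n", asInt(1.5f) < asInt(200.0f));      // prints 1
    printf("%d\n", asInt(-1.0f) < asInt(0.0f));       // prints 1

    // Negative floats: the more negative the float, the LARGER its unsigned
    // bit pattern, so a float max over negatives is an unsigned min.
    printf("%d\n", asUint(-200.0f) > asUint(-1.0f));  // prints 1
    return 0;
}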
Ophir answered 27/7, 2018 at 0:41 Comment(6)
Should there also be a test on *addr, to check whether it is negative? What happens if the signs of value and *addr differ? – Graeae
Knowing the sign of value is enough. For example, in atomicMinFloat with value >= 0, we use atomicMin on signed ints: if *addr < 0, *addr is kept (it is already the smaller value); if *addr >= 0, the signed comparison yields the min of *addr and value. – Ophir
I don't believe this handles the float negative-zero case correctly. You might be able to trivially fix that by adding zero to value before using it. – Myxomycete
It seems to me that, for atomicMinFloat: if -0 and 0 are compared, it will set *addr to -0; otherwise it is correct. Is that first case the one you think is incorrect? If not, I would really appreciate clarification with a counterexample. :) – Ophir
This answer is about 10x faster than Vinograd47's answer on modern GPUs, because int32 atomicMax/atomicMin have hardware support that makes them much faster than atomicCAS under contention. – Heteronym
See timothygiraffe's answer for a correction (using the signbit function instead of comparing with zero). Note that the same can be done for double-precision floating point. @RobertCrovella – Charland

You need to map the floats to order-preserving ints to use the integer atomicMax!

__device__ __forceinline__ int floatToOrderedInt(float floatVal) {
    int intVal = __float_as_int(floatVal);
    // Flip the magnitude bits of negative floats so the resulting ints
    // order exactly like the original floats.
    return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
}

__device__ __forceinline__ float orderedIntToFloat(int intVal) {
    return __int_as_float((intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF);
}
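
A minimal sketch of the resulting pattern (kernel and variable names are illustrative, not from the answer): keep the running maximum encoded as an int for the whole reduction and decode once at the end:

__global__ void orderedMaxKernel(const float* values, int* encodedMax, int n)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        atomicMax(encodedMax, floatToOrderedInt(values[i]));  // plain int atomicMax
}

// Host side: initialize *encodedMax to the encoding of a very small value
// (e.g. -FLT_MAX, bit-cast and XOR-ed exactly as in floatToOrderedInt),
// launch, copy the int back, and decode it with the inverse mapping.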
Splenius answered 23/6, 2015 at 18:11 Comment(2)
How can you map *address this way? The mapping is not atomic. – Graeae
@Graeae You just store the ordered int at *address. Always encode before the atomicMax and decode in the threads afterwards. – Spheroidal

I believe the answer given by Xiaojing An is a good solution, but there is a minor issue with negative zero, mentioned by Robert Crovella in a comment. For example, if *addr = -1.0f and val = -0.0f, then after running the atomicMaxFloat function *addr will be set to -1.0f, but it should be -0.0f; the atomicMinFloat function is wrong in this case as well. This happens because the >= 0 check returns true for negative zero, but here we need it to be false. The case can be fixed by using the signbit function instead:

__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

Note: I would have posted this as a comment on the answer from Xiaojing An but don't have enough reputation.

Of course, it's unclear what will happen with NaNs or Infs in these functions, but I think they can be used without worrying about that, assuming you don't need to handle those cases; negative zero is probably the only really worrying one. It also depends on your willingness to accept this kind of hackery, where we make assumptions about how floating-point values are represented in binary; many people may prefer never to go down that route.

Here's a small test program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <math.h>

/*
//these versions fail some of the tests involving negative 0
__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    float old;
    old = value >= 0 ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    float old;
    old = value >= 0 ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}
*/


__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
    float old;
    old = !signbit(value) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
        __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));

    return old;
}

__global__ void testKernel(float* testMaxData, 
                           float* testMinData,
                           const float* testValues, 
                           int numTests)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= numTests)
    {
        return;
    }
    float val = testValues[index];
    atomicMaxFloat(testMaxData + index, val);
    atomicMinFloat(testMinData + index, val);
}

void checkCudaErr(cudaError_t cudaStatus)
{
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime error: %s\n", cudaGetErrorString(cudaStatus));
    }
}

int main()
{
    const int numValues = 6;
    const int numTests = numValues * numValues;
    float testData[numValues] = { 0.0f, -0.0f, 1.0f, -1.0f, 200.0f, -200.0f };
    float testValuesMinMaxHost[numTests];
    float testValuesHost[numTests];

    for (int i = 0; i < numValues; ++i)
    {
        for (int j = 0; j < numValues; ++j)
        {
            /*
            We will test the values of min(a,b) and max(a,b) for
            all values of a and b in the testData array.
            */
            testValuesMinMaxHost[numValues * i + j] = testData[i];
            testValuesHost[numValues * i + j] = testData[j];
        }
    }
  
    float* devTestMax = 0;
    float* devTestMin = 0;
    float* devTestValues = 0;

    checkCudaErr(cudaSetDevice(0));
    checkCudaErr(cudaMalloc((void**)&devTestMax, numTests * sizeof(float)));
    checkCudaErr(cudaMalloc((void**)&devTestMin, numTests * sizeof(float)));
    checkCudaErr(cudaMalloc((void**)&devTestValues, numTests * sizeof(float)));

    checkCudaErr(cudaMemcpy(devTestMax, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(devTestMin, testValuesMinMaxHost, numTests * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErr(cudaMemcpy(devTestValues, testValuesHost, numTests * sizeof(float), cudaMemcpyHostToDevice));

    int blockSize = 128;
    testKernel<<<(numTests + (blockSize - 1)) / blockSize, blockSize>>>(devTestMax, devTestMin, devTestValues, numTests);
    checkCudaErr(cudaGetLastError());
    
    float resultsMin[numTests];
    float resultsMax[numTests];

    checkCudaErr(cudaMemcpy(resultsMin, devTestMin, numTests * sizeof(float), cudaMemcpyDeviceToHost));
    checkCudaErr(cudaMemcpy(resultsMax, devTestMax, numTests * sizeof(float), cudaMemcpyDeviceToHost));

    checkCudaErr(cudaFree(devTestMax));
    checkCudaErr(cudaFree(devTestMin));
    checkCudaErr(cudaFree(devTestValues));

    int fail = 0;
    for (int i = 0; i < numTests; ++i)
    {
        float expectedMax = fmax(testValuesMinMaxHost[i], testValuesHost[i]);
        if (resultsMax[i] != expectedMax)
        {
            printf("fail, expected %f, got %f from max(%f, %f)\n",
                   expectedMax,
                   resultsMax[i],
                   testValuesMinMaxHost[i],
                   testValuesHost[i]);
            fail = 1;
        }

        float expectedMin = fmin(testValuesMinMaxHost[i], testValuesHost[i]);
        if (resultsMin[i] != expectedMin)
        {
            printf("fail, expected %f, got %f from min(%f, %f)\n",
                   expectedMin,
                   resultsMin[i],
                   testValuesMinMaxHost[i],
                   testValuesHost[i]);
            fail = 1;
        }
    }

    if (fail == 0)
    {
        printf("all tests passed\n");
    }

    return 0;
}
Demakis answered 1/6, 2022 at 11:55 Comment(1)
This answer needs more votes. It's both correct (for finite numbers) and much faster than the atomicCAS loop solution. This is one case where the ordering relationship between floating-point and integer representations can be exploited (something similar is done in radix-sort implementations for 32-bit and 64-bit floating point). – Charland

This is the syntax for atomicMax:

int atomicMax(int* address, int val);

But there are exceptions, such as atomicAdd, which supports floats.
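
For example, the float overload of atomicAdd can be called directly on devices of compute capability 2.0 and higher (a minimal sketch; the kernel name and bounds check are illustrative):

__global__ void sumKernel(const float* values, float* sum, int n)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        atomicAdd(sum, values[i]);  // float atomicAdd is supported here
}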

Totalitarianism answered 1/7, 2013 at 9:10 Comment(0)
