Why is numba so fast?

Asked 9/12, 2021 at 21:36 Answered 12/1, 2022 at 11:19

I want to write a function which will take an index lefts of shape (N_ROWS,) I want to write a function which will create a matrix out = (N_ROWS, N_COLS) matrix such that out[i, j] = 1 if and only if j >= lefts[i]. A simple example of doing this in a loop is here:

class Looped(Strategy):
    def copy(self, lefts):
        out = np.zeros([N_ROWS, N_COLS])
        for k, l in enumerate(lefts): 
            out[k, l:] = 1
        return out

Now I wanted it to be as fast as possible, so I have different implementations of this function:

The plain python loop
numba with @njit
A pure c++ implementation which I call with ctypes

Here are the results of the average of 100 runs:

Looped took 0.0011599776260009093
Numba took 8.886413300206186e-05
CPP took 0.00013200821400096175

So numba is about 1.5 times than the next fastest implementation which is the c++ one. My question is why?

I have heard in similar questions cython was slower because it wasn't being compiled with all the optimisation flags set, but the cpp implementation was compiled with -O3 is that enough for me to have all the possible optimisations the compiler will give me?
I do not fully understand how to hand the numpy array to c++, am I inadvertently making a copy of the data here?

# numba implementation

@njit
def numba_copy(lefts):
    out = np.zeros((N_ROWS, N_COLS), dtype=np.float32)
    for k, l in enumerate(lefts): 
        out[k, l:] = 1.
    return out

    
class Numba(Strategy):
    def __init__(self) -> None:
        # avoid compilation time when timing 
        numba_copy(np.array([1]))

    def copy(self, lefts):
        return numba_copy(lefts)


// array copy cpp

extern "C" void copy(const long *lefts,  float *outdatav, int n_rows, int n_cols) 
{   
    for (int i = 0; i < n_rows; i++) {
        for (int j = lefts[i]; j < n_cols; j++){
            outdatav[i*n_cols + j] = 1.;
        }
    }
}

// compiled to a .so using g++ -O3 -shared -o array_copy.so array_copy.cpp

# using cpp implementation

class CPP(Strategy):

    def __init__(self) -> None:
        lib = ctypes.cdll.LoadLibrary("./array_copy.so")
        fun = lib.copy
        fun.restype = None
        fun.argtypes = [
            ndpointer(ctypes.c_long, flags="C_CONTIGUOUS"),
            ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),
            ctypes.c_long,
            ctypes.c_long,
            ]
        self.fun = fun

    def copy(self, lefts):
        outdata = np.zeros((N_ROWS, N_COLS), dtype=np.float32, )
        self.fun(lefts, outdata, N_ROWS, N_COLS)
        return outdata

The full code with the timing etc:

import time
import ctypes
from itertools import combinations

import numpy as np
from numpy.ctypeslib import ndpointer
from numba import njit


N_ROWS = 1000
N_COLS = 1000


class Strategy:

    def copy(self, lefts):
        raise NotImplementedError

    def __call__(self, lefts):
        s = time.perf_counter()
        n = 1000
        for _ in range(n):
            out = self.copy(lefts)
        print(f"{type(self).__name__} took {(time.perf_counter() - s)/n}")
        return out


class Looped(Strategy):
    def copy(self, lefts):
        out = np.zeros([N_ROWS, N_COLS])
        for k, l in enumerate(lefts): 
            out[k, l:] = 1
        return out


@njit
def numba_copy(lefts):
    out = np.zeros((N_ROWS, N_COLS), dtype=np.float32)
    for k, l in enumerate(lefts): 
        out[k, l:] = 1.
    return out


class Numba(Strategy):
    def __init__(self) -> None:
        numba_copy(np.array([1]))

    def copy(self, lefts):
        return numba_copy(lefts)


class CPP(Strategy):

    def __init__(self) -> None:
        lib = ctypes.cdll.LoadLibrary("./array_copy.so")
        fun = lib.copy
        fun.restype = None
        fun.argtypes = [
            ndpointer(ctypes.c_long, flags="C_CONTIGUOUS"),
            ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),
            ctypes.c_long,
            ctypes.c_long,
            ]
        self.fun = fun

    def copy(self, lefts):
        outdata = np.zeros((N_ROWS, N_COLS), dtype=np.float32, )
        self.fun(lefts, outdata, N_ROWS, N_COLS)
        return outdata


def copy_over(lefts):
    strategies = [Looped(), Numba(), CPP()]

    outs = []
    for strategy in strategies:
        o = strategy(lefts)
        outs.append(o)

    for s_0, s_1 in combinations(outs, 2):
        for a, b in zip(s_0, s_1):
            np.testing.assert_allclose(a, b)
    

if __name__ == "__main__":
    copy_over(np.random.randint(0, N_COLS, size=N_ROWS))

Dwarfism answered 9/12, 2021 at 21:36 Comment(6)

Wow, numba is so awesome if it makes your Python faster than hand-written C++!! – Tantalizing 9/12, 2021 at 21:40

Let's be honest -- you are comparing a package that has probably been highly optimized by very intelligent programmers over the months, maybe years, to your attempt of basically a double-nested loop. It's not a doubt who would win that race. – Ruby 9/12, 2021 at 21:43

some basic optimisations plus the right compiler flags might improve the c++ performance: godbolt.org/z/Kz3MWvPEd – Schleicher 9/12, 2021 at 21:54

@AlanBirtles this gets me down to 1.28 times better – Dwarfism 9/12, 2021 at 22:3

Numba actually has an advantage over the C++ code: the N_ROWS and N_COLS are hard-coded constants for Numba while they are variables in c++. This might allow it to unroll some loops for example – Jamnis 9/12, 2021 at 22:7

There are many more optimisations to do, you can unroll loops and use simd intrinsics. Making as much as possible constants will help out the optimiser too – Schleicher 9/12, 2021 at 23:21

Numba currently uses LLVM-Lite to compile the code efficiently to a binary (after the Python code has been translated to an LLVM intermediate representation). The code is optimized like a C++ code compiled with Clang with the flags -O3 and -march=native. This last parameter is very important as it enables LLVM to use wider SIMD instructions on relatively-recent x86-64 processors: AVX and AVX2 (possibly AVX512 for very recent Intel processors). Otherwise, by default, Clang and GCC use only the SSE/SSE2 instructions (because of backward compatibility).

Another difference come from the comparison between GCC and the LLVM code from Numba. Clang/LLVM tends to aggressively unroll the loops while GCC often don't. This has a significant performance impact on the resulting program. In fact, you can see that the generated assembly code from Clang:

With Clang (128 items per loops):

.LBB0_7:
        vmovups ymmword ptr [r9 + 4*r8 - 480], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 448], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 416], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 384], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 352], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 320], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 288], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 256], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 224], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 192], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 160], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 128], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 96], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 64], ymm0
        vmovups ymmword ptr [r9 + 4*r8 - 32], ymm0
        vmovups ymmword ptr [r9 + 4*r8], ymm0
        sub     r8, -128
        add     rbp, 4
        jne     .LBB0_7

With GCC (8 items per loops):

.L5:
        mov     rdx, rax
        vmovups YMMWORD PTR [rax], ymm0
        add     rax, 32
        cmp     rdx, rcx
        jne     .L5

Thus, to be fair, you need to compare the Numba code with the C++ code compiled with Clang and the above optimization flags.

Note that regarding your needs and the size of your last-level processor cache, you can write a faster platform-specific C++ code using non-temporal stores (NT-stores). NT-stores tell to the processor not to store the array in its cache. Writing data using NT-stores is faster to write huge arrays in RAM, but this can slower when you read the stored array after the copy if the array could fit in the cache (since the array will have to be reloaded from the RAM). In your case (4 MiB array), this is unclear either this would be faster.

Griffon answered 9/12, 2021 at 23:26 Comment(1)

If anyone is using MSVC compiler, the flags are different: /O2 for optimization and regarding architecture /arch:AVX2 or /arch:AVX512. This is the full list of compiler options – Colombi 3/6, 2022 at 19:49

Combining all of the suggestions from other answers/comments I was able to do better than Numba with the following:

Using cython + memoryiews (there was some overhead somewhere using ctypes)
Optimising the cpp implementation
Changing the cython compiler to clang and setting -march=skylake

Now I have

CPP took 9.407872100018721e-05
Numba took 9.336918499957392e-05
Cythonised took 9.22323310005595e-05

// array_copy.cpp

#include "array.h" 

const int n_rows = 1000;
const int n_cols = 1000;

void copy(const long *lefts,  float *outdatav) 
{   
    const float one = 1.;
    for (int i = 0; i < n_rows; i++) {
        const int l = lefts[i];
        float* start = outdatav + i*n_cols + l;
        std::fill(start, start + n_cols - l, one);
    }
}

# setup.py
import os

os.environ["CXX"] = "clang"
os.environ["CC"] = "clang"

from setuptools import setup, Extension
from Cython.Build import cythonize

ex = Extension(
    "array_copy_cython", 
    ["./cpp_ext/array_copy_ext.pyx", "./cpp_ext/array.cpp" ],
    include_dirs=["./cpp_ext"],
    extra_compile_args=["-march=skylake"],
    language="c++")
    

setup(
    name='copy',
    ext_modules=cythonize(ex),
    zip_safe=False,
)

# array_copy_ext.pyx

cdef extern from "array.h":
    void copy(const long* lefts,  float* outdatav)

cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def copy_array(const long[:] lefts, float[:,:] outdatav):
    copy(&lefts[0], &outdatav[0][0])
    return outdatav

Dwarfism answered 12/1, 2022 at 11:19 Comment(0)

Recommended topics

Hot tags