For everybody who is still struggling,
here is a solution that is much faster than the `with gil:` approach.
Nowadays (7 years later), it is possible to cimport `atomic` (from `libcpp.atomic`) and `openmp` directly.
It took me a while to figure that out, since there is nothing about it in the official documentation.
The atomic solution
from libcpp.atomic cimport atomic
from cython.parallel cimport prange
cpdef parallel_loop():
    """Count the multiples of 1000 in ``range(100000)`` from a parallel loop.

    Every ``prange`` iteration whose index is divisible by 1000 bumps one
    shared ``atomic[int]`` via ``fetch_add``, so no lock and no GIL are
    needed inside the loop.

    Returns:
        The final counter value, converted to a Python int.
    """
    cdef Py_ssize_t n_items = 100000
    cdef Py_ssize_t idx
    cdef int total = 0
    # The counter is allocated with ``new`` and only ever reached through
    # this pointer; the loop body never reassigns the pointer itself.
    cdef atomic[int] *hits = new atomic[int](0)
    try:
        for idx in prange(n_items, nogil=True):
            if idx % 1000 == 0:
                hits.fetch_add(1)
        # Read the result once, after the parallel loop has finished.
        total = hits.load()
    finally:
        # Free the counter even if something above raised.
        del hits
    return total
# Run the atomic-counter demo once and show its result. With 100000
# iterations and one hit per 1000 indices, the expected output is 100.
f=parallel_loop()
print(f)
The openmp solution
Most of the time, you want to do more than count positive results.
In this case you can use openmp.
Here is a working example that locates RGB colors in an image.
The pyx file:
from cython.parallel cimport prange
cimport openmp
import numpy as np
cimport numpy as np
import cython
cimport cython
cpdef void searchforcolor(
    unsigned char[:] pic,
    unsigned char[::1] colors,
    Py_ssize_t[:, ::1] results,
    Py_ssize_t[::1] countervar,
    Py_ssize_t width,
    Py_ssize_t totallengthpic,
    Py_ssize_t totallengthcolor,
    int cpus,
):
    """Record every position in ``pic`` whose 3-byte value equals a color.

    ``colors`` and ``pic`` are both walked in steps of three bytes. For each
    match, one row is written to ``results`` at index ``countervar[0]`` and
    ``countervar[0]`` is then incremented. The row layout is::

        column 0: (j // 3) % width    # position within a row of pixels (x)
        column 1: (j // 3) // width   # row of pixels (y)
        column 2: colors[i + 2]
        column 3: colors[i + 1]
        column 4: colors[i]

    The x/y reading assumes ``pic`` is a flattened row-major image with
    ``width`` pixels per row -- confirm against the caller.

    Args:
        pic: Flat byte buffer to search.
        colors: Flat, contiguous byte buffer of 3-byte colors.
        results: Output table with at least 5 columns; it must have enough
            rows for every match (no capacity check is performed here).
        countervar: One-element buffer holding the next free row index; on
            return it holds the number of rows written since its start value.
        width: Number of pixels per image row, used to split the pixel index.
        totallengthpic: Exclusive upper bound for start offsets into ``pic``.
        totallengthcolor: Exclusive upper bound for start offsets into
            ``colors``.
        cpus: Thread count. Values below 1 select
            ``openmp.omp_get_max_threads()``; exactly 1 runs the serial loop.

    In the multi-threaded branch the order of the rows depends on which
    thread takes the lock first, so it may differ between runs.
    """
    cdef:
        Py_ssize_t i, j
        unsigned char r, g, b
        openmp.omp_lock_t locker
    if cpus < 1:
        cpus = openmp.omp_get_max_threads()
    if cpus > 1:
        openmp.omp_set_num_threads(cpus)
        openmp.omp_init_lock(&locker)
        # NOTE: prange's ``use_threads_if=cpus > 1`` should make the separate
        # serial branch below unnecessary, but the original author reported
        # that it did not work.
        for i in prange(0, totallengthcolor, 3, nogil=True):
            r = colors[i]
            g = colors[i + 1]
            b = colors[i + 2]
            for j in range(0, totallengthpic, 3):
                if (r == pic[j]) and (g == pic[j + 1]) and (b == pic[j + 2]):
                    # The lock makes "write row, then bump the counter" one
                    # indivisible step across threads.
                    openmp.omp_set_lock(&locker)
                    # ``//`` keeps the arithmetic in integers; ``/`` is true
                    # division under language_level 3.
                    results[countervar[0], 1] = (j // 3) // width  # y
                    results[countervar[0], 0] = (j // 3) % width  # x
                    results[countervar[0], 2] = b
                    results[countervar[0], 3] = g
                    results[countervar[0], 4] = r
                    countervar[0] += 1
                    openmp.omp_unset_lock(&locker)
        openmp.omp_destroy_lock(&locker)
    else:
        for i in range(0, totallengthcolor, 3):
            r = colors[i]
            g = colors[i + 1]
            b = colors[i + 2]
            for j in range(0, totallengthpic, 3):
                if (r == pic[j]) and (g == pic[j + 1]) and (b == pic[j + 2]):
                    results[countervar[0], 1] = (j // 3) // width  # y
                    results[countervar[0], 0] = (j // 3) % width  # x
                    # Store the matched color here as well, so the serial
                    # and parallel branches fill the same columns.
                    results[countervar[0], 2] = b
                    results[countervar[0], 3] = g
                    results[countervar[0], 4] = r
                    countervar[0] += 1
The python code:
def search_colors(pic, colors, cpus=-1):
    """Find every pixel of ``pic`` that matches one of ``colors``.

    Prepares the flat buffers expected by ``searchforcolor``, runs it, and
    returns only the rows that were actually filled.

    Args:
        pic: Image array; ``pic.shape[1]`` is taken as the width in pixels.
            Presumably a ``(height, width, 3)`` uint8 array such as the one
            returned by ``cv2.imread`` -- this is not validated here.
        colors: Sequence or array of 3-component colors. It is always
            converted to uint8, whatever its original dtype.
        cpus: Thread count forwarded to ``searchforcolor`` (values below 1
            let it pick the OpenMP maximum).

    Returns:
        A new ``(n_matches, 5)`` integer array with one row per match, as
        written by ``searchforcolor``.
    """
    # Coerce unconditionally: the kernel takes ``unsigned char[::1]`` and
    # rejects an ndarray of any other dtype (e.g. the int64 default of
    # ``np.array([[255, 255, 255]])``).
    rav_colors = np.ascontiguousarray(colors, dtype=np.uint8).ravel()
    # The kernel steps through both buffers three bytes at a time, so
    # ``length - 1`` as an exclusive bound still reaches the last full
    # triple when the length is a multiple of 3. Clamp at 0 so an empty
    # input does not produce a negative bound or array dimension.
    totallengthcolor = max(int(rav_colors.shape[0]) - 1, 0)
    totallengthpic = max(int(pic.size) - 1, 0)
    width = pic.shape[1]
    # np.intp is the NumPy dtype that matches the kernel's Py_ssize_t
    # buffers on every platform (np.int64 only matches on 64-bit builds).
    results = np.zeros((totallengthpic, 5), dtype=np.intp)
    countervar = np.zeros(1, dtype=np.intp)  # shared "next free row" counter
    searchforcolor(
        pic.ravel(),
        rav_colors,
        results,
        countervar,
        width,
        totallengthpic,
        totallengthcolor,
        cpus,
    )
    # Copy the filled rows out so the much larger scratch buffer can be
    # released instead of being kept alive by a view.
    return results[: int(countervar[0])].copy()
# Demo: search one image for a single color, then for a palette of colors.
picpath = r"C:\Users\hansc\Downloads\pexels-alex-andrews-2295744.jpg"
pic = cv2.imread(picpath)
if pic is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising; fail here with a message that names the path.
    raise OSError("cv2.imread could not load image: " + picpath)
# Each color triple is compared byte-by-byte against the image's pixel
# bytes, so it must be given in the image's own channel order.
colors0 = np.array([[255, 255, 255]], dtype=np.uint8)
resus0 = search_colors(pic=pic, colors=colors0)
colors1 = np.array(
    [
        (66, 71, 69),
        (62, 67, 65),
        (144, 155, 153),
        (52, 57, 55),
        (127, 138, 136),
        (53, 58, 56),
        (51, 56, 54),
        (32, 27, 18),
        (24, 17, 8),
    ],
    dtype=np.uint8,
)
resus1 = search_colors(pic=pic, colors=colors1)
print(resus1)
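Important: compile both examples with the OpenMP flag (e.g. `-fopenmp` for GCC/Clang, `/openmp` for MSVC)! The atomic example must additionally be compiled as C++, because `libcpp.atomic` wraps `std::atomic`.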
`with gil`
way, but it was so slow for my `calc_something`
that I did not even mention it. The other options, on the other hand, seem very promising. – Edmond