I'm implementing a BK-Tree in Cython.
For one million items, the search time is too long! It's ~30 seconds :(
Here is my Cython code:
# -*- coding: UTF-8 -*-
from itertools import imap
from PIL import Image
DEF MAX_TREE_POOL = 10000
cdef extern from "distances.h":
int hamming_distance(char *a, char *b)
enum: HASH_BITS
cdef findInTree(Node parent, Item item, int threshold):
cdef int d
cdef int i = 0
cdef Node child
cdef object childrens
cdef object results = []
cdef object extends = results.extend
if parent:
d = hamming_distance(item.hash, parent.item.hash)
childrens = parent.childrens.get
if d <= threshold:
results.append((d, parent.item))
for i in xrange(max(0, d - threshold), d + threshold + 1):
child = childrens(i)
if child:
extends(findInTree(child, item, threshold))
return results
cdef class Item:
cdef public unsigned int id
cdef public object hash
def __init__(Item self, unsigned int id, object hash):
assert id > 0 and len(hash) == HASH_BITS
self.id = id
self.hash = hash
def __str__(Item self):
return '<Item {0}>'.format(self.id)
def __repr__(Item self):
return '<Item #{0} object at 0x{1}>'.format(self.id, id(self))
cdef class Node:
cdef readonly Item item
cdef readonly dict childrens
def __cinit__(Node self, Item item):
self.item = item
self.childrens = {}
def __repr__(Item self):
return '<Node object at 0x{0} item {1} childrens {2}>'.format(id(self), repr(self.item), repr(self.childrens))
cdef class BKTree:
cdef readonly Node tree
cdef readonly unsigned int count
def __cinit__(BKTree self):
self.count = 0
def addItem(BKTree self, Item item):
cdef int w
cdef int d
cdef object a
cdef Node n
cdef Node c
if not self.tree:
self.tree = Node(item)
else:
w = 1
c = self.tree
a = item.hash
while w:
d = hamming_distance(a, c.item.hash)
n = c.childrens.get(d)
if n is None:
c.childrens[d] = Node(item)
# Break
w = 0
else:
c = c.childrens[d]
self.count += 1
# Success, return
return self.count
def query(BKTree self, Item item, int threshold):
return findInTree(self.tree, item, threshold)
cdef class BKTreePool:
cdef list pool
cdef readonly unsigned int count
cdef BKTree tree
def __cinit__(BKTreePool self):
self.pool = []
self.rotate()
def addItem(BKTreePool self, Item item):
if self.tree.count >= MAX_TREE_POOL:
self.rotate()
try:
self.tree.addItem(item)
self.count += 1
finally:
return self.count
def query(BKTreePool self, Item item, int threshold):
cdef BKTree tree
cdef list results
results = []
for tree in self.pool:
results.extend(tree.query(item, threshold))
return results
cdef rotate(BKTreePool self):
self.pool.append(BKTree())
self.tree = self.pool[-1]
distances.h
#ifndef DISTANCES_H
#define DISTANCES_H 1
#define HASH_BITS 16 * 16
static int hamming_distance(char *a, char *b);
// static int default_distance(char *a, char *b);
static int hamming_distance(char *a, char *b) {
unsigned int distance = 0;
int i;
for (i = 0; i <= HASH_BITS; i++) {
if (a[i] != b[i]) {
distance++;
}
}
return distance;
}
#endif
Example:
tree = BKTreePool()
tree.addItem(Item(1, '10' * 256))
tree.addItem(Item(1, '10' * 256))
....
tree.query(Item(1, '10' * 256), 5)
This tree begins search duplicate images by 256 bits hash.
How can I optimize this findInTree
function?
findInTree
recursively but use an iterative search instead, it's usually much faster. Also, implementing this in C will probably be still much faster than Cython. But the whole idea seems a bit strange to me, I have to say. calculating hamming distance on a 256 byte string is expensive. – Af'10'
* 256 is a string of length 512 BYTES and you are calling it a 256-bit "hash"??? – Del