If you'd like to take repeated elements into account, you can use
`collections.Counter`, which I would imagine is relatively quick since it's
just a `dict` subclass under the hood:
from collections import Counter
def jaccard_repeats(a, b):
    """Return the Jaccard similarity of two iterables, counting repeats.

    Each input is treated as a multiset (bag): an element that occurs k
    times counts k times. The result is ``|A & B| / |A | B|``, where the
    intersection uses the minimum and the union the maximum multiplicity
    of every element.

    Args:
        a: Iterable of hashable elements (lists, tuples, generators, ...).
        b: Iterable of hashable elements.

    Returns:
        A float in ``[0.0, 1.0]``: 1.0 for identical multisets and 0.0 for
        disjoint ones. Two empty inputs are treated as identical (1.0).
    """
    count_a = Counter(a)
    count_b = Counter(b)
    # Counter's ``&`` keeps the minimum count per element and ``|`` the
    # maximum, i.e. the multiset intersection and union respectively.
    shared = sum((count_a & count_b).values())
    total = sum((count_a | count_b).values())
    if not total:
        # Both inputs are empty; avoid 0/0 and report them as identical.
        return 1.0
    return shared / total


list1 = ['dog', 'cat', 'rat', 'cat']
list2 = ['dog', 'cat', 'rat']
list3 = ['dog', 'cat', 'mouse']

jaccard_repeats(list1, list3)
# >>> 0.4
jaccard_repeats(list1, list2)
# >>> 0.75
jaccard_repeats(list2, list3)
# >>> 0.5