https://en.m.wikipedia.org/wiki/Jaccard_index
And now, some cleaned-up sample code:
def jac(s1, s2):
    """Return the Jaccard index of two collections.

    The index is the size of the intersection divided by the size of the
    union: 1.0 for identical non-empty inputs, 0.0 for disjoint ones.

    Args:
        s1: First collection; a set or any iterable of hashable items.
        s2: Second collection; a set or any iterable of hashable items.

    Returns:
        A float between 0.0 and 1.0. When both inputs are empty the union
        is empty, and 0.0 is returned instead of dividing by zero.
    """
    s1, s2 = set(s1), set(s2)
    len_union = len(s1 | s2)
    if not len_union:
        return 0.0
    # float() forces true division under Python 2 as well as Python 3.
    return float(len(s1 & s2)) / len_union
from itertools import permutations
# Sample data: rater name -> {movie title: rating}.
ratings = {
    'Shane': {'127 Hours': 5.0, 'Avatar': 4.0, 'Nonstop': 5.0},
    'Joe': {
        '127 Hours': 5.0,
        'Taken 3': 4.0,
        'Avatar': 5.0,
        'Nonstop': 3.0,
    },
    'Bob': {'Panic Room': 5.0, 'Nonstop': 5.0},
}
def common_movie(dict0, dict1):
    """Jaccard index over the (movie, rating) pairs of two raters.

    A pair only counts as shared when both the movie and the rating given
    to it are equal in ``dict0`` and ``dict1``.
    """
    return jac(set(dict0.items()), set(dict1.items()))
def movies_and_ratings(dict0, dict1, key_weight=0.3, item_weight=0.7):
    """Blend movie overlap and exact (movie, rating) overlap into one score.

    Args:
        dict0: Movie -> rating mapping for the first rater.
        dict1: Movie -> rating mapping for the second rater.
        key_weight: Weight applied to the Jaccard index of the movie keys
            alone. Defaults to 0.3.
        item_weight: Weight applied to the Jaccard index of the
            (movie, rating) pairs. Defaults to 0.7.

    Returns:
        ``key_weight * key_commonality + item_weight * item_commonality``.
    """
    key_commonality = jac(set(dict0.keys()), set(dict1.keys()))
    item_commonality = jac(set(dict0.items()), set(dict1.items()))
    # A match on the movie alone still earns some proximity, even when the
    # ratings for that movie differ.
    return key_weight * key_commonality + item_weight * item_commonality
def common_movie_ratings(dict0, dict1):
    """Jaccard index of the ratings given to movies both raters have rated.

    Only movies present in both dictionaries are considered.

    NOTE: the ratings are compared as plain sets of values, without regard
    to which movie each rating belongs to, so two raters who use the same
    rating values on different shared movies still score as similar.
    """
    shared = set(dict0.keys()) & set(dict1.keys())
    values0 = set(dict0[movie] for movie in shared)
    values1 = set(dict1[movie] for movie in shared)
    return jac(values0, values1)
# Compare every ordered pair of raters and print each similarity measure.
for pair in permutations(ratings.keys(), 2):
    dict0, dict1 = ratings[pair[0]], ratings[pair[1]]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n %s vs %s" % pair)
    # Make no assumption about the key/value order coming out of a
    # dictionary: sort the items before showing them. sorted() is used
    # because dict.items() has no .sort() method on Python 3.
    li = sorted(dict0.items())
    print(" %s" % (li,))
    li = sorted(dict1.items())
    print(" %s" % (li,))
    print(" common_movie :%s" % common_movie(dict0, dict1))
    print(" movies_and_ratings:%s" % movies_and_ratings(dict0, dict1))
    print(" common_movie_ratings :%s" % common_movie_ratings(dict0, dict1))
The output:
Shane vs Bob
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
[('Nonstop', 5.0), ('Panic Room', 5.0)]
common_movie :0.25
movies_and_ratings:0.25
common_movie_ratings :1.0
Shane vs Joe
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
common_movie :0.166666666667
movies_and_ratings:0.341666666667
common_movie_ratings :0.333333333333
Bob vs Shane
[('Nonstop', 5.0), ('Panic Room', 5.0)]
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
common_movie :0.25
movies_and_ratings:0.25
common_movie_ratings :1.0
Bob vs Joe
[('Nonstop', 5.0), ('Panic Room', 5.0)]
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
common_movie :0.0
movies_and_ratings:0.06
common_movie_ratings :0.0
Joe vs Shane
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
common_movie :0.166666666667
movies_and_ratings:0.341666666667
common_movie_ratings :0.333333333333
Joe vs Bob
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
[('Nonstop', 5.0), ('Panic Room', 5.0)]
common_movie :0.0
movies_and_ratings:0.06
common_movie_ratings :0.0
Taken 3
. Or the actual values? What about multi-level dictionaries? – Ruffi