Here's an answer with just python
+ numpy
, in short:
Cosine:
def cosine_sim(u,v):
return np.dot(u,v) / (sqrt(np.dot(u,u)) * sqrt(np.dot(v,v)))
Ngrams:
def ngrams(sentence, n):
return zip(*[sentence.split()[i:] for i in range(n)])
TF-IDF (it's a little weird but it works):
def tfidf(corpus, vocab):
"""
INPUT:
corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
'sheep', 'this']
OUTPUT:
[[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
[0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
[0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
"""
def termfreq(matrix, doc, term):
try: return matrix[doc][term] / float(sum(matrix[doc].values()))
except ZeroDivisionError: return 0
def inversedocfreq(matrix, term):
try:
return float(len(matrix)) /sum([1 for i,_ in enumerate(matrix) if matrix[i][term] > 0])
except ZeroDivisionError: return 0
matrix = [{k:v for k,v in zip(vocab, i[1])} for i in corpus]
tfidf = defaultdict(dict)
for doc,_ in enumerate(matrix):
for term in matrix[doc]:
tf = termfreq(matrix,doc,term)
idf = inversedocfreq(matrix, term)
tfidf[doc][term] = tf*idf
return [[tfidf[doc][term] for term in vocab] for doc,_ in enumerate(tfidf)]
Here's the long answer with the tests:
import numpy as np
from math import sqrt, log
from itertools import chain, product
from collections import defaultdict
def cosine_sim(u,v):
return np.dot(u,v) / (sqrt(np.dot(u,u)) * sqrt(np.dot(v,v)))
def ngrams(sentence, n):
return zip(*[sentence.split()[i:] for i in range(n)])
def tfidf(corpus, vocab):
"""
INPUT:
corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
'sheep', 'this']
OUTPUT:
[[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
[0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
[0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
"""
def termfreq(matrix, doc, term):
try: return matrix[doc][term] / float(sum(matrix[doc].values()))
except ZeroDivisionError: return 0
def inversedocfreq(matrix, term):
try:
return float(len(matrix)) /sum([1 for i,_ in enumerate(matrix) if matrix[i][term] > 0])
except ZeroDivisionError: return 0
matrix = [{k:v for k,v in zip(vocab, i[1])} for i in corpus]
tfidf = defaultdict(dict)
for doc,_ in enumerate(matrix):
for term in matrix[doc]:
tf = termfreq(matrix,doc,term)
idf = inversedocfreq(matrix, term)
tfidf[doc][term] = tf*idf
return [[tfidf[doc][term] for term in vocab] for doc,_ in enumerate(tfidf)]
def corpus2vectors(corpus):
def vectorize(sentence, vocab):
return [sentence.split().count(i) for i in vocab]
vectorized_corpus = []
vocab = sorted(set(chain(*[i.lower().split() for i in corpus])))
for i in corpus:
vectorized_corpus.append((i, vectorize(i, vocab)))
return vectorized_corpus, vocab
def create_test_corpus():
sent1 = "this is a foo bar"
sent2 = "foo bar bar black sheep"
sent3 = "this is a sentence"
all_sents = [sent1,sent2,sent3]
corpus, vocab = corpus2vectors(all_sents)
return corpus, vocab
def test_cosine():
corpus, vocab = create_test_corpus()
for sentx, senty in product(corpus, corpus):
print sentx[0]
print senty[0]
print "cosine =", cosine_sim(sentx[1], senty[1])
print
def test_ngrams():
corpus, vocab = create_test_corpus()
for sentx in corpus:
print sentx[0]
print ngrams(sentx[0],2)
print ngrams(sentx[0],3)
print
def test_tfidf():
corpus, vocab = create_test_corpus()
print corpus
print vocab
print tfidf(corpus, vocab)
print "Testing cosine..."
test_cosine()
print
print "Testing ngrams..."
test_ngrams()
print
print "Testing tfidf..."
test_tfidf()
print
[out]:
Testing cosine...
this is a foo bar
this is a foo bar
cosine = 1.0
this is a foo bar
foo bar bar black sheep
cosine = 0.507092552837
this is a foo bar
this is a sentence
cosine = 0.67082039325
foo bar bar black sheep
this is a foo bar
cosine = 0.507092552837
foo bar bar black sheep
foo bar bar black sheep
cosine = 1.0
foo bar bar black sheep
this is a sentence
cosine = 0.0
this is a sentence
this is a foo bar
cosine = 0.67082039325
this is a sentence
foo bar bar black sheep
cosine = 0.0
this is a sentence
this is a sentence
cosine = 1.0
Testing ngrams...
this is a foo bar
[('this', 'is'), ('is', 'a'), ('a', 'foo'), ('foo', 'bar')]
[('this', 'is', 'a'), ('is', 'a', 'foo'), ('a', 'foo', 'bar')]
foo bar bar black sheep
[('foo', 'bar'), ('bar', 'bar'), ('bar', 'black'), ('black', 'sheep')]
[('foo', 'bar', 'bar'), ('bar', 'bar', 'black'), ('bar', 'black', 'sheep')]
this is a sentence
[('this', 'is'), ('is', 'a'), ('a', 'sentence')]
[('this', 'is', 'a'), ('is', 'a', 'sentence')]
Testing tfidf...
[('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]), ('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]), ('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
['a', 'bar', 'black', 'foo', 'is', 'sentence', 'sheep', 'this']
[[0.30000000000000004, 0.30000000000000004, 0.0, 0.30000000000000004, 0.30000000000000004, 0.0, 0.0, 0.30000000000000004], [0.0, 0.6000000000000001, 0.6000000000000001, 0.30000000000000004, 0.0, 0.0, 0.6000000000000001, 0.0], [0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]