I've pieced together the following code from examples I found on the web:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from sklearn.cluster import KMeans
# random
from random import shuffle
# classifier
class LabeledLineSentence(object):
    """Turn line-oriented text files into labelled sentences.

    ``sources`` maps a file path to a label prefix. Every line of every file
    becomes one ``LabeledSentence`` built from the line's whitespace-split
    tokens and a single label of the form ``'<prefix>_<line number>'``.
    """

    def __init__(self, sources):
        """Remember *sources* and check that no prefix is used twice.

        Args:
            sources: Mapping of file path -> label prefix.

        Raises:
            ValueError: If two entries of *sources* share the same prefix.
                (``ValueError`` is an ``Exception`` subclass, so callers that
                catch ``Exception`` keep working.)
        """
        self.sources = sources
        # Filled by to_array(); None means "not materialised yet", which lets
        # sentences_perm() build the list on demand instead of failing with
        # AttributeError.
        self.sentences = None

        # The prefixes (the dict *values*) must be unique; the file paths are
        # dict keys and therefore unique already.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``LabeledSentence`` per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(
                        utils.to_unicode(line).split(),
                        [prefix + '_%s' % item_no],
                    )

    def to_array(self):
        """Read all sources, cache the sentences on ``self.sentences``, return them."""
        # Reuse __iter__ so streaming and materialising can never diverge.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached sentences in place and return that same list.

        The cache is built first via ``to_array()`` if it does not exist yet.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences
# Corpus definition: file path -> prefix used for the per-line document labels.
sources = {'test.txt': 'DOCS'}
sentences = LabeledLineSentence(sources)

model = Doc2Vec(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=8,
)

# Materialise the corpus once; this also fills the cache on `sentences`
# that sentences_perm() shuffles below.
corpus = sentences.to_array()
model.build_vocab(corpus)

# Ten training passes, each over a freshly shuffled ordering of the corpus.
for epoch in range(10):
    shuffled = sentences.sentences_perm()
    model.train(shuffled)

print(model.docvecs)
My test.txt file contains one paragraph per line.
The code runs fine and generates a DocvecsArray for each line of text.
My goal is to get output like this:
cluster 1: [DOC_5,DOC_100,...DOC_N]
cluster 2: [DOC_0,DOC_1,...DOC_N]
I have found an existing answer on clustering, but its output groups words rather than documents:
cluster 1: [word,word...word]
cluster 2: [word,word...word]
How can I alter the code to get document clusters instead?