I am trying to implement the GloVe algorithm in PyTorch. This is the first time I am using PyTorch, and I suspect my implementation is not very efficient. Apart from the obvious fix (vectorizing the for loop that runs on every batch, which I sketch at the end of this post), is there anything that would give me a speed-up? I found that self.optimizer.step() is particularly expensive.
from torch.autograd import Variable
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from scipy import sparse
from collections import Counter
import gensim.utils as utils
import pandas as pd
from collections import deque
import time
def tokenize_sentences(sentences,enforce_lower = True):
return [list(utils.tokenize(sentence,lowercase=enforce_lower)) for sentence in sentences]
class GloVe(nn.Module):
def __init__(self, window = 3, size = 100, xmax = 2, alpha = 0.75, batch_size = 10,
l_rate = 0.01, num_epochs = 100, min_count=2, sentence_representation={'name':'mean_of_words'},
verbose=True, retrain_first=False, pretrained_model=None):
super(GloVe, self).__init__()
self.window = window
self.size = size
self.xmax = xmax
self.alpha = alpha
self.batch_size = batch_size
self.l_rate = l_rate
self.num_epochs = num_epochs
self.min_count = min_count
self.sentence_representation = sentence_representation
self.verbose = verbose
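    # Build the vocabulary, co-occurrence statistics, embedding tables, bias vectors and the optimizer.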
def build_model(self):
self.vocab, self.reverse_vocab = self.build_vocab(self.sentence_tokens)
        self.comat, self.coocs = self.build_cooccur(self.vocab, self.sentence_tokens,
            window_size=self.window, min_count=self.min_count)
self.l_embed = nn.Embedding(self.vocab_size, self.size, sparse=True)
self.r_embed = nn.Embedding(self.vocab_size, self.size, sparse=True)
        # If we use other optimizers such as Adam, we can't use sparse gradients
# self.l_embed = nn.Embedding(self.vocab_size, self.size)
# self.r_embed = nn.Embedding(self.vocab_size, self.size)
        # Register the biases as nn.Parameter so that self.parameters() (and hence
        # the optimizer) includes them; plain Variables would never be updated.
        self.l_biases = nn.Parameter(torch.from_numpy(np.random.normal(0, 0.01,
            (self.vocab_size, 1))).float())
        self.r_biases = nn.Parameter(torch.from_numpy(np.random.normal(0, 0.01,
            (self.vocab_size, 1))).float())
self.optimizer = optim.SGD(self.parameters(), lr = self.l_rate)
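    # Map each word to an (index, frequency) pair, plus the reverse mapping from index to word.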
def build_vocab(self, sentence_tokens):
vocab = Counter()
for sentence in sentence_tokens:
vocab.update(sentence)
normal_vocab = {word: (i, freq) for i, (word, freq) in enumerate(vocab.items())}
reverse_vocab = {i: (word, freq) for i, (word, freq) in enumerate(vocab.items())}
return normal_vocab, reverse_vocab
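    # Build a dense co-occurrence matrix, weighting each pair by 1/distance within the window,
    # and collect the indices of its nonzero entries.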
def build_cooccur(self, vocab, corpus, window_size=10, min_count=None):
self.vocab_size = len(vocab)
id2word = dict((i, word) for word, (i, _) in vocab.items())
comat = np.zeros((self.vocab_size, self.vocab_size))
for i, sentence in enumerate(corpus):
token_ids = [vocab[word][0] for word in sentence]
for center_i, center_id in enumerate(token_ids):
context_ids = token_ids[max(0, center_i - window_size) : center_i]
contexts_len = len(context_ids)
for left_i, left_id in enumerate(context_ids):
distance = contexts_len - left_i
increment = 1.0 / float(distance)
comat[center_id, left_id] += increment
comat[left_id, center_id] += increment
coocs = np.transpose(np.nonzero(comat))
return comat, coocs
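    # Sample a random batch of nonzero co-occurrence pairs and look up the corresponding
    # embeddings, biases and co-occurrence counts.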
def gen_batch(self):
sample = np.random.choice(np.arange(len(self.coocs)), size=self.batch_size, replace=False).astype(int)
indices = np.array([self.coocs[ind] for ind in sample])
l_vecs = self.l_embed(Variable(torch.LongTensor(indices[:,0])))
r_vecs = self.r_embed(Variable(torch.LongTensor(indices[:,1])))
l_v_bias = self.l_biases[torch.LongTensor(indices[:,0])]
r_v_bias = self.r_biases[torch.LongTensor(indices[:,1])]
covals = torch.from_numpy(np.array([[self.comat[tuple(self.coocs[chosen])]] for chosen in sample])).float()
return l_vecs, r_vecs, covals, l_v_bias, r_v_bias
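    # GloVe weighting function f(x) = min((x / xmax) ** alpha, 1), which caps the influence
    # of very frequent co-occurrences.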
def wf(self, x):
return torch.min((x/self.xmax)**self.alpha, torch.from_numpy(np.array([1])).float())
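    # Train with SGD until the moving average of the epoch loss stops improving (by more than 2%)
    # or num_epochs is reached.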
def train_model(self):
w_list_size = np.sum([len(s) for s in self.sentence_tokens])
continue_training = True
last_losses = deque(maxlen=5)
last_losses.append(np.inf)
moving_avg = np.mean(last_losses)
epoch = 0
while continue_training:
num_batches = int(w_list_size/self.batch_size)
avg_loss = 0.0
for batch in range(num_batches):
if self.verbose:
print('Batch {}/{}'.format(batch, num_batches))
start_time = time.time()
loss = 0.0
l_vecs, r_vecs, covals, l_v_bias, r_v_bias = self.gen_batch()
# TODO: vectorize this
loop_time = time.time()
for i in range(self.batch_size):
loss += Variable(self.wf(covals[i]), requires_grad=False) * torch.pow((torch.dot(l_vecs[i], r_vecs[i])
+ l_v_bias[i] + r_v_bias[i] - Variable(torch.log(covals[i]), requires_grad=False)),2)/self.batch_size
print('loop time: ', time.time()-loop_time)
avg_loss += loss/num_batches
self.optimizer.zero_grad() # reset gradient vector
loss.backward() # calculate gradients
self.optimizer.step() # do optimization step
if self.verbose:
print('batch time: ', time.time()-start_time)
last_losses.append(avg_loss.data.numpy())
if (np.mean(last_losses)>0.98*moving_avg and epoch>1) or (epoch == self.num_epochs):
continue_training = False
else:
moving_avg = np.mean(last_losses)
if self.verbose:
print("Average loss for epoch {}: {}, Moving average: {}".format(epoch, avg_loss.data.numpy()[0], moving_avg))
epoch += 1
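    # The final word vectors are the average of the left and right embedding tables.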
def get_embeddings(self):
embeddings = (self.l_embed.weight.data + self.r_embed.weight.data).numpy()/2
embeddings_dict = {}
for k,v in self.vocab.items():
embeddings_dict[k] = embeddings[v[0]]
return embeddings, embeddings_dict
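    # Represent each sentence as the mean of the vectors of its in-vocabulary words.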
def get_mean_sentence_embeddings(self, tokenised_sentences):
        sentence_embeddings = np.zeros((len(tokenised_sentences), self.size))
ct = 0
for tokens in tokenised_sentences:
vecs = []
for word in tokens:
                try:
                    vecs.append(self.embeddings_dict[word])
                except KeyError:
                    # skip words that are not in the vocabulary
                    pass
if len(vecs) > 0:
arr = np.array(vecs)
sentence_embeddings[ct] = list(arr.mean(axis=0))
ct +=1
return sentence_embeddings
def fit_transform(self, sentences):
self.sentence_tokens = tokenize_sentences(sentences)
self.build_model()
self.train_model()
        self.embeddings, self.embeddings_dict = self.get_embeddings()
if self.sentence_representation['name'] == 'mean_of_words':
return self.get_mean_sentence_embeddings(self.sentence_tokens)
def transform(self,sentences):
sentence_tokens = tokenize_sentences(sentences)
if self.sentence_representation['name'] == 'mean_of_words':
return self.get_mean_sentence_embeddings(sentence_tokens)
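For reference, this is roughly the vectorized replacement I have in mind for the inner loop in train_model. It is only an untested sketch, so the shape/broadcasting assumptions may well be wrong:

# Untested sketch of a vectorized batch loss, intended to replace the
# `for i in range(self.batch_size)` loop in train_model.
# Assumed shapes: l_vecs, r_vecs -> (batch_size, size);
#                 covals, l_v_bias, r_v_bias -> (batch_size, 1).
weights = Variable(self.wf(covals), requires_grad=False)       # f(X_ij) for the whole batch
log_covals = Variable(torch.log(covals), requires_grad=False)  # log(X_ij)
dots = torch.sum(l_vecs * r_vecs, dim=1, keepdim=True)         # row-wise dot products, (batch_size, 1)
loss = torch.sum(weights * (dots + l_v_bias + r_v_bias - log_covals) ** 2) / self.batch_size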