GloVe PyTorch speed-up
I am trying to implement the GloVe algorithm in PyTorch. This is the first time I am using PyTorch, and I think my implementation might not be very efficient. Apart from the obvious fix (vectorizing the for loop that runs every batch; a sketch of this is included after the code), is there anything that would give me some speed-up? I found that self.optimizer.step() is particularly expensive.

from torch.autograd import Variable
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from scipy import sparse
from collections import Counter, deque
import gensim.utils as utils
import pandas as pd
import time

def tokenize_sentences(sentences, enforce_lower=True):
    return [list(utils.tokenize(sentence, lowercase=enforce_lower)) for sentence in sentences]

class GloVe(nn.Module):

    def __init__(self, window=3, size=100, xmax=2, alpha=0.75, batch_size=10,
                 l_rate=0.01, num_epochs=100, min_count=2,
                 sentence_representation={'name': 'mean_of_words'},
                 verbose=True, retrain_first=False, pretrained_model=None):

        super(GloVe, self).__init__()
        self.window = window
        self.size = size
        self.xmax = xmax
        self.alpha = alpha
        self.batch_size = batch_size
        self.l_rate = l_rate
        self.num_epochs = num_epochs
        self.min_count = min_count
        self.sentence_representation = sentence_representation
        self.verbose = verbose

    def build_model(self):

        self.vocab, self.reverse_vocab = self.build_vocab(self.sentence_tokens)
        self.comat, self.coocs = self.build_cooccur(self.vocab, self.sentence_tokens,
                                                    window_size=self.window,
                                                    min_count=self.min_count)
        self.l_embed = nn.Embedding(self.vocab_size, self.size, sparse=True)
        self.r_embed = nn.Embedding(self.vocab_size, self.size, sparse=True)

        # Plain Adam can't handle the sparse gradients produced by sparse=True
        # (newer PyTorch versions offer torch.optim.SparseAdam for that); the
        # dense alternative would be:
        # self.l_embed = nn.Embedding(self.vocab_size, self.size)
        # self.r_embed = nn.Embedding(self.vocab_size, self.size)

        # Declare the biases as nn.Parameter so that self.parameters() (and hence
        # the optimizer) actually includes and updates them.
        self.l_biases = nn.Parameter(torch.from_numpy(
            np.random.normal(0, 0.01, (self.vocab_size, 1))).float())
        self.r_biases = nn.Parameter(torch.from_numpy(
            np.random.normal(0, 0.01, (self.vocab_size, 1))).float())
        self.optimizer = optim.SGD(self.parameters(), lr=self.l_rate)


    def build_vocab(self, sentence_tokens):

        vocab = Counter()
        for sentence in sentence_tokens:
            vocab.update(sentence)
        normal_vocab = {word: (i, freq) for i, (word, freq) in enumerate(vocab.items())}
        reverse_vocab = {i: (word, freq) for i, (word, freq) in enumerate(vocab.items())}
        return normal_vocab, reverse_vocab

    def build_cooccur(self, vocab, corpus, window_size=10, min_count=None):

        self.vocab_size = len(vocab)
        id2word = dict((i, word) for word, (i, _) in vocab.items())

        comat = np.zeros((self.vocab_size, self.vocab_size))
        for i, sentence in enumerate(corpus):
            token_ids = [vocab[word][0] for word in sentence]

            for center_i, center_id in enumerate(token_ids):
                context_ids = token_ids[max(0, center_i - window_size):center_i]
                contexts_len = len(context_ids)

                for left_i, left_id in enumerate(context_ids):
                    # weight each co-occurrence by 1/distance, as in the GloVe paper
                    distance = contexts_len - left_i
                    increment = 1.0 / float(distance)
                    comat[center_id, left_id] += increment
                    comat[left_id, center_id] += increment
        coocs = np.transpose(np.nonzero(comat))
        return comat, coocs

    def gen_batch(self):
        sample = np.random.choice(np.arange(len(self.coocs)), size=self.batch_size, replace=False).astype(int)
        indices = np.array([self.coocs[ind] for ind in sample])
        l_vecs = self.l_embed(Variable(torch.LongTensor(indices[:, 0])))
        r_vecs = self.r_embed(Variable(torch.LongTensor(indices[:, 1])))
        l_v_bias = self.l_biases[torch.LongTensor(indices[:, 0])]
        r_v_bias = self.r_biases[torch.LongTensor(indices[:, 1])]
        covals = torch.from_numpy(np.array([[self.comat[tuple(self.coocs[chosen])]] for chosen in sample])).float()
        return l_vecs, r_vecs, covals, l_v_bias, r_v_bias

    def wf(self, x):
        # GloVe weighting function f(x) = min((x / x_max) ** alpha, 1)
        return torch.min((x / self.xmax) ** self.alpha, torch.from_numpy(np.array([1])).float())

    def train_model(self):
        w_list_size = np.sum([len(s) for s in self.sentence_tokens])

        continue_training = True
        last_losses = deque(maxlen=5)
        last_losses.append(np.inf)
        moving_avg = np.mean(last_losses)
        epoch = 0
        while continue_training:
            num_batches = int(w_list_size / self.batch_size)
            avg_loss = 0.0
            for batch in range(num_batches):
                if self.verbose:
                    print('Batch {}/{}'.format(batch, num_batches))
                    start_time = time.time()
                loss = 0.0

                l_vecs, r_vecs, covals, l_v_bias, r_v_bias = self.gen_batch()

                # TODO: vectorize this
                loop_time = time.time()
                for i in range(self.batch_size):
                    loss += Variable(self.wf(covals[i]), requires_grad=False) * torch.pow(
                        (torch.dot(l_vecs[i], r_vecs[i]) + l_v_bias[i] + r_v_bias[i]
                         - Variable(torch.log(covals[i]), requires_grad=False)), 2) / self.batch_size
                if self.verbose:
                    print('loop time: ', time.time() - loop_time)

                avg_loss += loss / num_batches

                self.optimizer.zero_grad()  # reset gradient vector
                loss.backward()             # calculate gradients
                self.optimizer.step()       # do optimization step

                if self.verbose:
                    print('batch time: ', time.time() - start_time)

            last_losses.append(avg_loss.data.numpy())
            if (np.mean(last_losses) > 0.98 * moving_avg and epoch > 1) or (epoch == self.num_epochs):
                continue_training = False
            else:
                moving_avg = np.mean(last_losses)
            if self.verbose:
                print("Average loss for epoch {}: {}, Moving average: {}".format(
                    epoch, avg_loss.data.numpy()[0], moving_avg))
            epoch += 1

    def get_embeddings(self):
        # combine the two embedding matrices (here by averaging them)
        embeddings = (self.l_embed.weight.data + self.r_embed.weight.data).numpy() / 2
        embeddings_dict = {}
        for word, (idx, _) in self.vocab.items():
            embeddings_dict[word] = embeddings[idx]
        return embeddings, embeddings_dict

    def get_mean_sentence_embeddings(self, tokenised_sentences):
        sentence_embeddings = np.zeros((len(tokenised_sentences), self.size))
        for ct, tokens in enumerate(tokenised_sentences):
            vecs = []
            for word in tokens:
                try:
                    vecs.append(self.embeddings_dict[word])
                except KeyError:
                    # word was not seen during training
                    pass
            if len(vecs) > 0:
                sentence_embeddings[ct] = np.array(vecs).mean(axis=0)
        return sentence_embeddings

    def fit_transform(self, sentences):

        self.sentence_tokens = tokenize_sentences(sentences)
        self.build_model()
        self.train_model()
        # keep the trained embeddings around for transform()
        self.embeddings, self.embeddings_dict = self.get_embeddings()

        if self.sentence_representation['name'] == 'mean_of_words':
            return self.get_mean_sentence_embeddings(self.sentence_tokens)

    def transform(self, sentences):
        sentence_tokens = tokenize_sentences(sentences)
        if self.sentence_representation['name'] == 'mean_of_words':
            return self.get_mean_sentence_embeddings(sentence_tokens)
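
For reference, a minimal sketch of how the inner per-example loop in train_model could be vectorized, assuming (as the code above suggests) that gen_batch returns l_vecs and r_vecs of shape (batch_size, size) and covals, l_v_bias, r_v_bias of shape (batch_size, 1):

# Hedged sketch: batched replacement for the per-example loss loop in train_model.
weights = Variable(self.wf(covals), requires_grad=False)       # f(X_ij), shape (batch_size, 1)
log_covals = Variable(torch.log(covals), requires_grad=False)  # log X_ij
dots = (l_vecs * r_vecs).sum(dim=1, keepdim=True)              # row-wise dot products
residual = dots + l_v_bias + r_v_bias - log_covals
loss = (weights * residual.pow(2)).sum() / self.batch_size

Building the loss from a handful of batched tensor operations instead of batch_size scalar ones keeps the autograd graph much smaller, which typically cuts both the Python overhead of the loop and the time spent in loss.backward().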
asked by Dalton on 24/10/2017 at 9:16

Comments (2):
Handhold: I have done a fast implementation, efficient on the GPU: discuss.pytorch.org/t/fast-implementation-of-glove/3812
Dalton: Thanks a lot for this, it runs much faster than my implementation!
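
The linked thread keeps the whole computation on the GPU. As a rough, hedged sketch of what that change would look like for the class above (not the code from the linked thread, and assuming a CUDA device is available), the module and every tensor built per batch have to live on the same device:

# Hedged usage sketch, not the implementation from the linked thread.
model = GloVe()
model.sentence_tokens = tokenize_sentences(["a toy corpus", "with two sentences"])
model.build_model()
if torch.cuda.is_available():
    model.cuda()   # moves the embedding weights and bias parameters to the GPU
# gen_batch would then also need to build its tensors on the GPU, e.g.
# torch.LongTensor(indices[:, 0]).cuda() for the embedding lookups and
# covals = covals.cuda() before computing the loss.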
