Here is a performance check of the methods for obtaining a list of the most similar words. In a way this is an extreme case, where the model has neither w.prob nor w.cluster to narrow down the search space. I used four methods: the two mentioned above, most_similar from spaCy, and most_similar from Gensim:
def spacy_most_similar(word, topn=10):
    """Return the ``topn`` nearest neighbours of ``word`` using spaCy's
    built-in vector-table search.

    Returns a (words, distances) pair: the neighbour strings and the
    raw score array reported by ``Vectors.most_similar``.
    """
    query_vector = nlp_ru(word).vector
    # most_similar expects a 2-D batch of queries, so add a leading axis.
    result = nlp_ru.vocab.vectors.most_similar(
        query_vector.reshape(1, query_vector.shape[0]), n=topn)
    neighbour_words = [nlp_ru.vocab.strings[key] for key in result[0][0]]
    # result is (keys, best_rows, scores); index 2 holds the scores.
    return neighbour_words, result[2]
def spacy_similarity(word, topn=10):
    """Brute-force nearest neighbours via spaCy's ``Lexeme.similarity``.

    Scans every lexeme in the vocab that matches the query's casing and
    has a non-zero vector, then returns up to ``topn`` (word, score)
    pairs sorted by similarity, excluding the query word itself.
    """
    lexeme = nlp_ru.vocab[str(word)]
    # Candidates: same casing as the query and an actual (non-zero) vector.
    candidates = [
        cand
        for cand in lexeme.vocab
        if cand.is_lower == lexeme.is_lower and np.count_nonzero(cand.vector)
    ]
    candidates.sort(key=lambda cand: cand.similarity(lexeme), reverse=True)
    # Take topn+1 so that dropping the query word still leaves topn results.
    return [
        (cand.lower_, cand.similarity(lexeme))
        for cand in candidates[: topn + 1]
        if cand.lower_ != lexeme.lower_
    ]
def spacy_similarity_numba(word, topn=10):
    """Brute-force nearest neighbours scored with the numba-jitted cosine.

    Same contract as ``spacy_similarity``: returns up to ``topn``
    (word, score) pairs sorted by similarity, excluding the query word.

    Fix vs. the original: the original sorted with the fast numba cosine
    but then called the slow ``w.similarity(word)`` again to build the
    result, recomputing every returned score through spaCy's Python
    path. Here each candidate's cosine is computed exactly once and
    reused for both ranking and the returned score.
    """
    lexeme = nlp_ru.vocab[str(word)]
    query_vector = lexeme.vector
    # Score every candidate once: (score, lexeme) pairs.
    scored = [
        (cosine_similarity_numba(w.vector, query_vector), w)
        for w in lexeme.vocab
        if w.is_lower == lexeme.is_lower and np.count_nonzero(w.vector)
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    # Take topn+1 so that dropping the query word still leaves topn results.
    return [
        (w.lower_, score)
        for score, w in scored[: topn + 1]
        if w.lower_ != lexeme.lower_
    ]
try:
    from numba import jit
except ImportError:
    # Optional accelerator: fall back to a no-op decorator so the
    # function still works (just slower) without numba installed.
    def jit(*args, **kwargs):
        def wrap(func):
            return func
        return wrap


@jit(nopython=True)
def cosine_similarity_numba(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine similarity between two equal-length 1-D vectors.

    Returns 1.0 when either vector is all zeros (matching the original
    behaviour, which treated the degenerate case as maximal similarity).

    Raises:
        ValueError: if the vectors differ in length. (The original used
        ``assert``, which is silently stripped under ``python -O``.)
    """
    if u.shape[0] != v.shape[0]:
        raise ValueError("vectors must have the same length")
    uv = 0.0
    uu = 0.0
    vv = 0.0
    for i in range(u.shape[0]):
        uv += u[i] * v[i]
        uu += u[i] * u[i]
        vv += v[i] * v[i]
    cos_theta = 1.0
    if uu != 0 and vv != 0:
        cos_theta = uv / np.sqrt(uu * vv)
    return cos_theta
Here are the timing results:
from time import time
import timeit, functools
from timeit import default_timer as timer

# Benchmark: one run of each most-similar method on the same query word.
print(nlp_ru.vocab.vectors.shape)
arr = "дерево"
benchmarks = [
    ("Gensim most_similar", wv.most_similar),
    ("Spacy most_similar", spacy_most_similar),
    ("Spacy cosine_similarity_numba", spacy_similarity_numba),
    ("Spacy similarity", spacy_similarity),
]
for label, method in benchmarks:
    elapsed = timeit.Timer(functools.partial(method, arr)).timeit(1)
    print(f'{label}: {elapsed}')
(1239964, 100)
Gensim most_similar: 0.06437033399993197
Spacy most_similar: 0.4855721250000897
Spacy cosine_similarity_numba: 13.404324778000046
Spacy similarity: 60.58928110700003
All methods return identical results. As you can see, Gensim is blazingly fast compared to the others, and you don't even need to narrow down the search space. All measurements were done on CPU. Embeddings were taken from here: http://panchenko.me/data/dsl-backup/w2v-ru/all.norm-sz100-w10-cb0-it1-min100.w2v
nlp.vocab
rather than word.vocab
? – Marxist