I want to develop an NER model where I want to use word-embedding features to train CRF model. Code perfectly working without word-embedding features but when I insert embedding as features for CRF training, got error messages. Here is the part of snippet of my code:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from itertools import chain
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle
from gensim.models import KeyedVectors
import numpy as np
# Load vectors directly from the file
model1 = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) ### Loading pre-trainned word2vec model
### Embedding function
def get_features(word):
word=word.lower()
vectors=[]
try:
vectors.append(model1[word])
except:
pass
#vectors=np.array(vectors)
#vectors=vectors[0]
return vectors
def word2features(sent, i):
word = sent[i][0]
wordembdding=get_features(word) ## word embedding vector
wordembdding=np.array(wordembdding) ## vectors
#wordembdding=
#wordembdding=wordembdding[0]
postag = sent[i][1]
tag1=sent[i][2]
tag2=sent[i][4]
tag3 = sent[i][5]
features = {
'bias': 1.0,
'word.lower()': word.lower(),
'word[-3:]': word[-3:],
'word[-2:]': word[-2:],
'wordembdding': wordembdding,
'word.isupper()': word.isupper(),
'word.istitle()': word.istitle(),
'word.isdigit()': word.isdigit(),
'postag': postag,
'postag[:2]': postag[:2],
'tag1': tag1,
'tag1[:2]': tag1[:2],
'tag2': tag2,
'tag2[:2]': tag2[:2],
'tag3': tag3,
'tag3[:2]': tag3[:2],
'wordlength': len(word),
'wordinitialcap': word[0].isupper(),
'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
'wordallcap': len([x for x in word if x.isupper()])==len(word),
'distfromsentbegin': i
}
if i > 0:
word1 = sent[i-1][0]
wordembdding1= get_features(word1)
wordembdding1=np.array(wordembdding1)
#wordembdding1=f2(wordembdding1)
postag1 = sent[i-1][1]
tag11=sent[i-1][2]
tag22=sent[i-1][4]
tag33 = sent[i-1][5]
features.update({
'-1:word.lower()': word1.lower(),
'-1:word.istitle()': word1.istitle(),
'-1:word.isupper()': word1.isupper(),
'-1:wordembdding': wordembdding1, # word embedding features
'-1:postag': postag1,
'-1:postag[:2]': postag1[:2],
'-1:tag1': tag1,
'-1:tag1[:2]': tag1[:2],
'-1:tag2': tag2,
'-1:tag2[:2]': tag2[:2],
'-1:tag3': tag3,
'-1:tag3[:2]': tag3[:2],
'-1:wordlength': len(word),
'-1:wordinitialcap': word[0].isupper(),
'-1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
'-1:wordallcap': len([x for x in word if x.isupper()])==len(word),
})
else:
features['BOS'] = True
if i < len(sent)-1:
word1 = sent[i+1][0]
wordembdding1= get_features(word1)
wordembdding1= get_features(word1)
wordembdding1=np.array(wordembdding1) ## word embedding features
#wordembdding1=f2(wordembdding)
postag1 = sent[i+1][1]
tag11=sent[i+1][2]
tag22=sent[i+1][4]
tag33 = sent[i+1][5]
features.update({
'+1:word.lower()': word1.lower(),
'+1:word.istitle()': word1.istitle(),
'+1:word.isupper()': word1.isupper(),
'+1:wordembdding': wordembdding1,
'+1:postag': postag1,
'+1:postag[:2]': postag1[:2],
'+1:tag1': tag1,
'+1:tag1[:2]': tag1[:2],
'+1:tag2': tag2,
'+1:tag2[:2]': tag2[:2],
'+1:tag3': tag3,
'+1:tag3[:2]': tag3[:2],
'+1:wordlength': len(word),
'+1:wordinitialcap': word[0].isupper(),
'+1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
'+1:wordallcap': len([x for x in word if x.isupper()])==len(word),
})
else:
features['EOS'] = True
return features
def sent2features(sent):
return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
return [label for token, postag, tag1, label, tag2, tag3 in sent]
def sent2tokens(sent):
return [token for token, postag, tag1, label, tag2, tag3, tag4, tag5 in sent]
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]
%%time
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=100,
all_possible_transitions=True
)
crf.fit(X_train, y_train) ### Error message when try to train
When I want to train the CRF model I got this error messages:
TypeError: only size-1 arrays can be converted to Python scalars
Can anyone suggest me how to use word embedding vectors to train CRF model ?