The best pipeline I have encountered so far is from Maksym Balatsko's Medium article Text preprocessing steps and universal reusable pipeline. The best part is that it can be used as a transformer in a scikit-learn pipeline and supports multiprocessing.
I have modified Maksym's code to keep the packages to a minimum and to use generators instead of lists, so the data is not all loaded into memory at once:
import multiprocessing as mp
import string

import numpy as np
import pandas as pd  # needed for pd.concat below
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

nlp = spacy.load("en_core_web_sm")


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, nlp=nlp, n_jobs=1):
        """
        Text preprocessing transformer. Steps:
          1. Punctuation removal
          2. Stop-word removal
          3. Lemmatization

        nlp    - spaCy language model
        n_jobs - number of parallel jobs (-1 uses all cores, 0 runs serially)
        """
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            # n_jobs == 0: process everything in the current process
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(partitions)  # one worker per partition, not per core
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()
        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _remove_punct(self, doc):
        # generators chain lazily, so no intermediate token lists are built
        return (t for t in doc if t.text not in string.punctuation)

    def _remove_stop_words(self, doc):
        return (t for t in doc if not t.is_stop)

    def _lemmatize(self, doc):
        return ' '.join(t.lemma_ for t in doc)
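As a quick sanity check before wiring it into a pipeline, you can run the transformer on a small pandas Series. The sentences below are made-up examples; n_jobs=0 keeps everything in the current process, which avoids multiprocessing start-up issues in interactive sessions:

import pandas as pd

docs = pd.Series([
    "The striped bats were hanging on their feet.",
    "Cats and dogs are running around the house!",
])

# n_jobs=0 runs serially (no worker pool), handy for small tests
cleaned = TextPreprocessor(n_jobs=0).transform(docs)
print(cleaned)
# Each entry is a space-joined string of lemmas with punctuation
# and stop words stripped out.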
You can use it as:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

# ... assuming data split X_train, X_test ...

clf = Pipeline(steps=[
    ('normalize', TextPreprocessor(n_jobs=-1)),
    ('features', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
    ('classifier', LogisticRegressionCV(cv=5, solver='saga',
                                        scoring='accuracy', n_jobs=-1, verbose=1))
])

clf.fit(X_train, y_train)
clf.predict(X_test)
X_train passes through TextPreprocessor, then TfidfVectorizer extracts the features, which are finally passed to the classifier.
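As a follow-up sketch: once the pipeline is fitted, Pipeline.score delegates to the final estimator, so you can get test accuracy directly (assuming y_test exists alongside X_test from your split):

import pandas as pd

# Accuracy on the held-out split
# (Pipeline.score delegates to LogisticRegressionCV.score)
print("test accuracy:", clf.score(X_test, y_test))

# New raw text also works, as long as it is wrapped in a pandas Series,
# since TextPreprocessor.transform relies on pandas' .copy() and .apply()
new_docs = pd.Series(["a brand new document to classify"])  # hypothetical input
print(clf.predict(new_docs))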