I agree with @George that "there is something "wrong" with the test set": I got a similar MSE of approx. 21. I also tried concatenating the train and test datasets and feeding the combined data to GridSearchCV, as sketched below.
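In essence the combined run just concatenates the two parts and lets GridSearchCV cross-validate over everything. A condensed sketch of what get_data() plus the pipeline code below do (the plain file names here are an assumption; the full code globs for them):

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

kwargs = dict(delim_whitespace=True, header=None)
# assumed file names; adjust to your actual paths
X = pd.concat([pd.read_csv('trainX.txt', **kwargs),
               pd.read_csv('testX.txt', **kwargs)], ignore_index=True)
y = pd.concat([pd.read_csv('trainY.txt', **kwargs),
               pd.read_csv('testY.txt', **kwargs)], ignore_index=True)[0]

pipe = make_pipeline(StandardScaler(), Ridge())
grid = GridSearchCV(pipe, {'ridge__alpha': [0.1, 1, 10, 100]}, cv=3).fit(X, y)
print(grid.best_score_, grid.best_params_)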
Here are the results of that attempt:
In [33]: print_grid_results(grid)
----------------------------- [SVR_rbf] ------------------------------
Score: 48.98%
Parameters: {'SVR_rbf__C': 5, 'SVR_rbf__max_iter': 500}
**********************************************************************
---------------------------- [SVR_linear] ----------------------------
Score: 64.07%
Parameters: {'SVR_linear__C': 0.1, 'SVR_linear__max_iter': 500}
**********************************************************************
------------------------------ [Ridge] -------------------------------
Score: 63.98%
Parameters: {'Ridge__alpha': 100, 'Ridge__max_iter': 200}
**********************************************************************
------------------------------ [Lasso] -------------------------------
Score: 60.36%
Parameters: {'Lasso__alpha': 0.001, 'Lasso__max_iter': 1000}
**********************************************************************
--------------------------- [RandomForest] ---------------------------
Score: 44.01%
Parameters: {'RandomForest__max_depth': 5, 'RandomForest__n_estimators': 100}
**********************************************************************
Also, different CV splits give very different test scores:
In [43]: clf = grid['SVR_linear']
In [44]: {k:v for k,v in clf.cv_results_.items() if k.endswith('_test_score')}
Out[44]:
{'mean_test_score': array([0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 , 0.64067998, 0.63919104, 0.6391681 ]),
'rank_test_score': array([1, 4, 7, 1, 4, 7, 1, 4, 7]),
'split0_test_score': array([0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802, 0.98557453, 0.98876705, 0.98883802]),
'split1_test_score': array([0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475, 0.69915178, 0.69750946, 0.69740475]),
'split2_test_score': array([0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214, 0.23568677, 0.22964765, 0.22961214]),
'std_test_score': array([0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954, 0.30903146, 0.31275403, 0.31278954])}
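If that variance across folds comes from the rows being ordered (e.g. by target value), shuffling the folds should make the per-split scores much more uniform. A minimal check, assuming X and y are the combined data returned by get_data() from the code below:

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# same SVR_linear setup as above, but with shuffled CV folds
pipe = make_pipeline(StandardScaler(), SVR(kernel='linear', C=0.1, max_iter=500))
cv = KFold(n_splits=3, shuffle=True, random_state=0)
print(cross_val_score(pipe, X, y, cv=cv))  # X, y as returned by get_data() below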
Here is the full code:
import os
#import contextlib
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # formerly: from sklearn.externals import joblib (removed in newer scikit-learn)
def get_data_split(path='.'):
    p = Path(path)
    kwargs = dict(delim_whitespace=True, header=None)
    X_train = pd.read_csv(list(p.glob('trainX.txt*'))[0], **kwargs)
    y_train = pd.read_csv(list(p.glob('trainY.txt*'))[0], **kwargs)
    X_test = pd.read_csv(list(p.glob('testX.txt*'))[0], **kwargs)
    y_test = pd.read_csv(list(p.glob('testY.txt*'))[0], **kwargs)
    return X_train, y_train[0], X_test, y_test[0]
def get_data(path='.'):
    # load train and test parts and concatenate them into a single dataset
    p = Path(path)
    kwargs = dict(delim_whitespace=True, header=None)
    X_train = pd.read_csv(list(p.glob('trainX.txt*'))[0], **kwargs)
    y_train = pd.read_csv(list(p.glob('trainY.txt*'))[0], **kwargs)
    X_test = pd.read_csv(list(p.glob('testX.txt*'))[0], **kwargs)
    y_test = pd.read_csv(list(p.glob('testY.txt*'))[0], **kwargs)
    return (pd.concat([X_train, X_test], ignore_index=True),
            pd.concat([y_train, y_test], ignore_index=True)[0])
def fit_all_classifiers_grid(X, y, classifiers, **common_grid_kwargs):
    # grid-search each estimator inside a StandardScaler -> estimator pipeline
    grids = {}
    for clf in classifiers:
        print('{:-^70}'.format(' [' + clf['name'] + '] '))
        pipe = Pipeline([
            ("scale", StandardScaler()),
            (clf['name'], clf['clf'])])
        grids[clf['name']] = (GridSearchCV(pipe,
                                           param_grid=clf['param_grid'],
                                           **common_grid_kwargs)
                              .fit(X, y))
        # save each trained grid to disk ...
        joblib.dump(grids[clf['name']], './{}.pkl'.format(clf['name']))
    return grids
def test_dataset(grid, X_test, y_test):
    # evaluate each fitted grid on the held-out test set
    res = {}
    for name, clf in grid.items():
        y_pred = clf.predict(X_test)
        res[name] = {'MSE': mean_squared_error(y_test, y_pred),
                     'R2': r2_score(y_test, y_pred)}
    return res
def print_grid_results(grids):
    for name, clf in grids.items():
        print('{:-^70}'.format(' [' + name + '] '))
        print('Score:\t\t{:.2%}'.format(clf.best_score_))
        print('Parameters:\t{}'.format(clf.best_params_))
        print('*' * 70)
classifiers = [
    {'name': 'SVR_rbf',
     'clf': SVR(),
     'title': "SVR_rbf",
     'param_grid': {
         'SVR_rbf__C': [0.1, 1, 5],
         'SVR_rbf__max_iter': [500, 1000, 5000]
     }},
    {'name': 'SVR_linear',
     'clf': SVR(kernel='linear'),
     'title': "SVR_linear",
     'param_grid': {
         'SVR_linear__C': [0.1, 1, 5],
         'SVR_linear__max_iter': [500, 1000, 5000]
     }},
    {'name': 'Ridge',
     'clf': Ridge(),
     'title': "Ridge",
     'param_grid': {
         'Ridge__alpha': [0.1, 1, 5, 10, 50, 100],
         'Ridge__max_iter': [200, 500]
     }},
    {'name': 'Lasso',
     'clf': Lasso(),
     'title': "Lasso",
     'param_grid': {
         'Lasso__alpha': [0.001, 0.01, 0.1, 1, 5, 10],
         'Lasso__max_iter': [1000, 5000]
     }},
    {'name': 'RandomForest',
     'clf': RandomForestRegressor(),
     'title': "RandomForest",
     'param_grid': {
         'RandomForest__n_estimators': [10, 100],
         'RandomForest__max_depth': [3, 5],
     }},
]
def main(path):
    #path = r'D:\data\work\.ML\SO\49094242-SVM provided a bad result in my data'
    os.chdir(path)
    X, y = get_data(path)
    grid = fit_all_classifiers_grid(X, y, classifiers, cv=3, verbose=2, n_jobs=-1)
    print_grid_results(grid)
    #X_train, y_train, X_test, y_test = get_data_split(path)
    #grid = fit_all_classifiers_grid(X_train, y_train, classifiers, cv=2, verbose=2, n_jobs=-1)
    #res = test_dataset(grid, X_test, y_test)
    #print(res)
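The post does not show an entry point; a hypothetical one would look like this (the data path is assumed to be the current directory):

if __name__ == '__main__':
    # grid-search on the combined data (prints grid summaries like those above)
    main('.')
    # or evaluate on the provided train/test split instead:
    # X_train, y_train, X_test, y_test = get_data_split('.')
    # grid = fit_all_classifiers_grid(X_train, y_train, classifiers, cv=3, n_jobs=-1)
    # print(test_dataset(grid, X_test, y_test))  # per-model MSE / R2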
PS: Sorry for using the name classifier instead of regressor; I just reused my old code, where I was searching for the best classifier.