I was wondering how to run a multi-class, multi-label, ordinal classification with sklearn. I want to predict a ranking of target groups, ranging from the one that is most prevalent at a certain location (1) to the one that is least prevalent (7). I can't seem to get it right. Could you please help me out?
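To make the goal concrete: at the moment my target Y is a single column, but what I actually want to predict for each location is a complete ranking of all seven groups. A hypothetical example of what one target row should look like (the values are made up):

import numpy as np
# Hypothetical desired target for one location: the rank of each of the 7 groups,
# where 1 = most prevalent and 7 = least prevalent at that location.
y_row = np.array([3, 1, 7, 2, 5, 4, 6])

This is my current (single-label) attempt: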
# Random Forest Classification
# Import
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
# Import dataset
dataset = pd.read_excel('alle_probs_edit.v2.xlsx')
X = dataset.iloc[:,4:-1].values
Y = dataset.iloc[:,-1].values
# Split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
# Scale the features (bring all variables onto the same scale); whether this is necessary depends on the chosen method
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Create classifier
classifier = RandomForestClassifier(criterion = 'entropy')
# Choose some parameter combinations to try
parameters = {'bootstrap': [True, False],
              'max_depth': [50],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 3, 4],
              'min_samples_split': [9, 10, 11, 12, 13],
              'n_estimators': [500, 1000, 1500]}
# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)
# Run the grid search
grid_obj = GridSearchCV(classifier, parameters, scoring=acc_scorer, cv = 3, n_jobs = -1)
grid_obj = grid_obj.fit(X_train, Y_train)
# Set the classifier to the best combination of parameters
classifier = grid_obj.best_estimator_
# Fit the best algorithm to the data
classifier.fit(X_train, Y_train)
# Predict the test data
Y_pred = classifier.predict(X_test)
# Confusion matrix
cm = pd.DataFrame(confusion_matrix(Y_test, Y_pred))
# Accuracy on the test set
accuracy1 = accuracy_score(Y_test, Y_pred)
print("Accuracy1: %.2f%%" % (accuracy1 * 100.0))
# k-Fold Cross-Validation on the training data
accuracies = cross_val_score(estimator = classifier, X = X_train, y = Y_train, cv = 10)
print("10-fold CV accuracy: %.2f%% (+/- %.2f%%)" % (accuracies.mean() * 100.0, accuracies.std() * 100.0))
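Would something along these lines be the right direction? This is only a rough sketch, not my actual code: it assumes the ranks of the seven groups are stored as seven separate target columns (hypothetically the last seven columns of the sheet), and as far as I understand it fits one forest per group and treats each rank as an unordered class label, so the ordinal structure is ignored.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

dataset = pd.read_excel('alle_probs_edit.v2.xlsx')
X = dataset.iloc[:, 4:-7].values   # hypothetical: feature columns only
Y = dataset.iloc[:, -7:].values    # hypothetical: one rank column (1-7) per group

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# One random forest is fitted per target column
multi_clf = MultiOutputClassifier(RandomForestClassifier(criterion = 'entropy'), n_jobs = -1)
multi_clf.fit(X_train, Y_train)
Y_pred = multi_clf.predict(X_test)  # shape: (n_samples, 7)

If this is the wrong approach for ordinal targets, I'd be grateful for a pointer in the right direction.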