Get confidence interval from sklearn linear regression in python

import numpy as np import matplotlib.pyplot as plt from math import pi import pandas as pd import seaborn as sns from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score # import the data boston_dataset = load_boston() boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names) boston['MEDV'] = boston_dataset.target X = pd.DataFrame(np.c_[boston['LSTAT'], boston['RM']], columns=['LSTAT', 'RM']) Y = boston['MEDV'] # splits the training and test data set in 80% : 20% # assign random_state to any value.This ensures consistency. X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5) lin_model = LinearRegression() lin_model.fit(X_train, Y_train) # model evaluation for training set y_train_predict = lin_model.predict(X_train) rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict))) r2 = r2_score(Y_train, y_train_predict) # model evaluation for testing set y_test_predict = lin_model.predict(X_test) # root mean square error of the model rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict))) # r-squared score of the model r2 = r2_score(Y_test, y_test_predict) plt.scatter(Y_test, y_test_predict) plt.show()

If you're looking to compute the confidence interval of the regression parameters, one way is to manually compute it using the results of LinearRegression from scikit-learn and numpy methods.

The code below computes the 95%-confidence interval (alpha=0.05). alpha=0.01 would compute 99%-confidence interval etc.

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression

alpha = 0.05 # for 95% confidence interval; use 0.01 for 99%-CI.

# fit a sklearn LinearRegression model
lin_model = LinearRegression().fit(X_train, Y_train)

# the coefficients of the regression model
coefs = np.r_[[lin_model.intercept_], lin_model.coef_]
# build an auxiliary dataframe with the constant term in it
X_aux = X_train.copy()
X_aux.insert(0, 'const', 1)
# degrees of freedom
dof = -np.diff(X_aux.shape)[0]
# Student's t-distribution table lookup
t_val = stats.t.isf(alpha/2, dof)
# MSE of the residuals
mse = np.sum((Y_train - lin_model.predict(X_train)) ** 2) / dof
# inverse of the variance of the parameters
var_params = np.diag(np.linalg.inv(X_aux.T.dot(X_aux)))
# distance between lower and upper bound of CI
gap = t_val * np.sqrt(mse * var_params)

conf_int = pd.DataFrame({'lower': coefs - gap, 'upper': coefs + gap}, index=X_aux.columns)

Using the Boston housing dataset, the above code produces the dataframe below:

If this is too much manual code, you can always resort to the statsmodels and use its conf_int method:

import statsmodels.api as sm
alpha = 0.05 # 95% confidence interval
lr = sm.OLS(Y_train, sm.add_constant(X_train)).fit()
conf_interval = lr.conf_int(alpha)

Since it uses the same formula, it produces the same output as above.

Convenient wrapper function:

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression


def get_conf_int(alpha, lr, X=X_train, y=Y_train):
    
    """
    Returns (1-alpha) 2-sided confidence intervals
    for sklearn.LinearRegression coefficients
    as a pandas DataFrame
    """
    
    coefs = np.r_[[lr.intercept_], lr.coef_]
    X_aux = X.copy()
    X_aux.insert(0, 'const', 1)
    dof = -np.diff(X_aux.shape)[0]
    mse = np.sum((y - lr.predict(X)) ** 2) / dof
    var_params = np.diag(np.linalg.inv(X_aux.T.dot(X_aux)))
    t_val = stats.t.isf(alpha/2, dof)
    gap = t_val * np.sqrt(mse * var_params)

    return pd.DataFrame({
        'lower': coefs - gap, 'upper': coefs + gap
    }, index=X_aux.columns)


# for 95% confidence interval; use 0.01 for 99%-CI.
alpha = 0.05
# fit a sklearn LinearRegression model
lin_model = LinearRegression().fit(X_train, Y_train)

get_conf_int(alpha, lin_model, X_train, Y_train)

Stats reference

Recommended topics

Hot tags