The inconsistency is due to the fact that statsmodels
uses different formulas to calculate the R-squared depending on whether the model includes an intercept or not. If the intercept is included, statsmodels
divides the sum of squared residuals by the centered total sum of squares, while if the intercept is not included, statsmodels
divides the sum of squared residuals by the uncentered total sum of squares. This means that statsmodels
uses the following formula to calculate the R-squared, which can be found in the documentation:
import numpy as np
def rsquared(y_true, y_pred, fit_intercept=True):
'''
Statsmodels R-squared, see https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.rsquared.html.
'''
if fit_intercept:
return 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
else:
return 1 - np.sum((y_true - y_pred) ** 2) / np.sum(y_true ** 2)
On the other hand, sklearn
always uses the centered total sum of squares at the denominator, regardless of whether the intercept is actually included in the model or not (i.e. regardless of whether fit_intercept=True
or fit_intercept=False
). See also this answer.
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
def rsquared(y_true, y_pred, fit_intercept=True):
'''
Statsmodels R-squared, see https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.rsquared.html.
'''
if fit_intercept:
return 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
else:
return 1 - np.sum((y_true - y_pred) ** 2) / np.sum(y_true ** 2)
# dummy data:
y = np.array([1, 3, 4, 5, 2, 3, 4])
X = np.array(range(1, 8)).reshape(-1, 1) # reshape to column
# intercept is not zero: the result are the same
# scikit-learn:
lr = LinearRegression(fit_intercept=True)
lr.fit(X, y)
print(lr.score(X, y))
# 0.16118421052631582
print(rsquared(y, lr.predict(X), fit_intercept=True))
# 0.16118421052631582
# statsmodels
X_ = sm.add_constant(X)
model = sm.OLS(y, X_)
results = model.fit()
print(results.rsquared)
# 0.16118421052631582
print(rsquared(y, results.fittedvalues, fit_intercept=True))
# 0.16118421052631593
# intercept is zero: the result are different
# scikit-learn:
lr = LinearRegression(fit_intercept=False)
lr.fit(X, y)
print(lr.score(X, y))
# -0.4309210526315792
print(rsquared(y, lr.predict(X), fit_intercept=True))
# -0.4309210526315792
print(rsquared(y, lr.predict(X), fit_intercept=False))
# 0.8058035714285714
# statsmodels
model = sm.OLS(y, X)
results = model.fit()
print(results.rsquared)
# 0.8058035714285714
print(rsquared(y, results.fittedvalues, fit_intercept=False))
# 0.8058035714285714