Mighty late to the party, but in case this can help anyone else out there:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from scipy.special import polygamma, logit, expit
import pandas as pd
import numpy as np
class BetaRegressor(BaseEstimator, RegressorMixin):
'''
A Sklearn-compliant regressor class implementing the beta regression for
continuous responses over the (0, 1) support.
Parameters
----------
`fit_intercept: bool = True`
Optional flag indicating whether an intercept parameter should be
included. Defaults to `True`.
`maxiter: int = 100`
The maximum number of iterations the Newton method is allowed to
perform. Defaults to `100`.
    `eps: float = 1e-7`
        The error tolerance under which the numerical method is considered
        to have converged. Defaults to `1e-7`.
    `use_smart_seed: bool = True`
        Optional flag controlling whether a transformed linear regression
        should be used to provide better seed values for the algorithm.
        Defaults to `True`.
Raises
------
`AssertionError`
If the maximum number of iterations is non-positive.
`AssertionError`
If the error tolerance parameter is non-positive.
'''
def __init__(
self, fit_intercept: bool = True, maxiter: int = 100,
        eps: float = 1e-7, use_smart_seed: bool = True
) -> None:
assert maxiter > 0, 'Maximum number of iterations must be positive'
assert eps > 0.0, 'Error tolerance parameter must be positive'
super().__init__()
self.eps = eps
self.maxiter = maxiter
self.fit_intercept = fit_intercept
self.intercept_ = None
self.coef_ = None
self.shape = None
self.niter = None
self.status = 0
self.use_smart_seed = use_smart_seed
def __str__(self) -> str:
return str({
'intercept': None if self.intercept_ is None else np.round(self.intercept_, 4), # noqa: E501
'coef': None if self.coef_ is None else self.coef_.round(4),
'shape': None if self.shape is None else np.round(self.shape, 4),
'niter': self.niter, 'status': self.status
})
def fit(self, X: pd.DataFrame, y: pd.Series, verbose: bool = False):
'''
        Fits the predictive model by maximizing the log-likelihood function
        of the beta distribution under the expit mean function, given a data
        sample.
Parameters
----------
`X: pd.DataFrame`
A feature matrix.
`y: pd.Series`
A response array.
`verbose: bool = False`
An optional flag controlling the verbosity of outputs. Defaults to
`False`.
Raises
------
`AssertionError`
If one or more response values are out of bounds.
`AssertionError`
If the feature matrix (including the added dummy column to account
            for the intercept parameter) is perfectly collinear.
'''
assert y.min() > 0.0 and y.max() < 1.0, 'Response is out of bounds'
X_ = np.c_[np.ones_like(y), X] if self.fit_intercept else X
assert np.linalg.det(X_.T @ X_) != 0.0, 'Feature matrix displays perfect collinearity' # noqa: E501
size, d = X_.shape
check = False
niter = 0
logy = np.log(y)
logy_ = np.log(1.0 - y)
p = np.zeros(shape=d + 1)
g = np.zeros(shape=d + 1)
h = np.zeros(shape=(d + 1, d + 1))
if self.use_smart_seed:
# Transformed linear regression seeds
reg_ = TransformedTargetRegressor(
regressor=LinearRegression(fit_intercept=self.fit_intercept),
func=logit, inverse_func=expit
).fit(
X=X, y=y
)
if self.fit_intercept:
p[0] = reg_.regressor_.intercept_
p[int(self.fit_intercept):-1] = reg_.regressor_.coef_
mean = expit(X_ @ p[:-1])
p[-1] = np.sum(mean * (1.0 - mean)) / np.sum((y - mean) ** 2) - 1.0
else:
# Method of moments seeds
mean = y.mean()
if self.fit_intercept:
p[0] = logit(mean)
p[-1] = mean * (1.0 - mean) / y.var() - 1.0
while not (check or niter > self.maxiter):
niter += 1
coef, shape = p[:-1], p[-1]
mean = expit(X_ @ coef)
# dl/db
g[:-1] = X_.T @ (
(
-1.0 * polygamma(n=0, x=mean * shape) +
1.0 * polygamma(n=0, x=(1.0 - mean) * shape) +
logy - logy_
) * (
shape * mean * (1.0 - mean)
)
) / size
# dl/dp
g[-1] = (
size * polygamma(n=0, x=shape) +
mean.T @ (-polygamma(n=0, x=mean * shape) + logy) +
(1.0 - mean).T @ (-polygamma(n=0, x=(1.0 - mean) * shape) + logy_) # noqa: E501
) / size
# d2l/db2
sigma = (
(
polygamma(n=1, x=mean * shape) +
polygamma(n=1, x=(1.0 - mean) * shape)
) * (
-1.0 * (shape * mean * (1.0 - mean)) ** 2.0
) + (
-polygamma(n=0, x=mean * shape) +
polygamma(n=0, x=(1.0 - mean) * shape) +
logy - logy_
) * (
shape * (1.0 - 2.0 * mean) * mean * (1.0 - mean)
)
) / size
# h[:-1, :-1] = X_.T @ np.diag(sigma) @ X_
h[:-1, :-1] = np.multiply(sigma[:, None], X_).T @ X_
# d2l/dbdp
h[-1, :-1] = X_.T @ (
(
-shape * mean * polygamma(n=1, x=mean * shape) +
shape * (1.0 - mean) * polygamma(n=1, x=(1.0 - mean) * shape) + # noqa: E501
-polygamma(n=0, x=mean * shape) +
polygamma(n=0, x=(1.0 - mean) * shape) +
logy - logy_
) * (
mean * (1.0 - mean)
)
) / size
h[:-1, -1] = h[-1, :-1]
# d2l/dp2
h[-1, -1] = (
size * polygamma(n=1, x=shape) +
-mean.T @ (polygamma(n=1, x=mean * shape) * mean) +
-(1.0 - mean).T @ (polygamma(n=1, x=(1.0 - mean) * shape) * (1.0 - mean)) # noqa: E501
) / size
if np.linalg.det(h) == 0.0:
break
delta = -np.linalg.inv(a=h) @ g
loss = np.linalg.norm(x=delta)
check = loss <= self.eps
if verbose:
print(niter, p.round(decimals=4), np.round(loss, decimals=4))
p += delta
self.intercept_ = p[0] if self.fit_intercept else 0.0
self.coef_ = p[int(self.fit_intercept):-1]
self.shape = p[-1]
self.niter = niter
self.status = int(check)
return self
def predict(self, X: pd.DataFrame) -> np.ndarray:
'''
Evaluates the expected values of the probabilistic model given the
feature matrix.
Parameters
----------
`X: pd.DataFrame`
The feature matrix.
Returns
-------
`np.ndarray`
An array of predictions.
Raises
------
`AssertionError`
            If the number of features does not match that provided at fitting.
`AssertionError`
            If the regressor hasn't been successfully calibrated prior to
            predicting.
'''
        assert X.shape[1] == len(self.coef_), 'Dimensions of feature matrix do not match those of coefficient array' # noqa: E501
assert self.status == 1, 'Regressor must be successfully fitted before use' # noqa: E501
return expit(self.intercept_ + X @ self.coef_)
if __name__ == '__main__':
size, d = 100_000, 5
intercept = np.random.normal(scale=0.1)
coef = np.random.normal(size=d, scale=0.1)
shape = 10.0
X = np.random.normal(size=(size, d))
loc = expit(intercept + X @ coef)
y = np.random.beta(a=loc * shape, b=(1.0 - loc) * shape)
reg = BetaRegressor(use_smart_seed=False).fit(X=X, y=y)
print({
'intercept': np.round(intercept, 4), 'coef': coef.round(4),
'shape': np.round(shape, 4)
})
print(reg)
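    # Hypothetical sanity check (not part of the original snippet): provided
    # the fit converged (status == 1), predictions should correlate strongly
    # with the true means used to simulate the responses.
    print(np.round(np.corrcoef(loc, reg.predict(X=X))[0, 1], 4))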
Adding some clarifications:
- The code is a Sklearn-API-compliant implementation of a regressor class for beta regression.
- For individual data samples, we assume $y_i \mid \boldsymbol{x}_i \sim \mathcal{B}\!\left(\mu_i, \phi\right)$, where responses are independent conditional on $\boldsymbol{X}$, the feature matrix.
- The $\left(\mu, \phi\right) = \left(\tfrac{\alpha}{\alpha + \beta}, \alpha + \beta\right)$ parametrization is used, in the same spirit as other, more classical forms of regression (e.g., GLMs); the resulting density is written out after this list.
- This essentially means that the mean parameter is expressed as a transformation of $\boldsymbol{x}^\mathsf{T}\boldsymbol{\beta}$, the linear combination of all predictive features ($\boldsymbol{x}$) and regression coefficients ($\boldsymbol{\beta}$).
- The so-called inverse link function, or mean function, is here defined as $\mu\!\left(\boldsymbol{x}\right) = \left\{1 + \exp\!\left(-\boldsymbol{x}^\mathsf{T}\boldsymbol{\beta}\right)\right\}^{-1}$ -- the expit function -- essentially as a way to enforce the support over which the beta distribution is defined ($\Omega_y = \left(0, 1\right)$).
- The numerical solver implemented is Newton's method.
- The regression coefficient estimates are obtained by maximizing the log-likelihood function by way of this numerical solver; a sketch of the objective and of the Newton update is given after this list.
- Expressions for both the gradient and the Hessian matrix are included within the code; the details of their mathematical derivations are not reproduced here.
- The seed values provided to the numerical solver rely on either of 2 strategies: "smart seeds" or the method of moments.
- The method of moments assumes all coefficients are 0 except for the intercept parameter which, if included (via the `fit_intercept` argument), is evaluated so that the theoretical mean matches the sample mean of the response.
- Likewise, the dispersion parameter, $\phi$, is set so that theoretical and sample variances match (the resulting seed formulas are derived after this list).
- The smart seed strategy, on the other hand, uses a transformed regressor to fit a linear regression on the logit of the response (see the sketch after this list).
- Finally, a short illustrative example is provided under the `if __name__ == '__main__':` line.
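For reference, the model assumes the standard mean-precision parametrization of the beta density (as in Ferrari & Cribari-Neto's formulation of beta regression), so each response is distributed as

$$ f\!\left(y \mid \mu, \phi\right) = \frac{\Gamma\!\left(\phi\right)}{\Gamma\!\left(\mu \phi\right) \Gamma\!\left(\left(1 - \mu\right) \phi\right)} \, y^{\mu \phi - 1} {\left(1 - y\right)}^{\left(1 - \mu\right) \phi - 1}, \quad y \in \left(0, 1\right). $$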
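Writing $\mu_i = \operatorname{expit}\!\left(\boldsymbol{x}_i^\mathsf{T} \boldsymbol{\beta}\right)$, the objective maximized by `fit` (sample-averaged, as in the code) is

$$ \ell\!\left(\boldsymbol{\beta}, \phi\right) = \frac{1}{n} \sum_{i = 1}^{n} \left[ \log \Gamma\!\left(\phi\right) - \log \Gamma\!\left(\mu_i \phi\right) - \log \Gamma\!\left(\left(1 - \mu_i\right) \phi\right) + \left(\mu_i \phi - 1\right) \log y_i + \left(\left(1 - \mu_i\right) \phi - 1\right) \log\!\left(1 - y_i\right) \right], $$

and, collecting $\boldsymbol{\theta} = \left(\boldsymbol{\beta}, \phi\right)$ with gradient $\boldsymbol{g}$ and Hessian $\boldsymbol{H}$ as computed in the loop, each Newton iteration applies

$$ \boldsymbol{\theta} \leftarrow \boldsymbol{\theta} - \boldsymbol{H}^{-1} \boldsymbol{g}, $$

stopping once ${\left\lVert \boldsymbol{H}^{-1} \boldsymbol{g} \right\rVert}_2 \le \varepsilon$ (the `eps` parameter).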
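As for the method-of-moments seeds: under this parametrization, $\mathbb{E}\!\left[y\right] = \mu$ and $\operatorname{Var}\!\left[y\right] = \mu \left(1 - \mu\right) / \left(1 + \phi\right)$, so matching the sample mean $\bar{y}$ and sample variance $s^2$ of the response gives

$$ \hat{\beta}_0 = \operatorname{logit}\!\left(\bar{y}\right), \qquad \hat{\phi} = \frac{\bar{y} \left(1 - \bar{y}\right)}{s^2} - 1, $$

which is exactly what the method-of-moments branch of `fit` computes.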
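The smart seed strategy instead regresses $\operatorname{logit}\!\left(y\right)$ on the features by ordinary least squares (through `TransformedTargetRegressor`), keeps the resulting coefficients as seeds for $\boldsymbol{\beta}$, and then backs out a seed for $\phi$ by matching the aggregate residual variance: with fitted means $\hat{\mu}_i$,

$$ \hat{\phi} = \frac{\sum_{i} \hat{\mu}_i \left(1 - \hat{\mu}_i\right)}{\sum_{i} {\left(y_i - \hat{\mu}_i\right)}^2} - 1, $$

mirroring the `use_smart_seed` branch of `fit`.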