Poisson model restricted mean field approximation of Mr.Ash
- About
- Generate toy data
- Implementation of variational inference
- Fitting
- Results
- Calibration of predictions
- Evaluation of ranking power
About
This is a proof-of-concept implementation of a restricted mean field approximation with an independent-coefficients factorization for Poisson regression. Here, we restrict the factorized distribution to follow: \begin{equation*} q\left(\boldsymbol{\beta}, \boldsymbol{\gamma}, \boldsymbol{\pi} \right) = q\left( \boldsymbol{\pi} \right) \prod_{j=1}^{P}q\left( \beta_j \mid \boldsymbol{\gamma}_j\right) q\left( \boldsymbol{\gamma}_j \right) \end{equation*} where \begin{align*} q\left( \beta_j \mid \boldsymbol{\gamma}_j\right) &= \prod_{k=1}^{K} \left[ \mathcal{N} \left( \beta_j \mid \mu_{jk}, s_{jk}^2 \right) \right]^{\gamma_{jk}} \\ q\left( \boldsymbol{\gamma}_j \right) &= \prod_{k=1}^{K} \alpha_{jk}^{\gamma_{jk}} \end{align*}
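Under this factorization, the posterior moments of each coefficient have simple closed forms, \begin{equation*} \mathbb{E}_q\left[ \beta_j \right] = \sum_{k=1}^{K} \alpha_{jk} \mu_{jk}, \qquad \mathbb{E}_q\left[ \beta_j^2 \right] = \sum_{k=1}^{K} \alpha_{jk} \left( \mu_{jk}^2 + s_{jk}^2 \right), \end{equation*} and the point-mass (zero-variance) component contributes nothing to either sum. The implementation below uses the first moment as the point estimate of the coefficients.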
To keep the method general, we use an approximate ELBO, obtained by taking a lower bound of the exact ELBO using a Taylor series expansion of the gradient of the expected response (see the notes for details). Although the updates for the variational parameters, namely $\alpha_{jk}$, $\mu_{jk}$ and $s_{jk}^2$, are not analytically tractable, we developed an approximate coordinate ascent algorithm that iterates over the components.
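Concretely, the implementation below (see get_elbo) replaces the intractable expectation of the mean response under $q$ with a second-order expansion around the mean of the linear predictor. Writing $b(\eta) = \exp(\eta)$ and $\mathbf{r} = \mathbb{E}_q\left[\boldsymbol{\beta}\right]$, the approximation is \begin{equation*} \mathbb{E}_q\left[ b\left( \mathbf{X}_i \boldsymbol{\beta} \right) \right] \approx b\left( \mathbf{X}_i \mathbf{r} \right) + \frac{1}{2} b''\left( \mathbf{X}_i \mathbf{r} \right) \sum_{j=1}^{P} X_{ij}^2 \left( \mathbb{E}_q\left[ \beta_j^2 \right] - r_j^2 \right), \end{equation*} where only the marginal variances appear because of the independent-coefficients factorization.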
Here, we compare the restricted mean field approach with the "blackbox" PoissonRegressor from Python scikit-learn, with the regularization strength alpha set to 1.0 divided by the number of samples. We use the mean Poisson deviance to quantify the quality of the predictions, and the Gini index¹ to quantify the sparsity of the coefficients.
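For reference, the mean Poisson deviance reported by scikit-learn is $D = \frac{1}{N} \sum_{i=1}^{N} 2 \left( y_i \log \frac{y_i}{\hat{y}_i} - y_i + \hat{y}_i \right)$, with the convention $y_i \log (y_i / \hat{y}_i) = 0$ when $y_i = 0$. Below is a minimal sketch checking a direct computation against sklearn.metrics.mean_poisson_deviance; the helper name mean_pois_dev and the array values are only for illustration.
import numpy as np
from scipy.special import xlogy
from sklearn.metrics import mean_poisson_deviance

def mean_pois_dev(y_true, y_pred):
    # D = (1/N) * sum_i 2 * ( y_i * log(y_i / yhat_i) - y_i + yhat_i ),
    # where xlogy returns 0 whenever y_i = 0
    y_true = np.asarray(y_true, dtype = float)
    y_pred = np.asarray(y_pred, dtype = float)
    return np.mean(2.0 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred))

y_obs = np.array([0., 1., 3., 2.])     # illustrative values
y_hat = np.array([0.5, 1.2, 2.5, 2.0])
print(mean_pois_dev(y_obs, y_hat), mean_poisson_deviance(y_obs, y_hat))  # the two should agree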
# collapse-hide
import numpy as np
np.set_printoptions(precision = 4, suppress=True)
import pandas as pd
from scipy import optimize
from scipy import stats
from scipy.special import digamma
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_poisson_deviance
from sklearn.utils import gen_even_slices
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import ticker as plticker
from mpl_toolkits.axes_grid1 import make_axes_locatable
import sys
sys.path.append("../../utils/")
import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
Generate toy data
Let us consider a Poisson model with sparse coefficients, so that the number of causal variables ncausal is much smaller than the number of variables nvar in the model. To ensure sparsity of the coefficients, we use a Gaussian mixture prior of nGcomp components with variances given by $\sigma_k^2$ (sigmak2) and unknown mixture coefficients. The sparsity of the initialization is controlled by the variable sparsity, which specifies the mixture coefficient of the zero-th component $\mathcal{N}(0, 0)$ (the delta function). We are particularly interested in systems where the number of samples nsample is less than nvar. Validation is performed on a separate test dataset with ntest samples.
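Written out, the prior on each coefficient in the model is the scale mixture \begin{equation*} p\left( \beta_j \mid \boldsymbol{\pi} \right) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}\left( \beta_j \mid 0, \sigma_k^2 \right), \end{equation*} where the first component has $\sigma_k^2 = 0$ (sigmak2[0] in the code below) and acts as the point mass at zero. Note that sparsity only sets the initial weight of this component in the variational fit; the simulation below simply assigns the ncausal causal coefficients draws from $\mathcal{N}(0, 1)$.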
nsample = [50, 200]
nvar = [100, 100]
ncausal = 5
nGcomp = 10
sparsity = 0.95
prior_variance = 5
ntest = 2000
sigmak2 = np.array([prior_variance * np.square(np.power(2, (i)/nGcomp) - 1) for i in range(nGcomp)])
We use the exponential inverse link function $\theta_i = \exp\left(\mathbf{X}_i\boldsymbol{\beta}\right)$ to generate the response variable $y_i$ for $i = 1, \ldots, N$ samples using the Poisson probability distribution \begin{equation*} p\left(y_i \mid \mathbf{X}_i, \boldsymbol{\beta}\right) = \frac{\theta_i^{y_i}e^{-\theta_i}}{y_i!} \end{equation*} $\mathbf{X}$ is centered and scaled such that for each variable $j$, the variance is $\mathrm{var}(\mathbf{x}_j) = 1$.
# collapse-hide
def standardize(X):
Xnorm = (X - np.mean(X, axis = 0))
Xstd = Xnorm / np.std(Xnorm, axis = 0)
return Xstd
def poisson_data(X, beta):
Xbeta = np.dot(X, beta)
pred = np.exp(Xbeta)
Y = np.random.poisson(pred)
return Y
def gini(x):
n = len(x)
xabs = abs(x)
xsort = xabs[np.argsort(xabs)]
t1 = xsort / np.sum(xabs)
t2 = (n - np.arange(n) - 1 + 0.5) / n
gini = 1 - 2 * np.sum(t1 * t2)
return gini
def simulate_data(nsample, nvar, ncausal, ntest):
X = np.random.rand(nsample * nvar).reshape(nsample, nvar)
X = standardize(X)
Xtest = np.random.rand(ntest * nvar).reshape(ntest, nvar)
Xtest = standardize(Xtest)
beta = np.zeros(nvar)
causal_idx = np.random.choice(nvar, ncausal, replace = False)
beta[causal_idx] = np.random.normal(loc = 0., scale = 1., size = ncausal)
Y = poisson_data(X, beta)
Ytest = poisson_data(Xtest, beta)
return X, Y, beta, Xtest, Ytest
Let us have a look at the generated data.
# collapse-hide
X = [None for n in nsample]
Xtest = [None for n in nsample]
Y = [None for n in nsample]
Ytest = [None for n in nsample]
beta = [None for n in nsample]
for i, n in enumerate(nsample):
    X[i], Y[i], beta[i], Xtest[i], Ytest[i] = simulate_data(n, nvar[i], ncausal, ntest)
    print(f'N = {n}: there are {ncausal} non-zero coefficients with variance {np.var(beta[i][beta[i] != 0]):.4f}')
fig = plt.figure(figsize = (12,12))
ax = [fig.add_subplot(221),
fig.add_subplot(222),
fig.add_subplot(223),
fig.add_subplot(224)]
for i, n in enumerate(nsample):
Xbeta = np.dot(X[i], beta[i])
ax[i * 2].scatter(Xbeta, np.log(Y[i] + 1), s = 10)
#ax[i * 2].scatter(np.exp(Xbeta), Y[i], s = 10)
ax[i * 2 + 1].hist(Y[i])
ax[i * 2].set_xlabel(r'$\sum_i X_{ni} \beta_i$')
ax[i * 2].set_ylabel(r'$\log(Y_n + 1)$')
ax[i * 2 + 1].set_xlabel(r'$Y_n$')
ax[i * 2 + 1].set_ylabel('Number')
plt.tight_layout()
plt.show()
Implementation of variational inference
# collapse-hide
def get_btheta(X, beta):
    # b(theta_i) = exp(X_i beta) for the exponential inverse link;
    # the first and second derivatives of b are identical to b itself.
    Xbeta = np.dot(X, beta)
    btheta = np.exp(Xbeta)
    dbtheta = btheta.copy()
    ddbtheta = btheta.copy()
    return btheta, dbtheta, ddbtheta
def get_elbo(X, Y, dspr, alpha, mu, s2, logpi, sigmak2, debug = False):
nvar, nGcomp = alpha.shape
Ebeta = np.sum(np.multiply(alpha, mu)[:, 1:], axis = 1)
Ebeta2 = np.zeros(nvar)
mu2 = np.square(mu)
X2 = np.square(X)
bglm, dbglm, ddbglm = get_btheta(X, Ebeta)
t1 = np.dot(Y, np.dot(X, Ebeta)) / dspr
t3 = 0
t4 = 0
t5 = 0
for j in range(nvar):
for k in range(nGcomp):
mom2jk = mu2[j, k] + s2[j, k]
Ebeta2[j] += alpha[j, k] * mom2jk
if alpha[j, k] != 0:
t4 += - alpha[j, k] * np.log(alpha[j, k])
t5jk = 0
if k > 0:
t5jk += np.log(s2[j, k])
t5jk += - np.log(sigmak2[k])
t5jk += - mom2jk / sigmak2[k]
t5jk += 1
t5jk *= alpha[j, k] * 0.5
t5 += t5jk
x2Ebeta2 = np.einsum('ij, j->i', X2, Ebeta2)
x2r2 = np.einsum('ij, j->i', X2, np.square(Ebeta))
Eqbglm = bglm + 0.5 * np.multiply( ddbglm, (x2Ebeta2 - x2r2))
t2 = - np.sum(Eqbglm) / dspr
elbo = t1 + t2 + t3 + t4 + t5
return elbo
def ash_qL_pi_init(nGcomp, pi0, sparse = True):
'''
By default, pi is initialized with sparsity, i.e. pi[0] > pi[j], for j > 0
'''
pi = np.zeros(nGcomp)
pi[0] = pi0
pi[1:(nGcomp - 1)] = (1 - pi0) / (nGcomp - 1)
pi[nGcomp - 1] = 1 - np.sum(pi)
if not sparse:
pi = np.repeat(1 / nGcomp, nGcomp)
return pi
def ash_rmf(X, Y, sigmak2, dspr, binit,
a_dirich = None,
maxiter = 1000, tol = 1e-10, pi0 = 0.8, sparse_init = True,
debug = True, debug_step = 10):
'''
Current focus is on readability and debug.
'''
nsample, nvar = X.shape
nGcomp = sigmak2.shape[0]
# default dirichlet prior on pi
if a_dirich is None:
a_dirich = np.repeat(1., nGcomp)
a_dirich[0] = 1. # soft max at pi[0]
XTY = np.dot(X.T, Y)
'''
Initialize
'''
pi = ash_qL_pi_init(nGcomp, pi0, sparse = sparse_init)
logpi = np.log(pi)
alpha = np.repeat(np.array([pi]), nvar, axis = 0)
logalpha = np.log(alpha)
R = binit.copy()
bglm, dbglm, ddbglm = get_btheta(X, R)
#XBXjj = np.sum(np.multiply(ddbglm.reshape(-1, 1), np.square(X)), axis = 0)
XBXjj = np.einsum('i,ij,ij->j', ddbglm, X, X)
s2 = np.zeros((nvar, nGcomp))
mu = np.zeros((nvar, nGcomp))
s2[:, 1:] = 1. / (XBXjj.reshape(-1, 1) / dspr + 1 / sigmak2[1:])
_muj = XTY - np.dot(X.T, dbglm) + XBXjj * binit
mu = np.multiply(s2, _muj.reshape(-1,1)) / dspr
R = np.sum(np.multiply(alpha, mu)[:, 1:], axis = 1)
'''
Iteration
'''
elbo_old = get_elbo(X, Y, dspr, alpha, mu, s2, logpi, sigmak2)
pi_old = pi.copy()
if debug:
print(f"Starting ELBO: {elbo_old:.4f} Pi0: {pi[0]:.4f}")
for it in range(maxiter):
# E-step
for j in range(nvar):
s2[j, 0] = 0.
mu[j, 0] = 0.
logalpha[j, 0] = logpi[0]
_muj = XTY[j] - np.sum(np.multiply(dbglm, X[:, j])) + XBXjj[j] * R[j]
for k in range(1, nGcomp):
s2[j, k] = 1 / (XBXjj[j] / dspr + (1 / sigmak2[k]))
mu[j, k] = s2[j, k] * _muj / dspr
ssf = np.log(s2[j, k] / sigmak2[k])
ssr = mu[j, k] * mu[j, k] / s2[j, k]
logalpha[j, k] = logpi[k] + 0.5 * ssf + 0.5 * ssr
maxL = np.max(logalpha[j, :])
alpha[j, :] = np.exp(logalpha[j, :] - maxL)
alpha[j, :] /= np.sum(alpha[j, :])
R[j] = np.sum(np.multiply(alpha[j, 1:], mu[j, 1:]))
bglm, dbglm, ddbglm = get_btheta(X, R)
XBXjj = np.einsum('i,ij,ij->j', ddbglm, X, X)
# M-step
#R = np.sum(np.multiply(alpha, mu)[:, 1:], axis = 1)
#bglm, dbglm, ddbglm = get_btheta(X, R)
bk = np.sum(alpha, axis = 0) + a_dirich
pi = bk / np.sum(bk)
logpi = digamma(bk) - digamma(np.sum(bk))
elbo = get_elbo(X, Y, dspr, alpha, mu, s2, logpi, sigmak2)
eps_elbo = elbo - elbo_old
eps_pi = max([abs(x - y) for x, y in zip(pi, pi_old)])
if debug and (it % debug_step == 0):
print(f"Iteration {it}. ELBO: {elbo:.4f} Pi0: {pi[0]:.4f} epsilon: {eps_pi:.4f}, {eps_elbo:.4f}")
'''
Check termination, continue otherwise.
'''
if eps_pi < tol:
print(eps_pi)
break
'''
Keep this step in memory.
Use deep copy to avoid in place substitutions.
'''
pi_old = pi.copy()
elbo_old = elbo
return pi, alpha, mu, s2, R
Fitting
Create a data frame for the result summary.
# collapse-hide
res_df = [pd.DataFrame(columns = ['Method', 'RMSE', 'MeanPoisDev', 'Gini Index']) for n in nsample]
data_Xbeta = [np.dot(Xtest[i], beta[i]) for i in range(len(nsample))]
data_mpd = [mean_poisson_deviance(Ytest[i], np.exp(data_Xbeta[i])) for i in range(len(nsample))]
data_rmse = [np.sqrt(mean_squared_error(Ytest[i], np.exp(data_Xbeta[i]))) for i in range(len(nsample))]
data_gini = [gini(beta[i]) for i in range(len(nsample))]
for i in range(len(nsample)):
res_df[i] = res_df[i].append(pd.DataFrame([['Data', data_rmse[i], data_mpd[i], data_gini[i]]],
columns = res_df[i].columns))
# collapse-hide
clf = [linear_model.PoissonRegressor(alpha = 1.0/n, fit_intercept = False, max_iter = 5000) for n in nsample]
for i, n in enumerate(nsample):
clf[i].fit(X[i], Y[i])
clf_Xbeta = [np.dot(Xtest[i], clf[i].coef_) for i in range(len(nsample))]
clf_mpd = [mean_poisson_deviance(Ytest[i],
clf[i].predict(Xtest[i])) for i in range(len(nsample))]
clf_rmse = [np.sqrt(mean_squared_error(Ytest[i],
clf[i].predict(Xtest[i]))) for i in range(len(nsample))]
clf_gini = [gini(clf[i].coef_) for i in range(len(nsample))]
for i in range(len(nsample)):
res_df[i] = res_df[i].append(pd.DataFrame([['GLM_Py', clf_rmse[i], clf_mpd[i], clf_gini[i]]],
columns = res_df[i].columns))
- RMF Ash
# collapse-hide
pi = [None for n in nsample]
alpha = [None for n in nsample]
mu = [None for n in nsample]
s2 = [None for n in nsample]
R = [None for n in nsample]
for i in range(len(nsample)):
binit = np.repeat(0.1, nvar[i])
dspr = 1.
pi[i], alpha[i], mu[i], s2[i], R[i] = ash_rmf(X[i], Y[i], sigmak2, dspr, binit,
tol = 1e-4, debug_step = 1, maxiter = 1000,
pi0 = sparsity, sparse_init = True)
RMFash_Xbeta = [np.dot(Xtest[i], R[i]) for i in range(len(nsample))]
RMFash_rmse = [np.sqrt(mean_squared_error(Ytest[i], np.exp(RMFash_Xbeta[i])))
for i in range(len(nsample))]
RMFash_mpd = [mean_poisson_deviance(Ytest[i], np.exp(RMFash_Xbeta[i]))
for i in range(len(nsample))]
RMFash_gini = [gini(R[i]) for i in range(len(nsample))]
for i in range(len(nsample)):
res_df[i] = res_df[i].append(pd.DataFrame([['RMFash', RMFash_rmse[i], RMFash_mpd[i], RMFash_gini[i]]],
columns = res_df[i].columns))
Results
i = 0
print (f'N = {nsample[i]}, P = {nvar[i]}')
res_df[i]
i = 1
print (f'N = {nsample[i]}, P = {nvar[i]}')
res_df[i]
Calibration of predictions
We bin the test samples into 40 groups ranked by the observed response and compare, within each bin, the average observed value with the average prediction from each model.
# collapse-hide
n_bins = 40
fig = plt.figure(figsize = (16, 8))
ax = [fig.add_subplot(121),
fig.add_subplot(122)]
for i, n in enumerate(nsample):
whichX = Xtest[i].copy()
whichY = Ytest[i].copy()
idx_sort = np.argsort(whichY)
q = np.arange(0, 1, 1/n_bins) + 0.5/n_bins
seg_true = np.zeros(n_bins)
seg_clf = np.zeros(n_bins)
seg_RMF = np.zeros(n_bins)
for j, sl in enumerate(gen_even_slices(len(whichY), n_bins)):
seg_true[j] = np.average(whichY[idx_sort][sl])
ax[i].scatter(q, np.log(seg_true+1), s = 10, label = 'Observations')
ax[i].plot(q, np.log(seg_true+1))
Xbeta_clf = np.dot(whichX, clf[i].coef_)
Ypred_clf = clf[i].predict(whichX)
for j, sl in enumerate(gen_even_slices(len(whichY), n_bins)):
seg_clf[j] = np.average(Ypred_clf[idx_sort][sl])
ax[i].scatter(q, np.log(seg_clf + 1), s = 10, label = 'GLM_py')
ax[i].plot(q, np.log(seg_clf + 1))
Xbeta_RMF = np.dot(whichX, R[i])
Ypred_RMF = np.exp(Xbeta_RMF)
for j, sl in enumerate(gen_even_slices(len(whichY), n_bins)):
seg_RMF[j] = np.average(Ypred_RMF[idx_sort][sl])
ax[i].scatter(q, np.log(seg_RMF + 1), s = 10, label = 'RMFash')
ax[i].plot(q, np.log(seg_RMF + 1))
leg = ax[i].legend(markerscale = 4)
ax[i].set_xlabel('Fraction of samples')
ax[i].set_ylabel(r'$\log (\langle{Y}\rangle + 1)$')
ax[i].set_title(f'N = {nsample[i]}, P = {nvar[i]}', pad = 20)
plt.tight_layout(pad = 2.0)
plt.show()
Evaluation of ranking power
To evaluate the ranking power, we use the Lorenz curve. It is widely used in economics to describe wealth inequality, but it is instructive here because of the sparsity in $\mathbf{y}$. We rank the samples by the predicted $\mathbf{y}$ of each model. The fraction of ranked samples is shown on the x-axis, and the cumulative sum of the observed $\mathbf{y}$ up to the corresponding ranked sample is shown on the y-axis. Both axes are normalized to 1.
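In other words, if $y_{(1)}, \ldots, y_{(N)}$ are the observed test responses ordered by a model's predictions (smallest predicted value first), the curve traces the points \begin{equation*} \left( \frac{m}{N}, \; \frac{\sum_{i=1}^{m} y_{(i)}}{\sum_{i=1}^{N} y_{(i)}} \right), \qquad m = 1, \ldots, N, \end{equation*} so a model that ranks the samples well keeps the curve low until the few samples with large counts are reached.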
# collapse-hide
def lorenz_curve(y_true, y_pred):
y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
nsample = y_true.shape[0]
ranking = np.argsort(y_pred)
ranked_y = y_true[ranking]
cumulated_y = np.cumsum(ranked_y).astype(float)
cumulated_y /= cumulated_y[-1]
cumulated_obs = np.cumsum(np.repeat(1., nsample))
cumulated_obs /= cumulated_obs[-1]
return cumulated_obs, cumulated_y
fig = plt.figure(figsize = (16, 8))
ax = [fig.add_subplot(121),
fig.add_subplot(122)]
for i, n in enumerate(nsample):
whichX = Xtest[i].copy()
whichY = Ytest[i].copy()
cum_obs, cum_y = lorenz_curve(whichY, whichY)
ax[i].plot(cum_obs, cum_y, label = "Observation")
cum_obs, cum_y = lorenz_curve(whichY, clf[i].predict(whichX))
ax[i].plot(cum_obs, cum_y, label = "GLM_py")
cum_obs, cum_y = lorenz_curve(whichY, np.exp(np.dot(whichX, R[i])))
ax[i].plot(cum_obs, cum_y, label = "RMFash")
rand_idx = np.arange(len(whichY))
np.random.shuffle(rand_idx)
cum_obs, cum_y = lorenz_curve(whichY, whichY[rand_idx])
ax[i].plot(cum_obs, cum_y, linestyle="dotted", color="black", label="Random")
leg = ax[i].legend(handlelength = 1.5)
ax[i].set_xlabel('Ranked samples')
ax[i].set_ylabel('Cumulative sum of Y')
ax[i].set_title(f'N = {nsample[i]}, P = {nvar[i]}', pad = 20)
plt.tight_layout(pad = 2.0)
plt.show()
1. For a review of sparsity indices, see Hurley and Rickard (2009). Comparing Measures of Sparsity. arXiv:0811.4706v2.