EBMR with adaptive shrinkage prior
About
We consider the following hierarchical Empirical Bayes (EB) regression model:
$p\left(\mathbf{y} \mid s, \mathbf{b}\right) = N\left(\mathbf{y} \mid \mathbf{X}\mathbf{b}, s^2 I_n\right)$
$p\left(\mathbf{b} \mid s_b, s,\mathbf{W}\right) = N\left(\mathbf{b} \mid 0,s_b^2 s^2 \mathbf{W}\right)$
$p\left(w_j \mid g\right) = g \in \mathcal{G}.$
where $\mathbf{W}=\mathrm{diag}(w_1,\dots,w_p)$ is a diagonal matrix of prior variances, $s, s_b$ are scalars, and $g$ is a prior distribution that is to be estimated. We refer to the model as the Empirical Bayes Multiple Regression (EBMR). We split this model into two overlapping parts:
- The first two equations define the Generalized Ridge Regression (GRR) model.
- We call the combination of the last two equations as the "Empirical Bayes Normal Variances" (EBNV) model.
Here, we introduce three different priors for $g$ in the EBNV model and solve GRR using the EM-SVD method (see here). The three priors used for EBNV are:
- Point Mass $p\left(w_j\right) = \delta(w_j - \lambda_k)$. This corresponds to ridge regression.
- Exponential $p\left(w_j\right) = \lambda \exp(-\lambda w_j)$. This corresponds to Lasso.
- Mixture of point mass $p\left(w_j\right) = \sum_{k=1}^{K}\pi_k\delta(w_j - \lambda_k)$. This corresponds to the adaptive shrinkage prior (ash). We consider $\lambda_k$ as known inputs and solve for $\pi_k$ in the EBNV step.
The derivations for the point mass and the exponential prior are provided by Matthew in the corresponding Overleaf document, while some handwritten notes for the mixture of point mass is here.
import numpy as np
import pandas as pd
from scipy import linalg as sc_linalg
import matplotlib.pyplot as plt
import ebmrPy
from ebmrPy.inference.ebmr import EBMR
from ebmrPy.inference import f_elbo
from ebmrPy.inference import f_sigma
from ebmrPy.inference import penalized_em
from ebmrPy.utils import log_density
from pymir import mpl_stylesheet
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 72, fontsize = 18)
def standardize(X):
Xnorm = (X - np.mean(X, axis = 0))
Xstd = Xnorm / np.sqrt((Xnorm * Xnorm).sum(axis = 0))
return Xstd
def ridge_data(n, p, sd=5.0, sb2=100.0, seed=100):
np.random.seed(seed)
X = np.random.normal(0, 1, n * p).reshape(n, p)
X = standardize(X)
btrue = np.random.normal(0, np.sqrt(sb2), p)
y = np.dot(X, btrue) + np.random.normal(0, sd, n)
y = y - np.mean(y)
#y = y / np.std(y)
return X, y, btrue
def sparse_data(nsample, nvar, neff, errsigma, sb2=100, seed=200):
np.random.seed(seed)
X = np.random.normal(0, 1, nsample * nvar).reshape(nsample, nvar)
X = standardize(X)
btrue = np.zeros(nvar)
bidx = np.random.choice(nvar, neff , replace = False)
btrue[bidx] = np.random.normal(0, np.sqrt(sb2), neff)
y = np.dot(X, btrue) + np.random.normal(0, errsigma, nsample)
y = y - np.mean(y)
#y = y / np.std(y)
return X, y, btrue
def test_data(nsample, btrue, errsigma):
nvar = btrue.shape[0]
X = np.random.normal(0, 1, nsample * nvar).reshape(nsample, nvar)
X = standardize(X)
y = np.dot(X, btrue) + np.random.normal(0, errsigma, nsample)
y = y - np.mean(y)
#y = y / np.std(y)
return X, y
n = 50
p = 100
peff = 5
sb = 5.0
sd = 10.0
sb2 = sb * sb
X, y, btrue = ridge_data(n, p, sd, sb2, seed=100)
#X, y, btrue = sparse_data(n, p, peff, sd, sb2, seed = 200)
Xtest, ytest = test_data(200, btrue, sd)
yvar = np.var(y)
residual_var = np.var(y - np.dot(X, btrue))
explained_var = yvar - residual_var
print(f"Total variance of y is {yvar:.3f} and the residual variance is {residual_var:.3f}")
print(f"Hence, PVE is {(yvar - residual_var) / yvar:.3f}")
EBMR
We use the Python implementation of EBMR, see here. I have switched off the convergence criteria, so that we can monitor how the ELBO evolves with iteration. This will evaluate the results over all the max_iter
steps and does not gurantee the best solution (if the convergence criteria has not been met after max_iter
steps).
priors = ['point', 'dexp', 'mix_point']
#priors = ['point']
mcolors = {'point': '#2D69C4',
'dexp' : '#93AA00',
'mix_point': '#CC2529'
}
mlabels = {'point': 'Ridge',
'dexp' : 'Lasso',
'mix_point': 'Ash'
}
ebmr_ridge = dict()
wks = np.array([0.001, 1.0, 2.0, 3.0, 4.0])
for mprior in priors:
mix_prior = None
if mprior == 'mix_point': mix_prior = wks
if mprior == 'dexp':# or mprior == 'mix_point':
grr_method = 'mle'
else:
grr_method = 'em_svd'
ebmr_ridge[mprior] = EBMR(X, y, prior=mprior,
grr = grr_method, sigma = 'full', inverse = 'direct',
s2_init = 1, sb2_init = 1,
max_iter = 100, tol = 1e-8,
mll_calc = True,
mix_point_w = mix_prior,
ignore_convergence = True
)
ebmr_ridge[mprior].update()
We note the ELBO at the last step is similar for point
prior (Ridge) and mix_point
prior (Ash).
for mprior in priors:
print(f"ELBO for {mprior} prior: {ebmr_ridge[mprior].elbo:.4f}")
Here are the optimal values of $s^2$, $s_b^2$ and $\bar{w_0}$ (strictly, they are values obtained after the last step and I assume we have reached convergence). There are $p$ elements in the diagonal vector $\bar{\mathbf{W}}$, of which $w_0$ is the first element.
data = [[ebmr_ridge[x].s2, ebmr_ridge[x].sb2, ebmr_ridge[x].Wbar[0],
ebmr_ridge[x].s2 * ebmr_ridge[x].sb2 * ebmr_ridge[x].Wbar[0]]
for x in priors]
colnames = ['s2', 'sb2', 'w_0', 's2 * sb2 * w_0']
rownames = priors.copy()
df = pd.DataFrame.from_records(data, columns = colnames, index = rownames)
df.style.format("{:.3f}")
Finally, here are the mixtures coefficients estimated by EBMR for the ash regression.
data = [wks, wks * ebmr_ridge['mix_point'].sb2, ebmr_ridge['mix_point'].mixcoef]
rownames = ['w_k', 'sb2 * w_k', 'pi_k']
df = pd.DataFrame.from_records(data, index = rownames)
# https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
df.style.format("{:.3f}")
The ELBO is decreasing, which is wrong. However asymptotically, the iteration updates lead to exactly same results for ridge regression and ash regression.
fig = plt.figure()
ax1 = fig.add_subplot(111)
for mprior in priors:
mres = ebmr_ridge[mprior]
xvals = np.arange(mres.n_iter)
ax1.scatter(xvals, mres.elbo_path[1:], color = mcolors[mprior], s=6)
ax1.plot(xvals, mres.elbo_path[1:], color = mcolors[mprior], label = mlabels[mprior])
legend1 = ax1.legend(loc = 'center right', bbox_to_anchor = (0.95, 0.3), frameon = False,
handlelength = 1.0)
#legend1._legend_box.align = "left"
#lframe = legend1.get_frame()
#lframe.set_linewidth(0)
ax1.set_xlabel("Iteration step")
ax1.set_ylabel("ELBO")
plt.tight_layout()
plt.show()
The plot on the left shows the prediction of the different methods on a separate test data (plot on the left). The plot on the right compares the expectation of the coefficients of the variables ($\mathbf{b}$) for the different methods. The colors are same as in the plot above. The ridge regression and the ash regression gives identical results.
def lims_xy(ax):
lims = [
np.min([ax.get_xlim(), ax.get_ylim()]), # min of both axes
np.max([ax.get_xlim(), ax.get_ylim()]), # max of both axes
]
return lims
def plot_diag(ax):
lims = lims_xy(ax)
ax.plot(lims, lims, ls='dotted', color='gray')
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
#ax2.scatter(np.arange(p), btrue, color = 'black')
for mprior in priors:
mres = ebmr_ridge[mprior]
ypred = np.dot(Xtest, mres.mu)
ax1.scatter(ytest, ypred, color = mcolors[mprior], alpha = 0.5)
#ax2.scatter(np.arange(p), mres.mu, color = mcolors[mprior], alpha = 0.5)
ax2.scatter(btrue, mres.mu, color = mcolors[mprior], alpha = 0.5)
plot_diag(ax1)
plot_diag(ax2)
ax1.set_xlabel("y")
ax1.set_ylabel("y_pred")
ax2.set_xlabel("b")
ax2.set_ylabel("b_pred")
plt.tight_layout()
plt.show()
Compare ELBO with evidence
To check if the ELBOs are correct, I compare the ELBO with the the marginal log likelihood $p\left(\mathbf{y} \mid s^2, s_b^2\right)$ (also called the evidence), calculated at every step for the last 20 iterations. The ELBO is shown with the colored points while the evidence is the black dotted line.
fig = plt.figure(figsize = (18, 6))
ax = [None for mprior in priors]
nstep = 20
for i, mprior in enumerate(priors):
ax[i] = fig.add_subplot(1, 3, i+1)
mres = ebmr_ridge[mprior]
xvals = np.arange(mres.n_iter+1)[-nstep:]
#ax[i].scatter(mres.elbo_path[2:], mres.mll_path[2:], color = mcolors[mprior], s = 20)
ax[i].plot(xvals, mres.mll_path[-nstep:], color = 'black', ls='dotted', label = "Evidence")
ax[i].scatter(xvals, mres.elbo_path[-nstep:], color = mcolors[mprior], s=50)
#ax[i].plot(xvals, mres.elbo_path[-nstep:], color = mcolors[mprior], lw=1, label = "ELBO")
ax[i].text(0.7, 0.2, mlabels[mprior], transform=ax[i].transAxes)
ax[i].set_xlabel("Iteration")
ax[i].set_ylabel("ELBO / Evidence")
plt.tight_layout()
plt.show()