About

We consider the following hierarchical Empirical Bayes (EB) regression model:

$p\left(\mathbf{y} \mid s, \mathbf{b}\right) = N\left(\mathbf{y} \mid \mathbf{X}\mathbf{b}, s^2 I_n\right)$

$p\left(\mathbf{b} \mid s_b, s,\mathbf{W}\right) = N\left(\mathbf{b} \mid 0,s_b^2 s^2 \mathbf{W}\right)$

$p\left(w_j \mid g\right) = g(w_j), \qquad g \in \mathcal{G},$

where $\mathbf{W}=\mathrm{diag}(w_1,\dots,w_p)$ is a diagonal matrix of prior variances, $s$ and $s_b$ are scalars, and $g$ is a prior distribution to be estimated from a family $\mathcal{G}$. We refer to this model as Empirical Bayes Multiple Regression (EBMR) and split it into two overlapping parts:

  1. The first two equations define the Generalized Ridge Regression (GRR) model (a minimal sketch of the GRR posterior for fixed $\mathbf{W}$ is given right after this list).
  2. We call the combination of the last two equations the "Empirical Bayes Normal Variances" (EBNV) model.
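
For fixed $\mathbf{W}$, $s^2$ and $s_b^2$, the GRR part is a Gaussian likelihood with a conjugate Gaussian prior, so the posterior of $\mathbf{b}$ is available in closed form. The snippet below is a minimal NumPy sketch of that computation using the standard ridge algebra; it is only an illustration and is not the EM-SVD routine used by ebmrPy.

import numpy as np

def grr_posterior(X, y, s2, sb2, w):
    # Posterior of b under y ~ N(Xb, s2*I), b ~ N(0, s2 * sb2 * diag(w)).
    # Standard conjugate Gaussian algebra (illustration only, not ebmrPy code).
    A = X.T @ X + np.diag(1.0 / (sb2 * w))
    Ainv = np.linalg.inv(A)
    mu = Ainv @ X.T @ y      # posterior mean of b
    Sigma = s2 * Ainv        # posterior covariance of b
    return mu, Sigma

With the point-mass prior, all $w_j$ are equal and this reduces to ordinary ridge regression with penalty $1/(s_b^2 w)$.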

Here, we introduce three different priors for $g$ in the EBNV model and solve GRR using the EM-SVD method (see here). The three priors used for EBNV are:

  1. Point mass $p\left(w_j\right) = \delta(w_j - \lambda)$. This corresponds to ridge regression.
  2. Exponential $p\left(w_j\right) = \lambda \exp(-\lambda w_j)$. This corresponds to the Lasso.
  3. Mixture of point masses $p\left(w_j\right) = \sum_{k=1}^{K}\pi_k\delta(w_j - \lambda_k)$. This corresponds to the adaptive shrinkage (ash) prior. We treat the $\lambda_k$ as known inputs and solve for the $\pi_k$ in the EBNV step.

The derivations for the point mass and the exponential priors are provided by Matthew in the corresponding Overleaf document, while some handwritten notes for the mixture of point masses are here. A quick numerical check of the exponential-prior/Lasso correspondence is sketched below.
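
As a quick sanity check on the exponential case, recall that a zero-mean normal with an Exp($\lambda$) variance is marginally a Laplace distribution with scale $1/\sqrt{2\lambda}$, which is why this prior corresponds to the Lasso (here the variance is $s_b^2 s^2 w_j$, so $b_j$ is marginally Laplace up to that scale factor). The Monte Carlo snippet below verifies the scale-mixture identity numerically; it is an illustration and not part of ebmrPy.

import numpy as np
from scipy import stats

# Check that b | w ~ N(0, w) with w ~ Exp(rate=lam) is marginally Laplace(scale = 1/sqrt(2*lam)).
rng = np.random.default_rng(1)
lam = 2.0
w = rng.exponential(scale = 1.0 / lam, size = 200000)   # numpy uses scale = 1/rate
b = rng.normal(0.0, np.sqrt(w))                         # scale mixture of normals

grid = np.linspace(-3, 3, 7)
empirical = np.array([np.mean(b <= t) for t in grid])   # empirical CDF of the mixture
theoretical = stats.laplace.cdf(grid, scale = 1.0 / np.sqrt(2 * lam))
print(np.max(np.abs(empirical - theoretical)))          # should be close to zero (Monte Carlo error only)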

import numpy as np
import pandas as pd
from scipy import linalg as sc_linalg
import matplotlib.pyplot as plt

import ebmrPy
from ebmrPy.inference.ebmr import EBMR
from ebmrPy.inference import f_elbo
from ebmrPy.inference import f_sigma
from ebmrPy.inference import penalized_em
from ebmrPy.utils import log_density

from pymir import mpl_stylesheet
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 72, fontsize = 18)

Model Setup

We use a simple simulation to evaluate the three priors: standardized Gaussian predictors with either dense (ridge-like) or sparse effects, plus an independent test set for evaluating predictions.

def standardize(X):
    Xnorm = (X - np.mean(X, axis = 0))
    Xstd = Xnorm / np.sqrt((Xnorm * Xnorm).sum(axis = 0))
    return Xstd

def ridge_data(n, p, sd=5.0, sb2=100.0, seed=100):
    np.random.seed(seed)
    X = np.random.normal(0, 1, n * p).reshape(n, p)
    X = standardize(X)
    btrue = np.random.normal(0, np.sqrt(sb2), p)
    y = np.dot(X, btrue) + np.random.normal(0, sd, n)
    y = y - np.mean(y)
    #y = y / np.std(y)
    return X, y, btrue

def sparse_data(nsample, nvar, neff, errsigma, sb2=100, seed=200):
    np.random.seed(seed)
    X = np.random.normal(0, 1, nsample * nvar).reshape(nsample, nvar)
    X = standardize(X)
    btrue = np.zeros(nvar)
    bidx = np.random.choice(nvar, neff, replace = False)
    btrue[bidx] = np.random.normal(0, np.sqrt(sb2), neff)
    y = np.dot(X, btrue) + np.random.normal(0, errsigma, nsample)
    y = y - np.mean(y)
    #y = y / np.std(y)
    return X, y, btrue

def test_data(nsample, btrue, errsigma):
    nvar = btrue.shape[0]
    X = np.random.normal(0, 1, nsample * nvar).reshape(nsample, nvar)
    X = standardize(X)
    y = np.dot(X, btrue) + np.random.normal(0, errsigma, nsample)
    y = y - np.mean(y)
    #y = y / np.std(y)
    return X, y

n = 50
p = 100
peff = 5
sb = 5.0
sd = 10.0

sb2 = sb * sb
X, y, btrue = ridge_data(n, p, sd, sb2, seed=100)
#X, y, btrue = sparse_data(n, p, peff, sd, sb2, seed = 200)
Xtest, ytest = test_data(200, btrue, sd)

yvar = np.var(y)
residual_var = np.var(y - np.dot(X, btrue))
explained_var = yvar - residual_var
print(f"Total variance of y is {yvar:.3f} and the residual variance is {residual_var:.3f}")
print(f"Hence, PVE is {(yvar - residual_var) / yvar:.3f}")
Total variance of y is 151.364 and the residual variance is 112.677
Hence, PVE is 0.256

EBMR

We use the Python implementation of EBMR, see here. I have switched off the convergence check so that we can monitor how the ELBO evolves with the iterations. The algorithm therefore runs for all max_iter steps, and the reported estimates are simply those from the last step; they are not guaranteed to be the best solution if convergence has not been reached by then.

priors = ['point', 'dexp', 'mix_point']
#priors = ['point']
mcolors = {'point': '#2D69C4', 
           'dexp' : '#93AA00',
           'mix_point': '#CC2529'
          }
mlabels = {'point': 'Ridge', 
           'dexp' : 'Lasso',
           'mix_point': 'Ash'
          }

ebmr_ridge = dict()
wks = np.array([0.001, 1.0, 2.0, 3.0, 4.0])
for mprior in priors:
    mix_prior = None
    if mprior == 'mix_point': mix_prior = wks
    if mprior == 'dexp':# or mprior == 'mix_point': 
        grr_method = 'mle'
    else:
        grr_method = 'em_svd'
    ebmr_ridge[mprior] = EBMR(X, y, prior=mprior,
                               grr = grr_method, sigma = 'full', inverse = 'direct',
                               s2_init = 1, sb2_init = 1,
                               max_iter = 100, tol = 1e-8,
                               mll_calc = True,
                               mix_point_w = mix_prior,
                               ignore_convergence = True
                              )
    ebmr_ridge[mprior].update()
2021-03-29 12:25:32,959 | ebmrPy.inference.ebmr | DEBUG | EBMR using point prior, em_svd grr, full b posterior variance, direct inversion
2021-03-29 12:25:33,183 | ebmrPy.inference.ebmr | DEBUG | EBMR using dexp prior, mle grr, full b posterior variance, direct inversion
2021-03-29 12:25:33,263 | ebmrPy.inference.ebmr | DEBUG | EBMR using mix_point prior, em_svd grr, full b posterior variance, direct inversion

We note that the ELBO at the last step is similar for the point prior (Ridge) and the mix_point prior (Ash).

for mprior in priors:
    print(f"ELBO for {mprior} prior: {ebmr_ridge[mprior].elbo:.4f}")
ELBO for point prior: -194.8095
ELBO for dexp prior: -217.3764
ELBO for mix_point prior: -194.0409

Here are the optimal values of $s^2$, $s_b^2$ and $\bar{w}_0$ (strictly, they are the values obtained after the last step, and I assume we have reached convergence). The diagonal of $\bar{\mathbf{W}}$ has $p$ elements, of which $\bar{w}_0$ is the first.

data = [[ebmr_ridge[x].s2,  ebmr_ridge[x].sb2,  ebmr_ridge[x].Wbar[0],
         ebmr_ridge[x].s2 * ebmr_ridge[x].sb2 * ebmr_ridge[x].Wbar[0]]
        for x in priors]
colnames = ['s2', 'sb2', 'w_0', 's2 * sb2 * w_0']
rownames = priors.copy()
df = pd.DataFrame.from_records(data, columns = colnames, index = rownames)
df.style.format("{:.3f}")
            s2       sb2     w_0     s2 * sb2 * w_0
point       67.007   1.000   0.640   42.898
dexp        14.379   1.000   2.808   40.370
mix_point   53.139   0.325   2.815   48.585

Finally, here are the mixture coefficients estimated by EBMR for the ash regression.

data = [wks, wks * ebmr_ridge['mix_point'].sb2, ebmr_ridge['mix_point'].mixcoef]
rownames = ['w_k', 'sb2 * w_k', 'pi_k']
df = pd.DataFrame.from_records(data, index = rownames)
# https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
df.style.format("{:.3f}")
               0      1      2      3      4
w_k        0.001  1.000  2.000  3.000  4.000
sb2 * w_k  0.000  0.325  0.650  0.974  1.299
pi_k       0.200  0.200  0.200  0.200  0.200

The ELBO is decreasing, which is wrong: a correct variational EM update should never decrease the ELBO across iterations. Nevertheless, as the iterations proceed, the updates lead to essentially the same results for ridge regression and ash regression.
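
A quick programmatic check on the stored elbo_path attribute (the same one plotted below) makes this explicit; it is a small addition to the original analysis.

# Flag priors whose ELBO path ever decreases between consecutive iterations.
for mprior in priors:
    diffs = np.diff(ebmr_ridge[mprior].elbo_path[1:])
    print(f"{mprior}: min ELBO change = {np.min(diffs):.4f}, non-decreasing = {np.all(diffs >= -1e-8)}")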

fig = plt.figure()
ax1 = fig.add_subplot(111)
for mprior in priors:
    mres = ebmr_ridge[mprior]
    xvals = np.arange(mres.n_iter)
    ax1.scatter(xvals, mres.elbo_path[1:], color = mcolors[mprior], s=6)
    ax1.plot(xvals, mres.elbo_path[1:], color = mcolors[mprior], label = mlabels[mprior])

legend1 = ax1.legend(loc = 'center right', bbox_to_anchor = (0.95, 0.3), frameon = False,
                     handlelength = 1.0)
#legend1._legend_box.align = "left"
#lframe = legend1.get_frame()
#lframe.set_linewidth(0)

ax1.set_xlabel("Iteration step")
ax1.set_ylabel("ELBO")
plt.tight_layout()
plt.show()

The plot on the left shows the predictions of the different methods on separate test data. The plot on the right compares the expected values of the coefficients ($\mathbf{b}$) from the different methods against the true values. The colors are the same as in the plot above. The ridge regression and the ash regression give identical results.
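
To quantify the prediction comparison in the left panel, one can also compute the test RMSE of each method from the same posterior means (mres.mu); this is a small addition to the original analysis.

# Root-mean-square prediction error on the held-out test set for each prior.
for mprior in priors:
    ypred = np.dot(Xtest, ebmr_ridge[mprior].mu)
    rmse = np.sqrt(np.mean((ytest - ypred) ** 2))
    print(f"Test RMSE for {mprior}: {rmse:.3f}")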

def lims_xy(ax):
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    return lims

def plot_diag(ax):
    lims = lims_xy(ax)
    ax.plot(lims, lims, ls='dotted', color='gray')


fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
#ax2.scatter(np.arange(p), btrue, color = 'black')



for mprior in priors:
    mres = ebmr_ridge[mprior]
    ypred = np.dot(Xtest, mres.mu)
    ax1.scatter(ytest, ypred, color = mcolors[mprior], alpha = 0.5)
    #ax2.scatter(np.arange(p), mres.mu, color = mcolors[mprior], alpha = 0.5)
    ax2.scatter(btrue, mres.mu, color = mcolors[mprior], alpha = 0.5)
plot_diag(ax1)
plot_diag(ax2)

ax1.set_xlabel("y")
ax1.set_ylabel("y_pred")
ax2.set_xlabel("b")
ax2.set_ylabel("b_pred")

plt.tight_layout()
plt.show()

Compare ELBO with evidence

To check whether the ELBOs are correct, I compare the ELBO with the marginal log likelihood $p\left(\mathbf{y} \mid s^2, s_b^2\right)$ (also called the evidence), calculated at every step, for the last 20 iterations. The ELBO is shown with the colored points while the evidence is the black dotted line.
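
Recall that the ELBO is a lower bound on the log evidence when both are evaluated at the same $(s^2, s_b^2)$, with the gap equal to the KL divergence between the variational approximation and the true posterior. Assuming mll_path and elbo_path (the attributes plotted below) are computed at the same hyperparameter values, the gap at the final step should therefore be non-negative:

# Evidence minus ELBO at the last iteration; a negative value would indicate a bug.
for mprior in priors:
    mres = ebmr_ridge[mprior]
    gap = mres.mll_path[-1] - mres.elbo_path[-1]
    print(f"{mprior}: evidence - ELBO at the last step = {gap:.4f}")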

fig = plt.figure(figsize = (18, 6))
ax = [None for mprior in priors]
nstep = 20

for i, mprior in enumerate(priors):
    ax[i] = fig.add_subplot(1, 3, i+1)
    mres = ebmr_ridge[mprior]
    xvals = np.arange(mres.n_iter+1)[-nstep:]
    #ax[i].scatter(mres.elbo_path[2:], mres.mll_path[2:], color = mcolors[mprior], s = 20)
    ax[i].plot(xvals, mres.mll_path[-nstep:], color = 'black', ls='dotted', label = "Evidence")
    ax[i].scatter(xvals, mres.elbo_path[-nstep:], color = mcolors[mprior], s=50)
    #ax[i].plot(xvals, mres.elbo_path[-nstep:], color = mcolors[mprior], lw=1, label = "ELBO")
    ax[i].text(0.7, 0.2, mlabels[mprior], transform=ax[i].transAxes)
    ax[i].set_xlabel("Iteration")
    ax[i].set_ylabel("ELBO / Evidence")

plt.tight_layout()
plt.show()