Comparing Quasi-Laplace approximation with true posterior
About
In the quasi-Laplace approximation, we apply the Laplace approximation to a regularized likelihood $\mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta})$ defined as the product of the likelihood and a Gaussian regularizer, \begin{equation*} \mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta}) \triangleq p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right) \propto \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right) \end{equation*} such that the mode of the regularized likelihood is near the mode of the posterior.
- Likelihood: $p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right)$
- Regularized likelihood: $p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right)$
- Prior: $p\left(\boldsymbol{\beta}\mid g \right)$
- Posterior: $p\left(\boldsymbol{\beta} \mid \mathbf{y}, \mathbf{X}, g\right) \propto p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) p\left(\boldsymbol{\beta}\mid g \right)$
- Marginal likelihood: $p \left( \mathbf{y} \mid \mathbf{X} \right) = \int p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) p\left(\boldsymbol{\beta}\mid g \right) d\boldsymbol{\beta}$
Here, $g$ denotes a mixture of Gaussians with known component variances but unknown mixture coefficients. The precision matrix $\mathbf{\Lambda}$ is diagonal, $\mathbf{\Lambda} \triangleq \mathrm{diag}\left(\boldsymbol{\lambda}\right)$, and its elements $\lambda_j$ are set roughly to the expected prior precision of $\beta_j$, so that the regularized likelihood is centered near the mode of the posterior.
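Concretely, following the standard Laplace recipe, $\mathbf{m}$ and $\mathbf{S}$ are obtained from the mode of the regularized likelihood and from its curvature at that mode, \begin{equation*} \mathbf{m} = \arg\max_{\boldsymbol{\beta}} \log \mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta}), \qquad \mathbf{S}^{-1} = - \nabla \nabla \log \mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta}) \Big\rvert_{\boldsymbol{\beta} = \mathbf{m}}, \end{equation*} which is what the helper function `get_mS` computes numerically below.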
The quasi-Laplace approximation is helpful in cases where the marginal likelihood (and therefore, the posterior) is not analytically tractable, for example in logistic regression.
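In that case, substituting $\mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta}) \propto \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right)$ into the definitions above gives, up to the normalization constant of the Laplace step, \begin{equation*} p \left( \mathbf{y} \mid \mathbf{X} \right) \propto \int \frac{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right)}{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right)}\, p\left(\boldsymbol{\beta}\mid g \right) d\boldsymbol{\beta}, \end{equation*} so the integrand involves only Gaussian densities and the mixture prior.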
#collapse-hide
import numpy as np
np.set_printoptions(precision = 4, suppress=True)
from scipy import optimize
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import ticker as plticker
from mpl_toolkits.axes_grid1 import make_axes_locatable
import sys
sys.path.append("../../utils/")
import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'kelly', dpi = 72)
Generate toy data
Let us consider a logistic model with sparse coefficients, so that the number of causal variables is much smaller than the number of variables `nvar` in the model. This is ensured by sampling the betas from a Gaussian mixture prior with variances $\sigma_k^2$ (`sigmak2`), where the number of components is given by `nGcomp` and the mixture coefficients by `probk`. The sparsity of `probk` is controlled by the variable `sparsity`, which specifies the mixture coefficient of the component $\mathcal{N}(0, 0)$ (i.e. the delta function).
nsample = 20
nvar = 30
nGcomp = 3
sparsity = 0.8
prior_strength = 5
num_inf = 1e4 # a large number for 1/sigma_k^2 when sigma_k^2 = 0
probk = np.zeros(nGcomp)
probk[0] = sparsity
probk[1:(nGcomp - 1)] = (1 - sparsity) / (nGcomp - 1)
probk[nGcomp - 1] = 1 - np.sum(probk)
sigmak2 = np.array([prior_strength * np.square(np.power(2, (i)/nGcomp) - 1) for i in range(nGcomp)])
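As a quick aside (not part of the original analysis), the expected prior variance of a coefficient under this mixture is $\sum_k \pi_k \sigma_k^2$, and its inverse is one natural uniform guess for the regularizer $\lambda_j$ discussed later (the same idea appears as a commented-out alternative for `regL` below):
# Illustrative check of the mixture prior (assumes a uniform lambda set from the average prior precision)
expected_var = np.dot(probk, sigmak2)  # E[beta_j^2] under the mixture prior
print(f'Expected prior variance of beta: {expected_var:.4f}')
print(f'Average prior precision (a rough uniform guess for lambda_j): {1 / expected_var:.4f}')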
The data is generated from Bernoulli trials (equivalent to a binomial distribution with $n=1$ and $p=\sigma(\mathbf{X} \boldsymbol{\beta})$), where $\sigma(\cdot)$ is the logistic function. $\mathbf{X}$ is centered and scaled such that for each variable $j$, the variance is $\mathrm{var}(\mathbf{x}_j) = 1$.
# collapse-hide
def standardize(X):
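    # Center each column of X and scale it to unit variance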
Xnorm = (X - np.mean(X, axis = 0))
Xstd = Xnorm / np.std(Xnorm, axis = 0)
return Xstd
def logistic_data(X, beta):
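    # Draw binary responses Y_n ~ Bernoulli(sigmoid(x_n^T beta))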
Xbeta = np.dot(X, beta)
pred = 1 / (1 + np.exp(-Xbeta))
Y = np.random.binomial(1, pred)
return Y
X = np.random.rand(nsample * nvar).reshape(nsample, nvar)
X = standardize(X)
gammajk = np.random.multinomial(1, probk, size = nvar)
beta = np.zeros(nvar)
for j in range(nvar):
if gammajk[j, 0] != 1:
kidx = np.where(gammajk[j, :] == 1)[0][0]
kstd = np.sqrt(sigmak2[kidx])
beta[j] = np.random.normal(loc = 0., scale = kstd)
ncausal = beta[beta != 0].shape[0]
betavar = np.var(beta[beta != 0])
Y = logistic_data(X, beta)
Let us have a look at the generated data.
# collapse-hide
print(f'There are {ncausal} non-zero coefficients with variance {betavar:.4f}')
fig = plt.figure(figsize = (6,6))
ax1 = fig.add_subplot(111)
Xbeta = np.dot(X, beta)
pred = 1 / (1 + np.exp(-Xbeta))
ax1.scatter(Xbeta, Y, s = 10)
ax1.scatter(Xbeta, pred, s = 10)
ax1.set_xlabel(r'$\sum_i X_{ni} \beta_i$')
ax1.set_ylabel(r'$Y_n$')
plt.show()
True posterior vs quasi-Laplace posterior
We select the two causal variables with the largest (absolute) effect sizes and fix all the others at their true (input) values to understand how the likelihood and posterior depend on these two chosen variables. To avoid the sum over the indicator variables, we use the joint prior $p\left(\boldsymbol{\beta}, \boldsymbol{\gamma} \mid g\right)$.
Some useful function definitions:
# collapse-hide
def get_log_likelihood(Y, X, beta):
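    # Bernoulli (logistic) log likelihood of Y given X and beta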
Xbeta = np.dot(X, beta)
logL = np.sum(Y * Xbeta - np.log(1 + np.exp(Xbeta)))
return logL
def get_log_prior(beta, gammajk, probk, sigmak2):
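    # Log of the joint prior p(beta, gamma | g); the zero-variance (spike) component contributes only its mixture weight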
logprior = 0
for j, b in enumerate(beta):
k = np.where(gammajk[j, :] == 1)[0][0]
logprior += np.log(probk[k])
if k > 0:
logprior += - 0.5 * (np.log(2 * np.pi) + np.log(sigmak2[k]) + b * b / sigmak2[k])
return logprior
def plot_contours(ax, X, Y, Z, beta, norm, cstep = 10, zlabel = ""):
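    # Filled contour plot of Z over the (X, Y) grid of coefficients; the blue dot marks the true pair, the black dot the grid maximum of Z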
zmin = np.min(Z) - 1 * np.std(Z)
zmax = np.max(Z) + 1 * np.std(Z)
ind = np.unravel_index(np.argmax(Z, axis=None), Z.shape)
levels = np.linspace(zmin, zmax, 200)
clevels = np.linspace(zmin, zmax, 20)
cmap = cm.YlOrRd_r
if norm:
cset1 = ax.contourf(X, Y, Z, levels, norm = norm,
cmap=cm.get_cmap(cmap, len(levels) - 1))
else:
cset1 = ax.contourf(X, Y, Z, levels,
cmap=cm.get_cmap(cmap, len(levels) - 1))
cset2 = ax.contour(X, Y, Z, clevels, colors='k')
for c in cset2.collections:
c.set_linestyle('solid')
ax.set_aspect("equal")
ax.scatter(beta[0], beta[1], color = 'blue', s = 100)
ax.scatter(X[ind[1]], Y[ind[0]], color = 'k', s = 100)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(cset1, cax=cax)
ytickpos = np.arange(int(zmin / cstep) * cstep, zmax, cstep)
cbar.set_ticks(ytickpos)
if zlabel:
cax.set_ylabel(zlabel)
#loc = plticker.AutoLocator()
#ax.xaxis.set_major_locator(loc)
#ax.yaxis.set_major_locator(loc)
def log_likelihood(beta, X, Y, L):
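    # Negative regularized log likelihood and its gradient, as required by scipy.optimize.minimize with jac=True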
nvar = beta.shape[0]
Xbeta = np.dot(X, beta)
    ## Function
    llb = np.sum(Y * Xbeta - np.log(1 + np.exp(Xbeta)))
    #reg = 0.5 * np.sum(np.log(L)) - 0.5 * np.einsum('i,i', np.square(beta), L)
    reg = - 0.5 * np.einsum('i,i', np.square(beta), L)
loglik = llb + reg
## Gradient
pred = 1 / (1 + np.exp(-Xbeta))
der = np.einsum('i,ij->j', (Y - pred), X) - np.multiply(beta, L)
return -loglik, -der
def precisionLL(X, beta, L):
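    # Precision (negative Hessian) of the regularized log likelihood at beta: X^T W X + diag(L), with W = diag(p * (1 - p))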
nvar = X.shape[1]
Xbeta = np.einsum('i,ji->j', beta, X)
pred = 1 / (1 + np.exp(-Xbeta))
hess = - np.einsum('i,i,ij,ik->jk', pred, (1 - pred), X, X)
hess[np.diag_indices(nvar)] -= L
return -hess
def get_mS(X, Y, beta0, L):
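    # Mode M and precision S^{-1} of the Gaussian (Laplace) approximation to the regularized likelihood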
nvar = X.shape[1]
args = X, Y, L
gmode = optimize.minimize(log_likelihood,
beta0,
args=args,
method='L-BFGS-B',
jac=True,
bounds=None,
options={'maxiter': 20000000,
'maxfun': 20000000,
'ftol': 1e-9,
'gtol': 1e-9
#'disp': True
})
M = gmode.x
Sinv = precisionLL(X, M, L)
return M, Sinv
def get_qL_log_posterior(beta, L, M, Sinv, logdetSinv, logprior):
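    # Unnormalized QL log posterior: log N(beta | M, S) - log N(beta | 0, Lambda^{-1}) + log prior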
blessM = beta - M
bMSbM = np.dot(blessM.T, np.dot(Sinv, blessM))
bLb = np.einsum('i, i', np.square(beta), L)
logdetLinv = - np.sum(np.log(L))
logposterior = 0.5 * (logdetSinv + logdetLinv - bMSbM + bLb)
logposterior += logprior
return logposterior
We calculate the likelihood, prior, true posterior and the quasi-Laplace posterior. Note that the posteriors are not normalized. We apply the quasi-Laplace (QL) approximation with some $\mathbf{\Lambda}$ and show the QL posterior distribution, which is given by \begin{equation*} p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) p\left(\boldsymbol{\beta}\mid g \right) \propto \frac{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right) p\left(\boldsymbol{\beta}\mid g \right) }{ \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right) } \end{equation*} Here, we assume that we know $\mathbf{\Lambda}$. In reality, we will not know $\mathbf{\Lambda}$ but will have to learn it from the data or make an educated guess from the choice of prior.
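Taking logs, the unnormalized QL log posterior is, up to an additive constant, \begin{equation*} \tfrac{1}{2}\left( \log \lvert \mathbf{S}^{-1} \rvert + \log \lvert \mathbf{\Lambda}^{-1} \rvert - (\boldsymbol{\beta} - \mathbf{m})^{\mathsf{T}} \mathbf{S}^{-1} (\boldsymbol{\beta} - \mathbf{m}) + \boldsymbol{\beta}^{\mathsf{T}} \mathbf{\Lambda} \boldsymbol{\beta} \right) + \log p\left(\boldsymbol{\beta}\mid g \right), \end{equation*} which is the quantity evaluated in `get_qL_log_posterior`.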
# collapse-hide
bchoose = np.argsort(abs(beta))[-2:]
nplotx = 20
nploty = 20
b1min = -5
b1max = 5
b2min = -5
b2max = 5
beta1 = np.linspace(b1min, b1max, nplotx)
beta2 = np.linspace(b2min, b2max, nploty)
logL = np.zeros((nploty, nplotx))
logPr = np.zeros((nploty, nplotx))
logPs = np.zeros((nploty, nplotx))
logQL = np.zeros((nploty, nplotx))
thisbeta = beta.copy()
mask = np.ones(nvar, bool)
mask[bchoose] = False
true_pi = np.sum(gammajk, axis = 0) / np.sum(gammajk)
#reg = 1 / np.einsum('i,i', true_pi, sigmak2)
#regL = np.repeat(reg, nvar)
regL = np.repeat(num_inf, nvar)
for j, b in enumerate(beta):
k = np.where(gammajk[j, :] == 1)[0][0]
if k > 0:
regL[j] = 1 / sigmak2[k]
M, Sinv = get_mS(X, Y, beta, regL)
sgndetSinv, logdetSinv = np.linalg.slogdet(Sinv)
for i, b1 in enumerate(beta1):
for j, b2 in enumerate(beta2):
thisbeta[bchoose] = np.array([b1, b2])
logL[j, i] = get_log_likelihood(Y, X, thisbeta)
logPr[j, i] = get_log_prior(thisbeta, gammajk, probk, sigmak2)
logQL[j, i] = get_qL_log_posterior(thisbeta, regL, M, Sinv, logdetSinv, logPr[j, i])
logPs = logL + logPr
Here, we plot the contour maps. The x and y-axis show the two coefficients $\beta_1$ and $\beta_2$, which we chose to vary. The blue dot shows the coordinates of the true values of $\{\beta_1, \beta_2\}$ and the black dot shows the maximum of the log probabilities. Note how the non-Gaussian true posterior is now approximated by a Gaussian QL posterior.
fig = plt.figure(figsize = (12, 8))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
norm = cm.colors.Normalize(vmin=np.min(logPs), vmax=np.max(logPs))
plot_contours(ax1, beta1, beta2, logL, beta[bchoose], None, cstep = 50, zlabel = "Log Likelihood")
plot_contours(ax2, beta1, beta2, logPr, beta[bchoose], None, cstep = 20, zlabel = "Log Prior")
plot_contours(ax3, beta1, beta2, logPs, beta[bchoose], None, cstep = 50, zlabel = "Log Posterior")
plot_contours(ax4, beta1, beta2, logQL, beta[bchoose], None, cstep = 40, zlabel = "Log QL Posterior")
plt.tight_layout()
plt.show()
Effect of regularizer
Let us assume that we do not know $\mathbf{\Lambda}$ and all $\lambda_j$'s are equal. Here we look at how the QL posterior changes with varying $\beta_2$ for different values of $\lambda_j$.
Here I define a function that calculates the QL posterior and the true posterior while varying the coefficient of a single variable:
# collapse-hide
def get_logQL_logPs_single_variable(X, Y, beta, regvar, betavar, bidx,
regL, gammajk, probk, sigmak2):
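    # True and QL log posteriors as a function of a single coefficient beta[bidx], for several uniform values of the regularizer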
nvar = beta.shape[0]
nplotrv = regvar.shape[0]
nplotx = betavar.shape[0]
logL = np.zeros(nplotx)
logPr = np.zeros(nplotx)
logPs = np.zeros(nplotx)
logQL = np.zeros((nplotrv, nplotx))
thisbeta = beta.copy()
thisL = regL.copy()
for j, b2 in enumerate(betavar):
thisbeta[bidx] = b2
logL[j] = get_log_likelihood(Y, X, thisbeta)
logPr[j] = get_log_prior(thisbeta, gammajk, probk, sigmak2)
logPs = logL + logPr
for i, r1 in enumerate(regvar):
thisL = np.repeat(r1, nvar)
#thisL[bidx] = r1
M, Sinv = get_mS(X, Y, beta, thisL)
sgndetSinv, logdetSinv = np.linalg.slogdet(Sinv)
for j, b2 in enumerate(betavar):
thisbeta[bidx] = b2
logQL[i, j] = get_qL_log_posterior(thisbeta, thisL, M, Sinv, logdetSinv, logPr[j])
return logPs, logQL
And then look at the posteriors for $\beta_2$.
#collapse-hide
nplotx_rv = 200
nplotrv = 4
bmin = -10
bmax = 15
rvmin = 1
rvmax = 10
bidx = bchoose[1]
fig = plt.figure(figsize = (8, 8))
ax1 = fig.add_subplot(111)
betavals = np.linspace(bmin, bmax, nplotx_rv)
regvals = np.linspace(rvmin, rvmax, nplotrv)
logPs_rv, logQL_rv = get_logQL_logPs_single_variable(X, Y, beta, regvals, betavals, bidx,
regL, gammajk, probk, sigmak2)
ax1.plot(betavals, logPs_rv, lw = 3, zorder = 2, label = 'True Posterior')
ax1.scatter(betavals[np.argmax(logPs_rv)], logPs_rv[np.argmax(logPs_rv)], s = 40, zorder = 10, color = 'black')
for i, r1 in enumerate(regvals):
    ax1.plot(betavals, logQL_rv[i, :], lw = 2, zorder = 5, label = rf'$\lambda =${r1:.2f}')
ix = np.argmax(logQL_rv[i, :])
ax1.scatter(betavals[ix], logQL_rv[i, ix], s = 40, zorder = 10, color = 'black')
ax1.axvline(beta[bidx], ls = 'dotted', zorder = 1)
ax1.legend(handlelength = 1.5)
ax1.set_xlabel(r'$\beta$')
ax1.set_ylabel('Log Posterior')
plt.tight_layout()
plt.show()
What happens to the QL posterior for the other variables? Let us look at every $\beta_j$ and the value at which its log posterior is maximized, while all other coefficients are kept fixed at their true (input) values. Here, I have arranged the $\beta_j$ in ascending order for easier viewing.
#collapse-hide
bidx_sorted = np.argsort(beta)
bidx_sorted_nz = bidx_sorted #bidx_sorted[beta[bidx_sorted]!=0]
maxQL = np.zeros((nplotrv, len(bidx_sorted_nz)))
maxPs = np.zeros(len(bidx_sorted_nz))
for i, bidx in enumerate(bidx_sorted_nz):
_logPs, _logQL = get_logQL_logPs_single_variable(X, Y, beta, regvals, betavals, bidx,
regL, gammajk, probk, sigmak2)
maxPs[i] = betavals[np.argmax(_logPs)]
for j in range(len(regvals)):
maxQL[j, i] = betavals[np.argmax(_logQL[j, :])]
fig = plt.figure(figsize = (16, 8))
ax1 = fig.add_subplot(111)
xvals = np.arange(len(bidx_sorted_nz))
ax1.plot(xvals, maxPs, lw = 2, zorder = 2, label = 'True Posterior')
for i, r1 in enumerate(regvals):
    ax1.plot(xvals, maxQL[i, :], lw = 2, zorder = 5, label = rf'$\lambda =${r1:.2f}')
ax1.scatter(xvals, maxPs, s = 20, zorder = 1)
for i, r1 in enumerate(regvals):
ax1.scatter(xvals, maxQL[i, :], s = 20, zorder = 1)
ax1.scatter(xvals, beta[bidx_sorted_nz], s = 80, zorder = 10, color = 'maroon', label = 'Input')
ax1.legend(handlelength = 1.5)
ax1.set_xlabel(r'Index of $\beta$')
ax1.set_ylabel(r'$\beta$ at maximum log posterior')
ax1.set_xticks(xvals)
ax1.set_xticklabels([f'{idx}' for idx in bidx_sorted_nz])
plt.tight_layout()
plt.show()