About

In the quasi-Laplace approximation, we apply the Laplace approximation to a regularized likelihood $\mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta})$ defined as the product of the likelihood and a Gaussian regularizer, \begin{equation*} \mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta}) \triangleq p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right) \propto \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right) \end{equation*} such that the mode of the regularized likelihood is near the mode of the posterior.

  • Likelihood: $p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right)$
  • Regularized likelihood: $p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right)$
  • Prior: $p\left(\boldsymbol{\beta}\mid g \right)$
  • Posterior (up to normalization): $p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) p\left(\boldsymbol{\beta}\mid g \right)$
  • Marginal likelihood: $p \left( \mathbf{y} \mid \mathbf{X} \right) = \int p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) p\left(\boldsymbol{\beta}\mid g \right) d\boldsymbol{\beta}$

Here, $g$ is a mixture of Gaussians with known variances but unknown mixture coefficients. The precision matrix $\mathbf{\Lambda}$ is defined as a diagonal matrix, $\mathbf{\Lambda} \triangleq \mathrm{diag}\left(\boldsymbol{\lambda}\right)$, whose elements $\lambda_j$ are set to rough estimates of the effective prior precision of each coefficient, so that the regularized likelihood is centered near the mode of the posterior.

The quasi-Laplace approximation is helpful when the marginal likelihood (and therefore the posterior) is not analytically tractable, for example in logistic regression.
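To make the Laplace step concrete for the logistic model used below, here is a brief sketch of the standard algebra (it mirrors the helper functions defined later in this notebook): the parameters $\mathbf{m}$ and $\mathbf{S}$ are the mode of the regularized log likelihood and the inverse of its negative Hessian at the mode, \begin{align*} \mathbf{m} &= \arg\max_{\boldsymbol{\beta}} \left[ \sum_{n} \left( y_n \mathbf{x}_n^{\mathsf{T}} \boldsymbol{\beta} - \log\left(1 + e^{\mathbf{x}_n^{\mathsf{T}} \boldsymbol{\beta}}\right) \right) - \tfrac{1}{2} \boldsymbol{\beta}^{\mathsf{T}} \mathbf{\Lambda} \boldsymbol{\beta} \right], \\ \mathbf{S}^{-1} &= \mathbf{X}^{\mathsf{T}} \mathbf{W} \mathbf{X} + \mathbf{\Lambda}, \qquad \mathbf{W} \triangleq \mathrm{diag}\left( \sigma(\mathbf{x}_n^{\mathsf{T}} \mathbf{m}) \left( 1 - \sigma(\mathbf{x}_n^{\mathsf{T}} \mathbf{m}) \right) \right), \end{align*} where $\sigma(\cdot)$ is the logistic function and $\mathbf{x}_n$ is the $n$-th row of $\mathbf{X}$.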

#collapse-hide

import numpy as np
np.set_printoptions(precision = 4, suppress=True)

from scipy import optimize
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import ticker as plticker
from mpl_toolkits.axes_grid1 import make_axes_locatable

import sys
sys.path.append("../../utils/")
import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'kelly', dpi = 72)

Generate toy data

Let us consider a logistic model with sparse coefficients, so that the number of causal variables is much smaller than the number of variables nvar in the model. This is ensured by sampling the betas from a Gaussian mixture prior with variances $\sigma_k^2$ (sigmak2), where the number of components is given by nGcomp and the mixture coefficients by probk. The sparsity is controlled by the variable sparsity, which specifies the mixture coefficient of the component $\mathcal{N}(0, 0)$ (i.e. the delta function at zero).

nsample = 20
nvar = 30
nGcomp = 3
sparsity = 0.8
prior_strength = 5
num_inf = 1e4 # a large number for 1/sigma_k^2 when sigma_k^2 = 0

probk = np.zeros(nGcomp)
probk[0] = sparsity
probk[1:(nGcomp - 1)] = (1 - sparsity) / (nGcomp - 1)
probk[nGcomp - 1] = 1 - np.sum(probk)
sigmak2 = np.array([prior_strength * np.square(np.power(2, (i)/nGcomp) - 1) for i in range(nGcomp)])

The data is generated from Bernoulli trials (equivalent to a binomial distribution with $n=1$) with success probability $p=\sigma(\mathbf{X} \boldsymbol{\beta})$, where $\sigma(\cdot)$ is the logistic function. $\mathbf{X}$ is centered and scaled such that each variable $j$ has variance $\mathrm{var}(\mathbf{x}_j) = 1$.

# collapse-hide

def standardize(X):
    Xnorm = (X - np.mean(X, axis = 0)) 
    Xstd =  Xnorm / np.std(Xnorm, axis = 0)
    return Xstd

def logistic_data(X, beta):
    Xbeta = np.dot(X, beta)
    pred = 1 / (1 + np.exp(-Xbeta))
    Y = np.random.binomial(1, pred)
    return Y


X = np.random.rand(nsample * nvar).reshape(nsample, nvar)
X = standardize(X)

gammajk = np.random.multinomial(1, probk, size = nvar)
beta = np.zeros(nvar)
for j in range(nvar):
    if gammajk[j, 0] != 1:
        kidx = np.where(gammajk[j, :] == 1)[0][0]
        kstd = np.sqrt(sigmak2[kidx])
        beta[j] = np.random.normal(loc = 0., scale = kstd)

ncausal = beta[beta != 0].shape[0]
betavar = np.var(beta[beta != 0])

Y = logistic_data(X, beta)

Let us have a look at the generated data.

# collapse-hide

print(f'There are {ncausal} non-zero coefficients with variance {betavar:.4f}')

fig = plt.figure(figsize = (6,6))
ax1 = fig.add_subplot(111)

Xbeta = np.dot(X, beta)
pred = 1 / (1 + np.exp(-Xbeta))
ax1.scatter(Xbeta, Y, s = 10)
ax1.scatter(Xbeta, pred, s = 10)

ax1.set_xlabel(r'$\sum_i X_{ni} \beta_i$')
ax1.set_ylabel(r'$Y_n$')
plt.show()
There are 5 non-zero coefficients with variance 0.5920

True posterior vs quasi-Laplace posterior

We select the two causal variables with the largest effect sizes and fix all the others at their true values, to understand how the likelihood and the posterior depend on these two chosen variables. To avoid the sum over the indicator variables, we use the joint prior $p\left(\boldsymbol{\beta}, \boldsymbol{\gamma} \mid g\right)$.
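For reference, the joint prior used here factorizes over variables and mixture components; with $\pi_k$ the mixture weights (probk), $\sigma_k^2$ the component variances (sigmak2) and $\gamma_{jk}$ the one-hot component indicators (gammajk), it reads \begin{equation*} p\left(\boldsymbol{\beta}, \boldsymbol{\gamma} \mid g\right) = \prod_{j} \prod_{k} \left[ \pi_k \, \mathcal{N}\left( \beta_j \mid 0, \sigma_k^2 \right) \right]^{\gamma_{jk}}, \end{equation*} with the convention that the $k=0$ spike component contributes only its weight $\pi_0$ (and forces $\beta_j = 0$), which is how get_log_prior below evaluates it.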

Some useful function definitions:

# collapse-hide

def get_log_likelihood(Y, X, beta):
    Xbeta = np.dot(X, beta)
    logL = np.sum(Y * Xbeta - np.log(1 + np.exp(Xbeta)))
    return logL

def get_log_prior(beta, gammajk, probk, sigmak2):
    logprior = 0
    for j, b in enumerate(beta):
        k = np.where(gammajk[j, :] == 1)[0][0]
        logprior += np.log(probk[k])
        if k > 0:
            logprior += - 0.5 * (np.log(2 * np.pi) + np.log(sigmak2[k]) + b * b / sigmak2[k]) 
    return logprior

def plot_contours(ax, X, Y, Z, beta, norm, cstep = 10, zlabel = ""):
    zmin = np.min(Z) - 1 * np.std(Z)
    zmax = np.max(Z) + 1 * np.std(Z)
    ind = np.unravel_index(np.argmax(Z, axis=None), Z.shape)

    levels = np.linspace(zmin, zmax, 200)
    clevels = np.linspace(zmin, zmax, 20)
    cmap = cm.YlOrRd_r

    if norm:
        cset1 = ax.contourf(X, Y, Z, levels, norm = norm,
                            cmap=cm.get_cmap(cmap, len(levels) - 1))
    else:
        cset1 = ax.contourf(X, Y, Z, levels,
                            cmap=cm.get_cmap(cmap, len(levels) - 1))
    cset2 = ax.contour(X, Y, Z, clevels, colors='k')
    for c in cset2.collections:
        c.set_linestyle('solid')


    ax.set_aspect("equal")
    ax.scatter(beta[0], beta[1], color = 'blue', s = 100)
    ax.scatter(X[ind[1]], Y[ind[0]], color = 'k', s = 100)

    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.2)
    cbar = plt.colorbar(cset1, cax=cax)
    ytickpos = np.arange(int(zmin / cstep) * cstep, zmax, cstep)
    cbar.set_ticks(ytickpos)
    if zlabel:
        cax.set_ylabel(zlabel)

    #loc = plticker.AutoLocator()
    #ax.xaxis.set_major_locator(loc)
    #ax.yaxis.set_major_locator(loc)
    
def log_likelihood(beta, X, Y, L):
    nvar = beta.shape[0]
    Xbeta = np.dot(X, beta)

    ## Function: negative of the regularized log likelihood (up to constants)
    llb = np.sum(Y * Xbeta - np.log(1 + np.exp(Xbeta)))
    #reg = 0.5 * np.sum(np.log(L)) - 0.5 * np.einsum('i,i', np.square(beta), L)
    reg = - 0.5 * np.einsum('i,i', np.square(beta), L)
    loglik = llb + reg

    ## Gradient
    pred = 1 / (1 + np.exp(-Xbeta))
    der = np.einsum('i,ij->j', (Y - pred), X) - np.multiply(beta, L)

    return -loglik, -der

def precisionLL(X, beta, L):
    nvar = X.shape[1]
    Xbeta = np.einsum('i,ji->j', beta, X)
    pred = 1 / (1 + np.exp(-Xbeta))
    hess = - np.einsum('i,i,ij,ik->jk', pred, (1 - pred), X, X)
    hess[np.diag_indices(nvar)] -= L
    return -hess

def get_mS(X, Y, beta0, L):
    nvar = X.shape[1]
    args  = X, Y, L

    gmode = optimize.minimize(log_likelihood,
                              beta0,
                              args=args,
                              method='L-BFGS-B',
                              jac=True,
                              bounds=None,
                              options={'maxiter': 20000000,
                                       'maxfun': 20000000,
                                       'ftol': 1e-9,
                                       'gtol': 1e-9
                                       #'disp': True
                                      })
    
    M = gmode.x
    Sinv = precisionLL(X, M, L)
    return M, Sinv

def get_qL_log_posterior(beta, L, M, Sinv, logdetSinv, logprior):
    blessM = beta - M
    bMSbM = np.dot(blessM.T, np.dot(Sinv, blessM))
    bLb = np.einsum('i, i', np.square(beta), L)
    logdetLinv = - np.sum(np.log(L))
    logposterior = 0.5 * (logdetSinv + logdetLinv - bMSbM + bLb)
    logposterior += logprior
    return logposterior

We calculate the likelihood, the prior, the true posterior and the quasi-Laplace posterior. Note that the posteriors are not normalized. We apply the quasi-Laplace (QL) approximation with some $\mathbf{\Lambda}$ and show the QL posterior distribution, which is given (up to a multiplicative constant) by \begin{equation*} p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) p\left(\boldsymbol{\beta}\mid g \right) \approx \frac{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right) p\left(\boldsymbol{\beta}\mid g \right) }{ \mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right) } \end{equation*} Here, we assume that $\mathbf{\Lambda}$ is known. In reality, we will not know $\mathbf{\Lambda}$ and will have to learn it from the data or make an educated guess based on the choice of prior.
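This expression follows in one step from the definition of the regularized likelihood: dividing its Gaussian approximation by the regularizer recovers the likelihood, \begin{equation*} p\left(\mathbf{y} \mid \mathbf{X}, \boldsymbol{\beta}\right) = \frac{\mathscr{L}_{\mathrm{reg}}(\boldsymbol{\beta})}{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right)} \approx \frac{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{m}, \mathbf{S} \right)}{\mathcal{N}\left( \boldsymbol{\beta} \mid \mathbf{0}, \mathbf{\Lambda}^{-1} \right)} \end{equation*} up to a constant independent of $\boldsymbol{\beta}$; multiplying by the prior $p\left(\boldsymbol{\beta}\mid g \right)$ then gives the unnormalized QL posterior, which is what get_qL_log_posterior computes in log space.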

# collapse-hide

bchoose = np.argsort(abs(beta))[-2:]
nplotx = 20
nploty = 20

b1min = -5
b1max = 5
b2min = -5
b2max = 5
beta1 = np.linspace(b1min, b1max, nplotx)
beta2 = np.linspace(b2min, b2max, nploty)
logL  = np.zeros((nploty, nplotx))
logPr = np.zeros((nploty, nplotx))
logPs = np.zeros((nploty, nplotx))
logQL = np.zeros((nploty, nplotx))

thisbeta = beta.copy()
mask = np.ones(nvar, bool)
mask[bchoose] = False

true_pi = np.sum(gammajk, axis = 0) / np.sum(gammajk)
#reg = 1 / np.einsum('i,i', true_pi, sigmak2)
#regL = np.repeat(reg, nvar)
regL = np.repeat(num_inf, nvar)
for j, b in enumerate(beta):
    k = np.where(gammajk[j, :] == 1)[0][0]
    if k > 0:
        regL[j] = 1 / sigmak2[k]
M, Sinv = get_mS(X, Y, beta, regL)
sgndetSinv, logdetSinv = np.linalg.slogdet(Sinv)

for i, b1 in enumerate(beta1):
    for j, b2 in enumerate(beta2):
        thisbeta[bchoose] = np.array([b1, b2])
        logL[j, i] = get_log_likelihood(Y, X, thisbeta)
        logPr[j, i] = get_log_prior(thisbeta, gammajk, probk, sigmak2)
        logQL[j, i] = get_qL_log_posterior(thisbeta, regL, M, Sinv, logdetSinv, logPr[j, i])
logPs = logL + logPr

Here, we plot the contour maps. The x- and y-axes show the two coefficients $\beta_1$ and $\beta_2$ that we chose to vary. The blue dot marks the true values of $\{\beta_1, \beta_2\}$, and the black dot marks the maximum of each log probability surface. Note how the non-Gaussian true posterior is approximated by a Gaussian QL posterior.

fig = plt.figure(figsize = (12, 8))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

norm = cm.colors.Normalize(vmin=np.min(logPs), vmax=np.max(logPs))

plot_contours(ax1, beta1, beta2, logL,  beta[bchoose], None, cstep = 50, zlabel = "Log Likelihood")
plot_contours(ax2, beta1, beta2, logPr, beta[bchoose], None, cstep = 20,  zlabel = "Log Prior")
plot_contours(ax3, beta1, beta2, logPs, beta[bchoose], None, cstep = 50, zlabel = "Log Posterior")
plot_contours(ax4, beta1, beta2, logQL,  beta[bchoose], None, cstep = 40, zlabel = "Log QL Posterior")

plt.tight_layout()
plt.show()

Effect of the regularizer

Let us now assume that we do not know $\mathbf{\Lambda}$ and that all $\lambda_j$ are equal. Here we look at how the QL posterior changes as a function of $\beta_2$ for different values of $\lambda_j$.
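One simple way to make the educated guess mentioned earlier is to set every $\lambda_j$ to the reciprocal of the average prior variance $\sum_k \pi_k \sigma_k^2$. The lines below are only a sketch of that idea (following, in spirit, the commented-out lines in an earlier cell); the names avg_prior_var, shared_lambda and sharedL are introduced here purely for illustration and are not used in the experiment that follows.

avg_prior_var = np.einsum('i,i', probk, sigmak2)  # sum_k pi_k * sigma_k^2
shared_lambda = 1 / avg_prior_var                 # one shared lambda_j for every variable
sharedL = np.repeat(shared_lambda, nvar)

Instead of committing to such a guess, the experiment below scans a range of equal $\lambda_j$ values and compares the resulting QL posteriors with the true posterior.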

Here I define a function that calculates the QL posterior and the true posterior as the coefficient of a single variable is varied, with all other coefficients held fixed:

# collapse-hide

def get_logQL_logPs_single_variable(X, Y, beta, regvar, betavar, bidx,
                                    regL, gammajk, probk, sigmak2):

    nvar    = beta.shape[0]
    nplotrv = regvar.shape[0]
    nplotx  = betavar.shape[0]
    logL  = np.zeros(nplotx)
    logPr = np.zeros(nplotx)
    logPs = np.zeros(nplotx)
    logQL = np.zeros((nplotrv, nplotx))

    thisbeta = beta.copy()
    thisL = regL.copy()

    for j, b2 in enumerate(betavar):
        thisbeta[bidx] = b2
        logL[j]  = get_log_likelihood(Y, X, thisbeta)
        logPr[j] = get_log_prior(thisbeta, gammajk, probk, sigmak2)
    logPs = logL + logPr

    for i, r1 in enumerate(regvar):
        thisL = np.repeat(r1, nvar)
        #thisL[bidx] = r1
        M, Sinv = get_mS(X, Y, beta, thisL)
        sgndetSinv, logdetSinv = np.linalg.slogdet(Sinv)
        for j, b2 in enumerate(betavar):
            thisbeta[bidx] = b2
            logQL[i, j] = get_qL_log_posterior(thisbeta, thisL, M, Sinv, logdetSinv, logPr[j])

    return logPs, logQL

And then look at the posteriors for $\beta_2$.

#collapse-hide

nplotx_rv = 200
nplotrv   = 4
bmin = -10
bmax = 15
rvmin  = 1
rvmax  = 10
bidx = bchoose[1]

fig = plt.figure(figsize = (8, 8))
ax1 = fig.add_subplot(111)

betavals = np.linspace(bmin, bmax, nplotx_rv)
regvals  = np.linspace(rvmin, rvmax, nplotrv)
logPs_rv, logQL_rv = get_logQL_logPs_single_variable(X, Y, beta, regvals, betavals, bidx,
                                                     regL, gammajk, probk, sigmak2)

ax1.plot(betavals, logPs_rv, lw = 3, zorder = 2, label = 'True Posterior')
ax1.scatter(betavals[np.argmax(logPs_rv)], logPs_rv[np.argmax(logPs_rv)], s = 40, zorder = 10, color = 'black')
for i, r1 in enumerate(regvals):
    ax1.plot(betavals, logQL_rv[i, :], lw = 2, zorder = 5, label = rf'$\lambda =${r1:.2f}')
    ix = np.argmax(logQL_rv[i, :])
    ax1.scatter(betavals[ix], logQL_rv[i, ix], s = 40, zorder = 10, color = 'black')
ax1.axvline(beta[bidx], ls = 'dotted', zorder = 1)

ax1.legend(handlelength = 1.5)
ax1.set_xlabel(r'$\beta$')
ax1.set_ylabel('Log Posterior')
plt.tight_layout()
plt.show()

What happens to the QL posterior for the other variables? Let us look at each $\beta_j$ and the value at which its log posterior is maximized, while all the other coefficients are kept fixed at their true values. Here, I have arranged the $\beta_j$ in ascending order for easier viewing.

#collapse-hide

bidx_sorted = np.argsort(beta)
bidx_sorted_nz = bidx_sorted #bidx_sorted[beta[bidx_sorted]!=0]


maxQL = np.zeros((nplotrv, len(bidx_sorted_nz)))
maxPs = np.zeros(len(bidx_sorted_nz))
for i, bidx in enumerate(bidx_sorted_nz):
    _logPs, _logQL = get_logQL_logPs_single_variable(X, Y, beta, regvals, betavals, bidx,
                                                         regL, gammajk, probk, sigmak2)
    maxPs[i] = betavals[np.argmax(_logPs)]
    for j in range(len(regvals)):
        maxQL[j, i] = betavals[np.argmax(_logQL[j, :])]
        
fig = plt.figure(figsize = (16, 8))
ax1 = fig.add_subplot(111)

xvals = np.arange(len(bidx_sorted_nz))

ax1.plot(xvals, maxPs, lw = 2, zorder = 2, label = 'True Posterior')
for i, r1 in enumerate(regvals):
    ax1.plot(xvals, maxQL[i, :], lw = 2, zorder = 5, label = rf'$\lambda =${r1:.2f}')
    
ax1.scatter(xvals, maxPs, s = 20, zorder = 1)
for i, r1 in enumerate(regvals):
    ax1.scatter(xvals, maxQL[i, :], s = 20, zorder = 1)
ax1.scatter(xvals, beta[bidx_sorted_nz], s = 80, zorder = 10, color = 'maroon', label = 'Input')

ax1.legend(handlelength = 1.5)
ax1.set_xlabel(r'Index of $\beta$')
ax1.set_ylabel(r'$\beta$ at maximum log posterior')
ax1.set_xticks(xvals)
ax1.set_xticklabels([f'{idx}' for idx in bidx_sorted_nz])

plt.tight_layout()
plt.show()