Symmetric updates in EBMR with product of normals
About
Here, I check whether the choice of factorization has any effect on the optimization in the variational approximation of EBMR. Earlier, I found that the variational approximation for the product of two normals leads to severe overfitting (see here).
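For reference, the model being fit below (as implemented in the code) is
$\displaystyle \mathbf{y} = \mathbf{X}\mathbf{W}\mathbf{b} + \boldsymbol{\epsilon}$, with $\mathbf{W} = \mathrm{diag}(\mathbf{w})$, $\boldsymbol{\epsilon} \sim N\left(\mathbf{0}, \sigma^2\mathbb{I}_N\right)$, $\mathbf{b} \sim N\left(\mathbf{0}, \sigma_b^2\mathbb{I}_P\right)$ and $\mathbf{w} \sim N\left(\mathbf{0}, \sigma_w^2\mathbb{I}_P\right)$,
so each effect size is a product of two normals, $w_j b_j$, and the variational approximation targets $q(\mathbf{b}, \mathbf{w}) \approx p(\mathbf{b}, \mathbf{w} \mid \mathbf{y})$.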
import numpy as np
import pandas as pd
from scipy import linalg as sc_linalg
import matplotlib.pyplot as plt
import sys
sys.path.append("../../ebmrPy/")
from inference.ebmr import EBMR
from inference import f_elbo
from inference import f_sigma
from inference import penalized_em
from utils import log_density
import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
def standardize(X):
    # Center each column and scale it to unit L2 norm (not unit variance).
    Xnorm = (X - np.mean(X, axis = 0))
    #Xstd = Xnorm / np.std(Xnorm, axis = 0)
    Xstd = Xnorm / np.sqrt((Xnorm * Xnorm).sum(axis = 0))
    return Xstd
def trend_data(n, p, bval = 1.0, sd = 1.0, seed=100):
    # Trend-filtering design: column i is a ramp starting at sample i, so a pair
    # of opposite coefficients on adjacent columns creates a step change in Xb.
    np.random.seed(seed)
    X = np.zeros((n, p))
    for i in range(p):
        X[i:n, i] = np.arange(1, n - i + 1)
    #X = standardize(X)
    btrue = np.zeros(p)
    idx = int(n / 3)
    btrue[idx] = bval
    btrue[idx + 1] = -bval
    y = np.dot(X, btrue) + np.random.normal(0, sd, n)
    # y = y / np.std(y)
    return X, y, btrue
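A quick look at the design for a small case shows the ramp structure of the columns (illustration only; the simulation below uses n = 100 and p = 200):
Xs, ys, bs = trend_data(5, 3, bval = 1.0, sd = 0.0)
print(Xs)
# [[1. 0. 0.]
#  [2. 1. 0.]
#  [3. 2. 1.]
#  [4. 3. 2.]
#  [5. 4. 3.]]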
n = 100
p = 200
bval = 8.0
sd = 2.0
X, y, btrue = trend_data(n, p, bval = bval, sd = sd)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(np.arange(n), np.dot(X, btrue), label = "Xb")
ax1.scatter(np.arange(n), y, edgecolor = 'black', facecolor='white', label = "Xb + e")
ax1.legend()
ax1.set_xlabel("Sample index")
ax1.set_ylabel("y")
plt.show()
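Factorization 1
$\displaystyle q\left(\mathbf{b}, \mathbf{w}\right) = q\left(\mathbf{b}\right) \prod_j q\left(w_j\right) $.
Here $q(\mathbf{b})$ is a multivariate normal and $q(\mathbf{w})$ factorizes over the coordinates of $\mathbf{w}$; this is what the functions grr_b and grr_W below implement.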
def ridge_mll(X, y, s2, sb2, W):
n, p = X.shape
Xscale = np.dot(X, np.diag(W))
XWWtXt = np.dot(Xscale, Xscale.T)
sigmay = s2 * (np.eye(n) + sb2 * XWWtXt)
muy = np.zeros((n, 1))
return log_density.mgauss(y.reshape(-1,1), muy, sigmay)
def grr_b(X, y, s2, sb2, Wbar, varWj, XTX, XTy):
    # Update q(b) = N(mub, sigmab) given the current q(w), whose means are Wbar
    # and coordinate variances are varWj; also update s2 and sb2.
    n, p = X.shape
W = np.diag(Wbar)
WtXtXW = np.linalg.multi_dot([W.T, XTX, W])
VW = np.diag(XTX) * varWj
sigmabinv = (WtXtXW + np.diag(VW) + np.eye(p) * s2 / sb2) / s2
sigmab = np.linalg.inv(sigmabinv)
mub = np.linalg.multi_dot([sigmab, W.T, XTy]) / s2
XWmu = np.linalg.multi_dot([X, W, mub])
mub2 = np.square(mub)
s2 = (np.sum(np.square(y - XWmu)) \
+ np.dot((WtXtXW + np.diag(VW)), sigmab).trace() \
+ np.sum(mub2 * VW)) / n
sb2 = (np.sum(mub2) + sigmab.trace()) / p
return s2, sb2, mub, sigmab
def grr_W_old(X, y, s2, sw2, mub, sigmab, muWj, XTX, XTy):
n, p = X.shape
R = np.einsum('i,j->ij', mub, mub) + sigmab
XTXRjj = np.array([XTX[j, j] * R[j, j] for j in range(p)])
#wXTXRj = np.array([np.sum(muWj * XTX[:, j] * R[:, j]) - (muWj[j] * XTXRjj[j]) for j in range(p)])
sigmaWj2 = 1 / ((XTXRjj / s2) + (1 / sw2))
for j in range(p):
wXTXRj = np.sum(muWj * XTX[:, j] * R[:, j]) - (muWj[j] * XTXRjj[j])
muWj[j] = sigmaWj2[j] * (mub[j] * XTy[j] - 0.5 * wXTXRj) / s2
sw2 = np.sum(np.square(muWj) + sigmaWj2) / p
return sw2, muWj, sigmaWj2
def grr_W(X, y, s2, sw2, mub, sigmab, muWj, XTX, XTy):
    # Mean-field update of the factorized q(w) = prod_j N(muWj[j], sigmaWj2[j])
    # given q(b) = N(mub, sigmab); also update sw2.
    n, p = X.shape
R = np.einsum('i,j->ij', mub, mub) + sigmab
XTXRjj = np.diag(XTX) * np.diag(R)
sigmaWj2inv = (XTXRjj / s2) + (1 / sw2)
wXTXRj = np.array([np.sum(muWj * XTX[:, j] * R[:, j]) - (muWj[j] * XTXRjj[j]) for j in range(p)])
sigmaWj2 = 1 / sigmaWj2inv
muWj = sigmaWj2 * (mub * XTy - wXTXRj) / s2
sw2 = np.sum(np.square(muWj) + sigmaWj2) / p
#sigmaWj2 = np.zeros(p)
return sw2, muWj, sigmaWj2
def elbo(X, y, s2, sb2, sw2, mub, sigmab, Wbar, varWj, XTX):
'''
Wbar is a vector which contains the diagonal elements of the diagonal matrix W
W = diag_matrix(Wbar)
Wbar = diag(W)
--
VW is a vector which contains the diagonal elements of the diagonal matrix V_w
'''
n, p = X.shape
VW = np.diag(XTX) * varWj
elbo = c_func(n, p, s2, sb2, sw2) \
+ h1_func(X, y, s2, sb2, sw2, mub, Wbar, VW) \
+ h2_func(p, s2, sb2, sw2, XTX, Wbar, sigmab, varWj, VW)
return elbo
def c_func(n, p, s2, sb2, sw2):
val = p
val += - 0.5 * n * np.log(2.0 * np.pi * s2)
val += - 0.5 * p * np.log(sb2)
val += - 0.5 * p * np.log(sw2)
return val
def h1_func(X, y, s2, sb2, sw2, mub, Wbar, VW):
XWmu = np.linalg.multi_dot([X, np.diag(Wbar), mub])
val1 = - (0.5 / s2) * np.sum(np.square(y - XWmu))
val2 = - 0.5 * np.sum(np.square(mub) * ((VW / s2) + (1 / sb2)))
val3 = - 0.5 * np.sum(np.square(Wbar)) / sw2
val = val1 + val2 + val3
return val
def h2_func(p, s2, sb2, sw2, XTX, Wbar, sigmab, varWj, VW):
(sign, logdetS) = np.linalg.slogdet(sigmab)
logdetV = np.sum(np.log(varWj))
W = np.diag(Wbar)
WtXtXW = np.linalg.multi_dot([W.T, XTX, W])
val = 0.5 * logdetS + 0.5 * logdetV
val += - 0.5 * np.trace(sigmab) / sb2 - 0.5 * np.sum(varWj) / sw2
val += - 0.5 * np.dot(WtXtXW + np.diag(VW), sigmab).trace() / s2
return val
def ebmr_WB1(X, y,
s2_init = 1.0, sb2_init = 1.0, sw2_init = 1.0,
binit = None, winit = None,
max_iter = 1000, tol = 1e-8
):
XTX = np.dot(X.T, X)
XTy = np.dot(X.T, y)
n_samples, n_features = X.shape
elbo_path = np.zeros(max_iter + 1)
mll_path = np.zeros(max_iter + 1)
'''
Iteration 0
'''
niter = 0
s2 = s2_init
sb2 = sb2_init
sw2 = sw2_init
mub = np.ones(n_features) if binit is None else binit
muWj = np.ones(n_features) if winit is None else winit
sigmab = np.zeros((n_features, n_features))
sigmaWj2 = np.zeros(n_features)
elbo_path[0] = -np.inf
mll_path[0] = -np.inf
for itn in range(1, max_iter + 1):
'''
GRR for b
'''
s2, sb2, mub, sigmab = grr_b(X, y, s2, sb2, muWj, sigmaWj2, XTX, XTy)
'''
GRR for W
'''
sw2, muWj, sigmaWj2 = grr_W(X, y, s2, sw2, mub, sigmab, muWj, XTX, XTy)
'''
Convergence
'''
niter += 1
elbo_path[itn] = elbo(X, y, s2, sb2, sw2, mub, sigmab, muWj, sigmaWj2, XTX)
mll_path[itn] = ridge_mll(X, y, s2, sb2, muWj)
if elbo_path[itn] - elbo_path[itn - 1] < tol: break
#if mll_path[itn] - mll_path[itn - 1] < tol: break
return s2, sb2, sw2, mub, sigmab, muWj, sigmaWj2, niter, elbo_path[:niter + 1], mll_path[:niter + 1]
And this leads to overfitting as we have seen previously.
m1 = ebmr_WB1(X, y)
s2, sb2, sw2, mub, sigmab, W, sigmaW, niter, elbo_path, mll_path = m1
bpred = mub * W
ypred = np.dot(X, bpred)
fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
#ax4 = fig.add_subplot(224)
yvals = np.log(np.max(elbo_path[1:]) - elbo_path[1:] + 1)
ax1.scatter(np.arange(niter), yvals, edgecolor = 'black', facecolor='white')
ax1.plot(np.arange(niter), yvals)
ax1.set_xlabel("Iterations")
ax1.set_ylabel("log (max(ELBO) - ELBO[itn] + 1)")
ax2.scatter(np.arange(n), y, edgecolor = 'black', facecolor='white')
ax2.plot(np.arange(n), ypred, color = 'salmon', label="Predicted")
ax2.plot(np.arange(n), np.dot(X, btrue), color = 'dodgerblue', label="True")
ax2.legend()
ax2.set_xlabel("Sample Index")
ax2.set_ylabel("y")
ax3.scatter(np.arange(p), btrue, edgecolor = 'black', facecolor='white', label="True")
ax3.scatter(np.arange(p), bpred, label="Predicted")
ax3.legend()
ax3.set_xlabel("Predictor Index")
ax3.set_ylabel("wb")
# nstep = min(80, niter - 2)
# ax4.scatter(np.arange(nstep), mll_path[-nstep:], edgecolor = 'black', facecolor='white')
# ax4.plot(np.arange(nstep), elbo_path[-nstep:])
# ax4.set_xlabel("Iterations")
# ax4.set_ylabel("ELBO / Evidence")
plt.tight_layout()
plt.show()
Factorization 2
$\displaystyle q\left(\mathbf{b}, \mathbf{w}\right) = q\left(\mathbf{b}\right) q\left(\mathbf{w}\right) $.
This allows us to use the same update equations for both $\mathbf{b}$ and $\mathbf{w}$.
$q\left(\mathbf{b}\right) = N\left( \mathbf{b} \mid \mathbf{m}, \mathbf{S}\right)$,
$q\left(\mathbf{w}\right) = N\left( \mathbf{w} \mid \mathbf{a}, \mathbf{V}\right)$,
$\displaystyle \mathbf{S} = \left[ \frac{1}{\sigma^2}\left(\mathbf{A}^{\mathsf{T}}\mathbf{X}^{\mathsf{T}}\mathbf{X}\mathbf{A} + \mathbf{\Lambda}_w\right) + \frac{1}{\sigma_b^2}\mathbb{I} \right]^{-1}$,
$\displaystyle \mathbf{m} = \frac{1}{\sigma^2} \mathbf{S}\mathbf{A}^{\mathsf{T}}\mathbf{X}^{\mathsf{T}}\mathbf{y}$,
$\displaystyle \mathbf{V} = \left[ \frac{1}{\sigma^2}\left(\mathbf{M}^{\mathsf{T}}\mathbf{X}^{\mathsf{T}}\mathbf{X}\mathbf{M} + \mathbf{\Lambda}_b\right) + \frac{1}{\sigma_w^2}\mathbb{I} \right]^{-1}$,
$\displaystyle \mathbf{a} = \frac{1}{\sigma^2} \mathbf{V}\mathbf{M}^{\mathsf{T}}\mathbf{X}^{\mathsf{T}}\mathbf{y}$,
$\mathbf{A}$, $\mathbf{M}$, $\mathbf{\Lambda}_w$ and $\mathbf{\Lambda}_b$ are diagonal matrices whose diagonal elements are given by
$\mathbf{A}_{jj} := a_j$,
$(\mathbf{\Lambda}_w)_{jj} := (\mathbf{X}^{\mathsf{T}}\mathbf{X})_{jj}\mathbf{V}_{jj}$,
$\mathbf{M}_{jj} := m_j$,
$(\mathbf{\Lambda}_b)_{jj} := (\mathbf{X}^{\mathsf{T}}\mathbf{X})_{jj}\mathbf{S}_{jj}$.
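A minimal numpy sketch of the $q(\mathbf{b})$ update above (the $q(\mathbf{w})$ update is identical with the roles of $(\mathbf{m}, \mathbf{S}, \mathbf{A}, \mathbf{\Lambda}_w, \sigma_b^2)$ and $(\mathbf{a}, \mathbf{V}, \mathbf{M}, \mathbf{\Lambda}_b, \sigma_w^2)$ swapped); the helper name q_update and its arguments are illustrative, not part of ebmrPy, and it relies on numpy being imported as above:
def q_update(XTX, XTy, s2, s2_prior, mu_other, cov_other):
    # Posterior update for one factor given the other, e.g. q(b) given q(w):
    # mu_other = a, cov_other = V, s2_prior = sigma_b^2.
    p = XTX.shape[0]
    A = np.diag(mu_other)                              # A = diag(a)
    Lam = np.diag(np.diag(XTX) * np.diag(cov_other))   # (Lambda_w)_jj = (X'X)_jj V_jj
    Sinv = (np.linalg.multi_dot([A, XTX, A]) + Lam) / s2 + np.eye(p) / s2_prior
    S = np.linalg.inv(Sinv)                            # posterior covariance S
    m = np.dot(S, np.dot(A, XTy)) / s2                 # posterior mean m
    return m, S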
The updates of $\sigma^2$, $\sigma_b^2$ and $\sigma_w^2$ can be obtained by taking the derivative of $F(q(\mathbf{b}, \mathbf{w}))$ with respect to each of them and setting it to zero. This yields
$\displaystyle \sigma^2 = \frac{1}{N} \left\{ \left\Vert \mathbf{y} - \mathbf{X}\mathbf{A}\mathbf{m} \right\Vert_{2}^{2} + \mathrm{Tr}\left(\left(\mathbf{A}^{\mathsf{T}}\mathbf{X}^{\mathsf{T}}\mathbf{X}\mathbf{A} + \mathbf{\Lambda}_w\right)\mathbf{S} \right) + \mathbf{m}^{\mathsf{T}}\mathbf{\Lambda}_w\mathbf{m}\right\}$,
$\displaystyle \sigma_b^2 = \frac{1}{P} \mathrm{Tr}\left(\mathbf{m}\mathbf{m}^{\mathsf{T}} + \mathbf{S}\right)$ and
$\displaystyle \sigma_w^2 = \frac{1}{P} \mathrm{Tr}\left(\mathbf{a}\mathbf{a}^{\mathsf{T}} + \mathbf{V}\right)$.
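The corresponding hyperparameter updates, written as an illustrative sketch mirroring the expressions above (the helper hyper_update is not part of ebmrPy):
def hyper_update(X, y, m, S, a, V, XTX):
    n, p = X.shape
    A = np.diag(a)
    lam_w = np.diag(XTX) * np.diag(V)                  # diagonal of Lambda_w as a vector
    resid = y - np.linalg.multi_dot([X, A, m])         # y - X A m
    AXXA = np.linalg.multi_dot([A, XTX, A])
    s2 = (np.sum(np.square(resid))
          + np.trace(np.dot(AXXA + np.diag(lam_w), S))
          + np.sum(np.square(m) * lam_w)) / n
    sb2 = (np.sum(np.square(m)) + np.trace(S)) / p     # Tr(m m' + S) / P
    sw2 = (np.sum(np.square(a)) + np.trace(V)) / p     # Tr(a a' + V) / P
    return s2, sb2, sw2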
The iteration should stop when the change in the ELBO $F(q(\mathbf{b}, \mathbf{w}))$ falls below a tolerance value (tol). The ELBO can be calculated as
$\displaystyle F(q) = P - \frac{N}{2}\ln(2\pi\sigma^2) - \frac{P}{2}\ln(\sigma_b^2) - \frac{P}{2}\ln(\sigma_w^2) - \frac{1}{2\sigma^2}\left\{\left\Vert \mathbf{y} - \mathbf{X}\mathbf{A}\mathbf{m} \right\Vert_2^2 + \mathrm{Tr}\left(\left(\mathbf{A}^{\mathsf{T}}\mathbf{X}^{\mathsf{T}}\mathbf{X}\mathbf{A} + \mathbf{\Lambda}_w\right)\mathbf{S} \right) + \mathbf{m}^{\mathsf{T}}\mathbf{\Lambda}_w\mathbf{m} \right\} - \frac{1}{2\sigma_b^2}\mathrm{Tr}\left(\mathbf{m}\mathbf{m}^{\mathsf{T}} + \mathbf{S}\right) - \frac{1}{2\sigma_w^2}\mathrm{Tr}\left(\mathbf{a}\mathbf{a}^{\mathsf{T}} + \mathbf{V}\right) + \frac{1}{2}\ln\left\lvert\mathbf{S}\right\rvert + \frac{1}{2}\ln\left\lvert\mathbf{V}\right\rvert$
def ridge_mll(X, y, s2, sb2, W):
n, p = X.shape
Xscale = np.dot(X, np.diag(W))
XWWtXt = np.dot(Xscale, Xscale.T)
sigmay = s2 * (np.eye(n) + sb2 * XWWtXt)
muy = np.zeros((n, 1))
return log_density.mgauss(y.reshape(-1,1), muy, sigmay)
def grr_step(X, y, s2, sb2, muW, varW, XTX, XTy, useVW=True):
    # One generalized-ridge update for q(b) given q(w) (or, by symmetry, for
    # q(w) given q(b)); muW and varW are the mean vector and covariance matrix
    # of the factor that is held fixed.
    n, p = X.shape
W = np.diag(muW)
WtXtXW = np.linalg.multi_dot([W.T, XTX, W])
VW = np.diag(XTX) * np.diag(varW) if useVW else np.zeros(p)
sigmabinv = (WtXtXW + np.diag(VW) + np.eye(p) * s2 / sb2) / s2
sigmab = np.linalg.inv(sigmabinv)
mub = np.linalg.multi_dot([sigmab, W.T, XTy]) / s2
XWmu = np.linalg.multi_dot([X, W, mub])
mub2 = np.square(mub)
s2 = (np.sum(np.square(y - XWmu)) \
+ np.dot((WtXtXW + np.diag(VW)), sigmab).trace() \
+ np.sum(mub2 * VW)) / n
sb2 = (np.sum(mub2) + sigmab.trace()) / p
return s2, sb2, mub, sigmab
def elbo(X, y, s2, sb2, sw2, mub, sigmab, Wbar, varW, XTX, useVW=True):
'''
Wbar is a vector which contains the diagonal elements of the diagonal matrix W
W = diag_matrix(Wbar)
Wbar = diag(W)
--
VW is a vector which contains the diagonal elements of the diagonal matrix V_w
'''
n, p = X.shape
VW = np.diag(XTX) * np.diag(varW) if useVW else np.zeros(p)
elbo = c_func(n, p, s2, sb2, sw2) \
+ h1_func(X, y, s2, sb2, sw2, mub, Wbar, VW) \
+ h2_func(p, s2, sb2, sw2, XTX, Wbar, sigmab, varW, VW)
return elbo
def c_func(n, p, s2, sb2, sw2):
val = p
val += - 0.5 * n * np.log(2.0 * np.pi * s2)
val += - 0.5 * p * np.log(sb2)
val += - 0.5 * p * np.log(sw2)
return val
def h1_func(X, y, s2, sb2, sw2, mub, Wbar, VW):
XWmu = np.linalg.multi_dot([X, np.diag(Wbar), mub])
val1 = - (0.5 / s2) * np.sum(np.square(y - XWmu))
val2 = - 0.5 * np.sum(np.square(mub) * ((VW / s2) + (1 / sb2)))
val3 = - 0.5 * np.sum(np.square(Wbar)) / sw2
val = val1 + val2 + val3
return val
def h2_func(p, s2, sb2, sw2, XTX, Wbar, sigmab, sigmaw, VW):
(sign, logdetS) = np.linalg.slogdet(sigmab)
(sign, logdetV) = np.linalg.slogdet(sigmaw)
W = np.diag(Wbar)
WtXtXW = np.linalg.multi_dot([W.T, XTX, W])
val = 0.5 * logdetS + 0.5 * logdetV
val += - 0.5 * np.trace(sigmab) / sb2 - 0.5 * np.trace(sigmaw) / sw2
val += - 0.5 * np.dot(WtXtXW + np.diag(VW), sigmab).trace() / s2
return val
def ebmr_WB2(X, y,
s2_init = 1.0, sb2_init = 1.0, sw2_init = 1.0,
binit = None, winit = None,
use_wb_variance=True,
max_iter = 1000, tol = 1e-8
):
XTX = np.dot(X.T, X)
XTy = np.dot(X.T, y)
n_samples, n_features = X.shape
elbo_path = np.zeros(max_iter + 1)
mll_path = np.zeros(max_iter + 1)
'''
Iteration 0
'''
niter = 0
s2 = s2_init
sb2 = sb2_init
sw2 = sw2_init
mub = np.ones(n_features) if binit is None else binit
muw = np.ones(n_features) if winit is None else winit
sigmab = np.zeros((n_features, n_features))
sigmaw = np.zeros((n_features, n_features))
elbo_path[0] = -np.inf
mll_path[0] = -np.inf
for itn in range(1, max_iter + 1):
'''
GRR for b
'''
s2, sb2, mub, sigmab = grr_step(X, y, s2, sb2, muw, sigmaw, XTX, XTy, useVW=use_wb_variance)
'''
GRR for W
'''
s2, sw2, muw, sigmaw = grr_step(X, y, s2, sw2, mub, sigmab, XTX, XTy, useVW=use_wb_variance)
'''
Convergence
'''
niter += 1
elbo_path[itn] = elbo(X, y, s2, sb2, sw2, mub, sigmab, muw, sigmaw, XTX, useVW=use_wb_variance)
mll_path[itn] = ridge_mll(X, y, s2, sb2, muw)
if elbo_path[itn] - elbo_path[itn - 1] < tol: break
#if mll_path[itn] - mll_path[itn - 1] < tol: break
return s2, sb2, sw2, mub, sigmab, muw, sigmaw, niter, elbo_path[:niter + 1], mll_path[:niter + 1]
However, there is still overfitting.
m2 = ebmr_WB2(X, y)
s2, sb2, sw2, mub, sigmab, W, sigmaW, niter, elbo_path, mll_path = m2
bpred = mub * W
ypred = np.dot(X, bpred)
fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
#ax4 = fig.add_subplot(224)
yvals = np.log(np.max(elbo_path[1:]) - elbo_path[1:] + 1)
ax1.scatter(np.arange(niter), yvals, edgecolor = 'black', facecolor='white')
ax1.plot(np.arange(niter), yvals)
ax1.set_xlabel("Iterations")
ax1.set_ylabel("log (max(ELBO) - ELBO[itn] + 1)")
ax2.scatter(np.arange(n), y, edgecolor = 'black', facecolor='white')
ax2.plot(np.arange(n), ypred, color = 'salmon', label="Predicted")
ax2.plot(np.arange(n), np.dot(X, btrue), color = 'dodgerblue', label="True")
ax2.legend()
ax2.set_xlabel("Sample Index")
ax2.set_ylabel("y")
ax3.scatter(np.arange(p), btrue, edgecolor = 'black', facecolor='white', label="True")
ax3.scatter(np.arange(p), bpred, label="Predicted")
ax3.legend()
ax3.set_xlabel("Predictor Index")
ax3.set_ylabel("wb")
# nstep = min(80, niter - 2)
# ax4.scatter(np.arange(nstep), mll_path[-nstep:], edgecolor = 'black', facecolor='white', label="Evidence")
# ax4.plot(np.arange(nstep), elbo_path[-nstep:], label="ELBO")
# ax4.legend()
# ax4.set_xlabel("Iterations")
# ax4.set_ylabel("ELBO / Evidence")
plt.tight_layout()
plt.show()
However, if I set $\mathbf{\Lambda}_w = \mathbf{\Lambda}_b = \mathbf{0}$, then we recover the simple EM updates, which lead to the optimal prediction.
m3 = ebmr_WB2(X, y, use_wb_variance=False)
s2, sb2, sw2, mub, sigmab, W, sigmaW, niter, elbo_path, mll_path = m3
bpred = mub * W
ypred = np.dot(X, bpred)
fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
#ax4 = fig.add_subplot(224)
yvals = np.log(np.max(elbo_path[1:]) - elbo_path[1:] + 1)
ax1.scatter(np.arange(niter), yvals, edgecolor = 'black', facecolor='white')
ax1.plot(np.arange(niter), yvals)
ax1.set_xlabel("Iterations")
ax1.set_ylabel("log (max(ELBO) - ELBO[itn] + 1)")
ax2.scatter(np.arange(n), y, edgecolor = 'black', facecolor='white')
ax2.plot(np.arange(n), ypred, color = 'salmon', label="Predicted")
ax2.plot(np.arange(n), np.dot(X, btrue), color = 'dodgerblue', label="True")
ax2.legend()
ax2.set_xlabel("Sample Index")
ax2.set_ylabel("y")
ax3.scatter(np.arange(p), btrue, edgecolor = 'black', facecolor='white', label="True")
ax3.scatter(np.arange(p), bpred, label="Predicted")
ax3.legend()
ax3.set_xlabel("Predictor Index")
ax3.set_ylabel("wb")
# nstep = min(80, niter)
# ax4.scatter(np.arange(nstep), mll_path[-nstep:], edgecolor = 'black', facecolor='white', label="Evidence")
# ax4.plot(np.arange(nstep), elbo_path[-nstep:], label="ELBO")
# ax4.legend()
# ax4.set_xlabel("Iterations")
# ax4.set_ylabel("ELBO / Evidence")
plt.tight_layout()
plt.show()
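Finally, I check the behaviour of the Factorization 2 updates on a different simulation: sparse effects with a few strong coefficients and very low noise.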
def lasso_data(nsample, nvar, neff, errsigma, sb2 = 100, seed=200):
np.random.seed(seed)
X = np.random.normal(0, 1, nsample * nvar).reshape(nsample, nvar)
X = standardize(X)
btrue = np.zeros(nvar)
bidx = np.random.choice(nvar, neff , replace = False)
btrue[bidx] = np.random.normal(0, np.sqrt(sb2), neff)
y = np.dot(X, btrue) + np.random.normal(0, errsigma, nsample)
y = y - np.mean(y)
#y = y / np.std(y)
return X, y, btrue
def lims_xy(ax):
lims = [
np.min([ax.get_xlim(), ax.get_ylim()]), # min of both axes
np.max([ax.get_xlim(), ax.get_ylim()]), # max of both axes
]
return lims
def plot_diag(ax):
lims = lims_xy(ax)
ax.plot(lims, lims, ls='dotted', color='gray')
peff = 10
sb2 = 100.0
sd = 0.0001
X, y, btrue = lasso_data(n, p, peff, sd, sb2)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(np.dot(X,btrue), y)
plot_diag(ax1)
ax1.set_xlabel("Xb")
ax1.set_ylabel("y")
plt.show()
It recovers the coefficients almost correctly, but there is a systematic tendency to underestimate them.
m4 = ebmr_WB2(X, y)
s2, sb2, sw2, mub, sigmab, W, sigmaW, niter, elbo_path, mll_path = m4
bpred = mub * W
ypred = np.dot(X, bpred)
fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
#ax4 = fig.add_subplot(224)
yvals = np.log(np.max(elbo_path[1:]) - elbo_path[1:] + 1)
ax1.scatter(np.arange(niter), yvals, edgecolor = 'black', facecolor='white')
ax1.plot(np.arange(niter), yvals)
ax1.set_xlabel("Iterations")
ax1.set_ylabel("log (max(ELBO) - ELBO[itn] + 1)")
ax2.scatter(np.dot(X,btrue), y, edgecolor = 'black', facecolor='white', label="True")
ax2.scatter(np.dot(X,btrue), ypred, color = 'salmon', label="Predicted")
plot_diag(ax2)
ax2.legend()
ax2.set_xlabel("Xb")
ax2.set_ylabel("y")
ax3.scatter(np.arange(p), btrue, edgecolor = 'black', facecolor='white', label="True")
ax3.scatter(np.arange(p), bpred, label="Predicted")
ax3.legend()
ax3.set_xlabel("Predictor Index")
ax3.set_ylabel("wb")
# nstep = min(80, niter)
# ax4.scatter(np.arange(nstep), mll_path[-nstep:], edgecolor = 'black', facecolor='white', label="Evidence")
# ax4.plot(np.arange(nstep), elbo_path[-nstep:], label="ELBO")
# ax4.legend()
# ax4.set_xlabel("Iterations")
# ax4.set_ylabel("ELBO / Evidence")
plt.tight_layout()
plt.show()
sigmab.trace()
sigmaW.trace()
np.diag(sigmab)[btrue != 0]
np.diag(sigmaW)[btrue != 0]
s2
sb2
sw2
(mub * W)[btrue != 0]
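To make the underestimation explicit, the true and fitted effects can be put side by side (a quick check using the variables defined above):
np.column_stack([btrue[btrue != 0], (mub * W)[btrue != 0]])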