About

I am trying to understand the problem in the VEB implementation of multiple regression with a product-of-normals prior. Here, I numerically check the underlying distributions for a special case with only two predictors. For the single-predictor ($p=1$) analysis, check here.

import numpy as np
import pandas as pd
from scipy import linalg as sc_linalg
from scipy import special as sc_special
import matplotlib.pyplot as plt

import mpl_stylesheet
import mpl_utils
from matplotlib import cm
from matplotlib import ticker as plticker
from mpl_toolkits.axes_grid1 import make_axes_locatable
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)

Toy model

I defined a toy model with two predictors and 50 samples. I sampled the predictors from the standard normal distribution $\mathcal{N}(0, 1)$. The model used is described in the writeup.
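
For reference, the generative model, as implemented in simulate_data below, is

$$y = X (b \circ w) + \epsilon, \qquad b_i \sim \mathcal{N}(0, \sigma_b^2), \quad w_i \sim \mathcal{N}(0, \sigma_w^2), \quad \epsilon \sim \mathcal{N}(0, \sigma^2 I_n),$$

where $\circ$ denotes the element-wise product.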

I chose $\sigma = 15.0$, $\sigma_b = 2.0$ and $\sigma_w = 4.0$. Let the true values for $b$ and $w$ be denoted by $\hat{b}$ and $\hat{w}$. I calculated the log posterior of $bw$ as the sum of the log prior and the log likelihood, and normalized it over a discrete grid of $bw$.
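
Since each $b_j w_j$ is a product of independent zero-mean normals, the prior density of $z = bw$ has the standard product-normal form used in prod_norm_prior_pdf below,

$$p(z) = \frac{1}{\pi \sigma_b \sigma_w} K_0\!\left(\frac{|z|}{\sigma_b \sigma_w}\right),$$

where $K_0$ is the modified Bessel function of the second kind (scipy.special.kn(0, x)).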

def variance_explained(y, ypred):
    ss_err = np.sum(np.square(y - ypred))
    ss_tot = np.sum(np.square(y - np.mean(y)))
    r2 = 1 - (ss_err / ss_tot)
    return r2

def prod_norm_prior_pdf(z, s1, s2):
    # density of z = b * w, with b ~ N(0, s1^2) and w ~ N(0, s2^2):
    # p(z) = K0(|z| / (s1 * s2)) / (pi * s1 * s2)
    x = np.abs(z) / s1 / s2
    prob = sc_special.kn(0, x) / (np.pi * s1 * s2)
    return prob

def prod_norm_prior_gridpdf(b1, b2, s1, s2):
    # joint prior on a 2d grid: b1*w1 and b2*w2 are independent a priori,
    # so the joint density is the outer product of the marginal densities
    n1 = b1.shape[0]
    n2 = b2.shape[0]
    prob = np.zeros((n2, n1))
    pdf2 = prod_norm_prior_pdf(b2, s1, s2)
    for i, _b1 in enumerate(b1):
        pdf1 = prod_norm_prior_pdf(_b1, s1, s2)
        prob[:, i] = pdf1 * pdf2
    return prob
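
As a quick sanity check (this snippet is illustrative and not part of the original analysis), the analytical prior density should integrate to roughly 1 over a wide grid. The grid below uses an even number of points over a symmetric range, so it avoids $z = 0$, where $K_0$ has an integrable singularity.

zcheck = np.linspace(-60, 60, 4000)   # even count, so z = 0 is not a grid point
pdfcheck = prod_norm_prior_pdf(zcheck, 2.0, 4.0)
print(np.trapz(pdfcheck, zcheck))     # expect a value close to 1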

def normal_logdensity_onesample(z, m, s2):
    logdensity = - 0.5 * np.log(2 * np.pi * s2) \
                 - 0.5 * (z - m) * (z - m) / s2
    return logdensity

def normal_logdensity(y, mean, sigma2):
    n = y.shape[0]
    logdensity = - 0.5 * n * np.log(2 * np.pi * sigma2) \
                 - 0.5 * np.sum(np.square(y - mean)) / sigma2
    return logdensity

def data_log_likelihood(X, y, b1, b2, sigma):
    n1 = b1.shape[0]
    n2 = b2.shape[0]
    ll = np.zeros((n2, n1))
    for i, _b1 in enumerate(b1):
        for j, _b2 in enumerate(b2):
            mean = np.dot(X, np.array([_b1, _b2]))
            ll[j, i] = normal_logdensity(y, mean, sigma * sigma)
    return ll

def normalize_by_volume(x, y, z):
    '''
    y is the row, x is the column of z
    '''
    grid_area = np.einsum('i, j -> ij', np.diff(y), np.diff(x))
    volume = np.sum(np.multiply(z[1:, 1:], grid_area))
    return z / volume

def normalize_logdensity_by_volume(x, y, lnz):
    z = np.exp(lnz)
    grid_area = np.einsum('i, j -> ij', np.diff(y), np.diff(x))
    V = np.sum(np.multiply(z[1:, 1:], grid_area))
    return lnz - np.log(V)
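
A similar illustrative check for the normalizer (not part of the original analysis): after normalize_logdensity_by_volume, the exponentiated grid should have unit volume under the same grid-cell quadrature.

xg = np.linspace(0, 1, 11)
yg = np.linspace(0, 2, 21)
zg = np.random.rand(21, 11) + 0.1                 # arbitrary positive surface
lnzg = normalize_logdensity_by_volume(xg, yg, np.log(zg))
cell_area = np.einsum('i, j -> ij', np.diff(yg), np.diff(xg))
print(np.sum(np.exp(lnzg)[1:, 1:] * cell_area))   # expect 1.0 up to round-off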

# Use Green's theorem to compute the area
# enclosed by a given contour.
def area_greens(vs):
    xvs = vs[:, 0]
    yvs = vs[:, 1]
    area = 0.5 * np.sum(xvs[:-1] * np.diff(yvs) - yvs[:-1] * np.diff(xvs))
    return np.abs(area)
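
A quick test of area_greens on a known contour (illustrative only): the vertices of a unit circle should enclose an area of roughly $\pi$.

theta = np.linspace(0, 2 * np.pi, 1000)
circle = np.column_stack([np.cos(theta), np.sin(theta)])
print(area_greens(circle))   # expect ~3.1416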

def plot_contours(ax, X, Y, Z, beta, norm, cstep = 10, 
                  xlabel = "", ylabel = "", zlabel = "",
                  nlevels = 5, nstd = 0, showContourArea=False,
                  showbeta=False, showZmax=False, showColbar=False):
    zmin = np.min(Z) - nstd * np.std(Z)
    zmax = np.max(Z) + nstd * np.std(Z)
    ind = np.unravel_index(np.argmax(Z, axis=None), Z.shape)

    levels = np.linspace(zmin, zmax, 200)
    clevels = np.linspace(zmin, zmax, nlevels)
    cmap = cm.YlOrRd

    if norm:
        cset1 = ax.contourf(X, Y, Z, levels, norm = norm,
                            cmap=cm.get_cmap(cmap, len(levels) - 1))
    else:
        cset1 = ax.contourf(X, Y, Z, levels,
                            cmap=cm.get_cmap(cmap, len(levels) - 1))
    cset2 = ax.contour(X, Y, Z, clevels, colors='k')
    
    if showContourArea:            
        for i, contour in enumerate(cset2.collections):
            cpaths = contour.get_paths()
            if len(cpaths) > 0:
                vs = contour.get_paths()[0].vertices
                # Compute area enclosed by vertices.
                a = area_greens(vs)
                print ("r = " + str(clevels[i]) + ": a =" + str(a))
            
    ax.clabel(cset2, inline=1, fontsize=10)
    for c in cset2.collections:
        c.set_linestyle('solid')

    ax.set_aspect("equal")
    if showbeta:
        ax.scatter(beta[0], beta[1], color = 'blue', s = 100)
    if showZmax:
        ax.scatter(X[ind[1]], Y[ind[0]], color = 'k', s = 100)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    if showColbar:
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="5%", pad=0.2)
        cbar = plt.colorbar(cset1, cax=cax)
        ytickpos = np.arange(int(zmin / cstep) * cstep, zmax, cstep)
        cbar.set_ticks(ytickpos)
        if zlabel:
            cax.set_ylabel(zlabel)
    return cset1, cset2, clevels

def simulate_data(n, p, s, sb, sw, seed = 200):
    np.random.seed(seed)
    b = np.random.normal(0, sb, p)
    w = np.random.normal(0, sw, p)
    bw = np.multiply(b, w)
    X = np.zeros((n, p))
    for i in range(p):
        X[:, i] = np.random.normal(0, 1, n)
    err = np.random.normal(0, s, n)
    y = np.dot(X, bw) + err
    return X, y, b, w, bw

def print_matrix(matrix, fmt):
    print('\n'.join(['\t'.join([fmt.format(cell) for cell in row]) for row in matrix]))

nsample = 50
npred = 2
sigbtrue = 2.0
sigwtrue = 4.0
sigtrue = 15.0
X, y, btrue, wtrue, bwtrue = simulate_data(nsample, npred, sigtrue, sigbtrue, sigwtrue, seed = 300)
ypredtrue = np.dot(X, bwtrue)
r2 = variance_explained(y, ypredtrue)
bwprior = np.multiply(np.random.normal(0, sigbtrue, 1000),
                      np.random.normal(0, sigwtrue, 1000))
# mcmc_trace, mcmc_posterior = gibbs(X, y, sigtrue, sigbtrue, sigwtrue,
#                                    burn_iter = 1000, max_iter = 10000)

print(f"Fraction of variance explained by X: {r2:.2f}")
Fraction of variance explained by X: 0.66

bwvals = np.linspace(-40, 40, 1000)
bwvals1 = np.linspace(13, 23, 100)
bwvals2 = np.linspace(-3, 7, 80)
numerical_prior_1d = prod_norm_prior_pdf(bwvals, sigbtrue, sigwtrue)
numerical_prior = prod_norm_prior_gridpdf(bwvals1, bwvals2, sigbtrue, sigwtrue)
numerical_logll = data_log_likelihood(X, y, bwvals1, bwvals2, sigtrue)
numerical_logll = normalize_logdensity_by_volume(bwvals1, bwvals2, numerical_logll)
numerical_logpost = numerical_logll + np.log(numerical_prior)
numerical_logpost = normalize_logdensity_by_volume(bwvals1, bwvals2, numerical_logpost)

fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

ax1.scatter(ypredtrue, y)
mpl_utils.plot_diag(ax1)
ax1.text(0.02, 0.94, f"bw = ({bwtrue[0]:.2f}, {bwtrue[1]:.2f})", transform = ax1.transAxes)
#ax1.set_title("y = Xbw + e", pad = 20.0)
ax1.set_xlabel("Xbw")
ax1.set_ylabel("y")

ax2.hist(bwprior, bins = 50, density = True, alpha = 0.4, label="Simulation")
ax2.plot(bwvals, numerical_prior_1d, label = "Analytical")
ax2.legend()
ax2.set_title("Prior distribution", pad = 20.0)
ax2.set_xlabel("bw")
ax2.set_ylabel("Density")

zdata = np.exp(numerical_logll)
norm = cm.colors.Normalize(vmin=np.min(zdata), vmax=np.max(zdata))
cs1, cs2, zlevels = \
    plot_contours(ax3, bwvals1, bwvals2, zdata, 
                  bwtrue, norm = norm, cstep = 10, nlevels = 5,
                  showbeta = True, showZmax = False,
                  xlabel = r"$b_1 w_1$",
                  ylabel = r"$b_2 w_2$",
                  zlabel = "Likelihood",
                 )
ax3.set_title("Likelihood", pad = 20.0)

zdata = np.exp(numerical_logpost)
norm = cm.colors.Normalize(vmin=np.min(zdata), vmax=np.max(zdata))
cs1, cs2, zlevels = \
    plot_contours(ax4, bwvals1, bwvals2, zdata, 
                  bwtrue, norm = norm, cstep = 10, nlevels = 5,
                  showbeta = True, showZmax = False,
                  xlabel = r"$b_1 w_1$",
                  ylabel = r"$b_2 w_2$",
                  zlabel = "Posterior",
                 )
ax4.set_title("Posterior", pad = 20.0)

plt.tight_layout()
plt.show()

Variational approximation with fixed $\sigma$, $\sigma_b$ and $\sigma_w$

Here, I obtain the variational parameters using coordinate ascent updates of $m_b$, $s_b$, $m_w$ and $s_w$, while the hyperparameters are fixed at their true values, $\sigma = 15.0$, $\sigma_b = 2.0$ and $\sigma_w = 4.0$.
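
As implemented below (and consistent with how qposterior_hist2d draws samples), the variational family is mean-field across $b$ and $w$ with full-covariance Gaussian factors,

$$q(b, w) = \mathcal{N}(b;\, m_b, S_b)\; \mathcal{N}(w;\, m_w, S_w),$$

where $S_b$ and $S_w$ are the $p \times p$ covariance matrices denoted covb and covw in the code.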

def get_elbo(X, XTX, XTy, yTy, s2, sb2, sw2, mb, mw, covb, covw):
    n, p = X.shape    
    elbo = cfunc(n, p, yTy, s2, sb2, sw2, covb, covw) \
            - hfunc(mb, covb, sb2) \
            - efunc(XTX, XTy, mb, mw, covb, covw, s2, sw2)
    return elbo

def get_elbo_grid(X, y, s, sb, sw, mb, mw, covb, covw, bwvals1, bwvals2, debug = False):
    k1 = bwvals1.shape[0]
    k2 = bwvals2.shape[0]
    elbo_grid = np.zeros((k2, k1))
    s2  = s * s
    sb2 = sb * sb
    sw2 = sw * sw
    XTX = np.dot(X.T, X)
    XTy = np.dot(X.T, y)
    yTy = np.dot(y.T, y)
    for i in range(k1):
        for j in range(k2):
            _mb = np.array([bwvals1[i] / mw[0], bwvals2[j] / mw[1]])
            if debug: print(np.multiply(_mb, mw))
            elbo_grid[j, i] = get_elbo(X, XTX, XTy, yTy, s2, sb2, sw2, _mb, mw, covb, covw)
    return elbo_grid
    
def cfunc(n, p, yTy, s2, sb2, sw2, covb, covw):
    sign, det_covb = np.linalg.slogdet(covb)
    sign, det_covw = np.linalg.slogdet(covw)
    val  =   0.5 * p 
    val += - 0.5 * p * np.log(sb2) - 0.5 * p * np.log(sw2)
    val += - 0.5 * n * np.log(2.0 * np.pi * s2)
    val +=   0.5 * det_covb + 0.5 * det_covw
    val +=   0.5 * yTy / s2
    return val

def hfunc(m, cov, s2):
    h = np.sum(np.square(m)) + cov.trace()
    return 0.5 * h / s2

def efunc(XTX, XTy, mb, mw, covb, covw, sigma2, sigmaw2):
    p = mb.shape[0]
    mmTb = np.einsum('i,j->ij', mb, mb)
    EBTXTXB = np.multiply(XTX, mmTb + covb)
    Rb = (EBTXTXB / sigma2) + (np.eye(p) / sigmaw2)
    vb = np.einsum('i,i->i', mb, XTy) / sigma2
    t1 = 0.5 * np.linalg.multi_dot([mw.T, Rb, mw])
    t2 = np.dot(mw.T, vb)
    t3 = 0.5 * np.dot(Rb, covw).trace()
    return t1 - t2 + t3

def qposterior_hist2d(mb, mw, covb, covw, bwvals1, bwvals2, n=1e8, density=True):
    '''
    Draw n samples of b and w and form the element-wise products bw.
    For the 2d histogram,
        bwvals1 := bin edges on the x-axis
        bwvals2 := bin edges on the y-axis
    Returns
        bw_counts := bin counts (or density) on a grid with bwvals1 on the x-axis (columns)
                     and bwvals2 on the y-axis (rows)
    '''
    bvals = np.random.multivariate_normal(mb, covb, int(n))
    wvals = np.random.multivariate_normal(mw, covw, int(n))
    bw_samples = np.multiply(bvals, wvals)
    xbins = np.digitize(bw_samples[:, 0], bwvals1)
    ybins = np.digitize(bw_samples[:, 1], bwvals2)
    bwbins = np.array([xbins, ybins]).T
    k1 = bwvals1.shape[0]
    k2 = bwvals2.shape[0]
    bw_counts = np.zeros((k2, k1))
    for i in range(1, k1):
        for j in range(1, k2):
            bw_counts[j, i] = np.sum((bwbins[:, None] == np.array([i, j])).all(-1))
    if density:
        bw_counts = normalize_by_volume(bwvals1, bwvals2, bw_counts)
    # Another way to do this would be:
    #     bw_hist2d, _, _ = np.histogram2d(bw_samples[:, 0], bw_samples[:, 1],
    #                                      [bwvals1, bwvals2], density = density)
    # Implement after debugging; see the sketch below.
    return bw_counts
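
The np.histogram2d alternative noted above could look like the sketch below (a convenience I have not used for the figures here; it should match the loop version up to binning-edge conventions). histogram2d lays out its first argument along rows, so the output has to be transposed and padded to match the (k2, k1) layout of the loop version.

def qposterior_hist2d_fast(mb, mw, covb, covw, bwvals1, bwvals2, n = 1e6):
    bvals = np.random.multivariate_normal(mb, covb, int(n))
    wvals = np.random.multivariate_normal(mw, covw, int(n))
    bw_samples = np.multiply(bvals, wvals)
    hist, _, _ = np.histogram2d(bw_samples[:, 0], bw_samples[:, 1],
                                bins = [bwvals1, bwvals2], density = True)
    # np.histogram2d returns shape (k1 - 1, k2 - 1) with x along rows;
    # transpose and pad to the (k2, k1) grid used by the loop version
    bw_counts = np.zeros((bwvals2.shape[0], bwvals1.shape[0]))
    bw_counts[1:, 1:] = hist.T
    return bw_counts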

def veb_ridge_step(XTX, XTy, sigma2, sigmab2, mw, covw):
    p = XTy.shape[0]
    mmT = np.einsum('i,j->ij', mw, mw)
    EWTXTXW = np.multiply(XTX, mmT + covw)
    covbinv = (EWTXTXW / sigma2) + (np.eye(p) / sigmab2)
    covb = np.linalg.inv(covbinv)
    mb = np.linalg.multi_dot([covb, np.diag(mw), XTy]) / sigma2
    return mb, covb
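
Reading the update off veb_ridge_step: given the current $q(w)$, the new $q(b)$ is

$$S_b^{-1} = \frac{1}{\sigma^2}\, X^\top X \circ \left(m_w m_w^\top + S_w\right) + \frac{1}{\sigma_b^2} I_p, \qquad m_b = \frac{1}{\sigma^2}\, S_b\, \mathrm{diag}(m_w)\, X^\top y,$$

and the update for $q(w)$ follows by exchanging the roles of $b$ and $w$ (with $\sigma_w^2$ in place of $\sigma_b^2$).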


def get_sigma_updates(X, y, XTX, XTy, yTy, mb, mw, covb, covw):
    n, p = X.shape
    mmTw = np.einsum('i,j->ij', mw, mw)
    mmTb = np.einsum('i,j->ij', mb, mb)
    EWTXTXW = np.multiply(XTX, mmTw + covw)
    mbmwXTy = np.dot(mb, np.einsum('i,i->i', mw, XTy))
    trace_cov = np.dot(EWTXTXW, mmTb + covb).trace()
    sigma2 = (yTy - 2 * mbmwXTy + trace_cov) / n
    sigmab2 = np.sum(np.square(mb) + np.diag(covb)) / p
    sigmaw2 = np.sum(np.square(mw) + np.diag(covw)) / p
    return sigma2, sigmab2, sigmaw2
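
For reference, the closed-form hyperparameter updates implemented in get_sigma_updates are

$$\sigma^2 = \frac{1}{n}\left( y^\top y - 2\, m_b^\top \left( m_w \circ X^\top y \right) + \mathrm{tr}\!\left[ \left( X^\top X \circ (m_w m_w^\top + S_w) \right) \left( m_b m_b^\top + S_b \right) \right] \right),$$

$$\sigma_b^2 = \frac{1}{p} \sum_{i} \left( m_{b,i}^2 + (S_b)_{ii} \right), \qquad \sigma_w^2 = \frac{1}{p} \sum_{i} \left( m_{w,i}^2 + (S_w)_{ii} \right).$$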


def veb_iridge(X, y,
         tol = 1e-8, max_iter = 10000,
         init_sigma = 1.0, init_sigmab = 1.0, init_sigmaw = 1.0,
         init_mb = 1.0, init_mw = 1.0, 
         init_sb = 1.0, init_sw = 1.0,
         use_convergence = True,
         update_sigma = True,
         update_sigmab = True,
         update_sigmaw = True,
         debug = True
        ):

    n, p = X.shape
    XTX = np.dot(X.T, X)
    XTy = np.dot(X.T, y)
    yTy = np.dot(y.T, y)
    elbo_path = np.zeros(max_iter + 1)
    
    # Initialize hyperparameters
    sigma   = init_sigma
    sigmab  = init_sigmab
    sigmaw  = init_sigmaw
    sigma2  = sigma  * sigma
    sigmab2 = sigmab * sigmab
    sigmaw2 = sigmaw * sigmaw
    
    # Initialize variational parameters
    covb = np.eye(p) * init_sb * init_sb
    covw = np.eye(p) * init_sw * init_sw
    mb   = np.repeat(init_mb, p)
    mw   = np.repeat(init_mw, p)
    
    niter = 0
    elbo_path[0] = -np.inf
    for itn in range(1, max_iter + 1):
        '''
        Update
        '''
        mb, covb = veb_ridge_step(XTX, XTy, sigma2, sigmab2, mw, covw)
        mw, covw = veb_ridge_step(XTX, XTy, sigma2, sigmaw2, mb, covb)
        
        if update_sigma or update_sigmab or update_sigmaw:
            _sigma2, _sigmab2, _sigmaw2 = get_sigma_updates(X, y, XTX, XTy, yTy, mb, mw, covb, covw)
            
        if update_sigma:  sigma2  = _sigma2
        if update_sigmab: sigmab2 = _sigmab2
        if update_sigmaw: sigmaw2 = _sigmaw2

        sigma  = np.sqrt(sigma2)
        sigmab = np.sqrt(sigmab2)
        sigmaw = np.sqrt(sigmaw2)
            
        if debug:
            print(f"Iteration {itn}")
            print(sigma2, sigmab2, sigmaw2)
        '''
        Convergence
        '''
        niter += 1
        elbo_path[itn] = get_elbo(X, XTX, XTy, yTy, sigma2, sigmab2, sigmaw2, mb, mw, covb, covw)
        if use_convergence:
            if elbo_path[itn] - elbo_path[itn - 1] < tol: break

    return mb, mw, covb, covw, niter, elbo_path[:niter + 1], sigma, sigmab, sigmaw

def veb_show_result(X, y,
                    vebres, ax1, ax2, ax3, ax4,
                    bwvals1, bwvals2, 
                    true_ypred, true_posterior,
                    use_logscale_elbo = True):
    mb, mw, covb, covw, niter, elbo_path, sigma, sigmab, sigmaw = vebres
    bw = np.multiply(mb, mw)
    ypred = np.dot(X, bw)
    veb_posterior = qposterior_hist2d(mb, mw, covb, covw, bwvals1, bwvals2, n = 1e6)
    
    # hyperparameter values implied by the final variational parameters
    _sigma2, _sigmab2, _sigmaw2 = \
        get_sigma_updates(X, y, np.dot(X.T, X), np.dot(X.T, y), np.dot(y.T, y),
                          mb, mw, covb, covw)
    sigma = np.sqrt(_sigma2)
    sigmab = np.sqrt(_sigmab2)
    sigmaw = np.sqrt(_sigmaw2)

    
    print(f"Optimized in {niter} iterations. Final ELBO: {elbo_path[-1]:.3f}")
    print("Optimum parameters:")
    print(f"mb = {mb[0]:g}, {mb[1]:g}")
    print(f"mw = {mw[0]:g}, {mw[1]:g}")
    print(f"sb")
    print_matrix(covb, "{:g}")
    print("")
    print(f"sw")
    print_matrix(covw, "{:g}")
    print("")
    
    max_elbo = np.max(elbo_path)
    elbo_diff = max_elbo - elbo_path[1:]
    min_diff = np.min(elbo_diff[elbo_diff!=0])
    elbo_diff[elbo_diff == 0] = min_diff

    ax1.scatter(np.arange(niter - 1), elbo_diff[:-1])
    ax1.plot(np.arange(niter - 1), elbo_diff[:-1])
    if use_logscale_elbo:
        ax1.set_yscale('log')
    ax1.set_xlabel('Iteration')
    ax1.set_ylabel('Distance to "best" ELBO')

    ax2.scatter(ypred, true_ypred)
    mpl_utils.plot_diag(ax2)
    ax2.text(0.02, 0.94, f"bw = ({bw[0]:.2f}, {bw[1]:.2f})", transform = ax2.transAxes)
    ax2.set_xlabel(r'VEB  $y_{\mathrm{pred}}$')
    ax2.set_ylabel(r'True  $y_{\mathrm{pred}}$')

    zdata = true_posterior
    norm = cm.colors.Normalize(vmin=np.min(zdata), vmax=np.max(zdata))
    cs1, cs2, zlevels = \
        plot_contours(ax4, bwvals1, bwvals2, zdata, 
                      bwtrue, norm = norm, cstep = 10, nlevels = 5,
                      showbeta = True, showZmax = False,
                      xlabel = r"$b_1 w_1$",
                      ylabel = r"$b_2 w_2$",
                      zlabel = "Posterior",
                     )
    ax4.set_title("True Posterior", pad = 20.0)

    zdata = veb_posterior
    cs1, cs2, zlevels = \
        plot_contours(ax3, bwvals1, bwvals2, zdata, 
                      bwtrue, norm = norm, cstep = 10, nlevels = 5,
                      showbeta = True, showZmax = False,
                      xlabel = r"$b_1 w_1$",
                      ylabel = r"$b_2 w_2$",
                      zlabel = "Posterior",
                     )
    ax3.set_title("Variational Posterior", pad = 20.0)
    return sigma, sigmab, sigmaw

veb1_res = veb_iridge(X, y, init_sigma = sigtrue, init_sigmab = sigbtrue, init_sigmaw = sigwtrue,
                     update_sigma = False, update_sigmab = False, update_sigmaw = False, debug = False)

fig = plt.figure(figsize=(12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

veb1_sigma, veb1_sigmab, veb1_sigmaw = \
    veb_show_result(X, y,
                    veb1_res, ax1, ax2, ax3, ax4,
                    bwvals1, bwvals2, 
                    ypredtrue, np.exp(numerical_logpost)
                   )

plt.tight_layout()
plt.show()
Optimized in 91 iterations. Final ELBO: -66.106
Optimum parameters:
mb = 3.04132, 0.349786
mw = 6.08147, 0.69957
sb
0.0964886	-0.0023612
-0.0023612	0.807325

sw
0.385802	-0.00944298
-0.00944298	3.2293

VEB optimization of all parameters

The problem starts when I try to obtain both the variational parameters $m_b$, $s_b$, $m_w$ and $s_w$ and the hyperparameters $\sigma$, $\sigma_b$ and $\sigma_w$ using the update scheme described in the writeup. I switched off the convergence criterion to check how the ELBO evolves over iterations (a direct check for decreases in the ELBO trace follows the code cell below).

I found that convergence depends on the initialization of $\sigma$, $\sigma_b$ and $\sigma_w$. Initializing at the true values of the parameters leads to "unnatural" ELBO updates. (I had observed similar ELBO updates in the trend filtering example.)

veb2_res = veb_iridge(X, y, init_sigma = sigtrue, init_sigmab = sigbtrue, init_sigmaw = sigwtrue,
                      debug = False,
                      use_convergence = False, max_iter = 100)
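
A direct check for the misbehavior mentioned above (a convenience snippet, not part of the original figures): coordinate ascent should never decrease the ELBO, so any negative difference in the trace flags a problem.

elbo_trace = veb2_res[5]                     # elbo_path returned by veb_iridge
# elbo_trace[0] is -inf (initialization), so compare from iteration 1 onwards
drops = np.where(np.diff(elbo_trace[1:]) < 0)[0] + 2
print("ELBO decreased at iterations:", drops)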

fig = plt.figure(figsize=(12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

veb2_sigma, veb2_sigmab, veb2_sigmaw = \
    veb_show_result(X, y,
                    veb2_res, ax1, ax2, ax3, ax4,
                    bwvals1, bwvals2, 
                    ypredtrue, np.exp(numerical_logpost),
                    use_logscale_elbo = True
                   )

plt.tight_layout()
plt.show()
Optimized in 100 iterations. Final ELBO: -68.941
Optimum parameters:
mb = 6.4937, 0.836119
mw = 2.86254, 0.368576
sb
0.446534	-0.0123073
-0.0123073	3.74056

sw
0.0867704	-0.00239156
-0.00239156	0.726864

Variational approximation with fixed $\sigma$

If I fix $\sigma$ and allow updates of $\sigma_b$ and $\sigma_w$, then the ELBO keeps increasing, provided $\sigma$ is initialized at its true value.

veb3_res = veb_iridge(X, y, init_sigma = sigtrue, init_sigmab = sigbtrue, init_sigmaw = sigwtrue,
                      debug = False, update_sigma = False,
                      use_convergence = False, max_iter = 100)

fig = plt.figure(figsize=(12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

veb3_sigma, veb3_sigmab, veb3_sigmaw = \
    veb_show_result(X, y,
                    veb3_res, ax1, ax2, ax3, ax4,
                    bwvals1, bwvals2, 
                    ypredtrue, np.exp(numerical_logpost),
                    use_logscale_elbo = True
                   )

plt.tight_layout()
plt.show()
Optimized in 100 iterations. Final ELBO: -66.035
Optimum parameters:
mb = 6.49685, 0.888305
mw = 2.86272, 0.391416
sb
0.437943	-0.0128269
-0.0128269	3.66886

sw
0.0850294	-0.00249043
-0.00249043	0.712332

Variational approximation with fixed $\sigma_b$ and $\sigma_w$

Here, I allow updates of $\sigma$ while keeping $\sigma_b$ and $\sigma_w$ fixed.

veb4_res = veb_iridge(X, y, init_sigma = sigtrue, init_sigmab = sigbtrue, init_sigmaw = sigwtrue,
                      debug = False, update_sigma = True, update_sigmab = False, update_sigmaw = False,
                      use_convergence = False, max_iter = 1000)

fig = plt.figure(figsize=(12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

veb4_sigma, veb4_sigmab, veb4_sigmaw = \
    veb_show_result(X, y,
                    veb4_res, ax1, ax2, ax3, ax4,
                    bwvals1, bwvals2, 
                    ypredtrue, np.exp(numerical_logpost),
                    use_logscale_elbo = True
                   )

plt.tight_layout()
plt.show()
Optimized in 1000 iterations. Final ELBO: -69.151
Optimum parameters:
mb = 3.03998, 0.317298
mw = 6.07996, 0.634596
sb
0.0984972	-0.00218552
-0.00218552	0.824218

sw
0.393989	-0.00874209
-0.00874209	3.29687

Initialization with very high $\sigma$ leads to well-behaved optimization

If I initialize $\sigma = 100.0$ and fix $\sigma_b = 2.0$ and $\sigma_w = 4.0$, then the ELBO is well-behaved (always increasing). The optimized values are the same as in the previous run, where I initialized with $\sigma = 15.0$ and allowed updates for 1000 iterations (without stopping the iterations with any convergence criterion).

veb5_res = veb_iridge(X, y, init_sigma = 100.0, init_sigmab = sigbtrue, init_sigmaw = sigwtrue,
                      debug = False, update_sigma = True, update_sigmab = False, update_sigmaw = False,
                      use_convergence = False, max_iter = 250)

fig = plt.figure(figsize=(12, 12))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

veb5_sigma, veb5_sigmab, veb5_sigmaw = \
    veb_show_result(X, y,
                    veb5_res, ax1, ax2, ax3, ax4,
                    bwvals1, bwvals2, 
                    ypredtrue, np.exp(numerical_logpost),
                    use_logscale_elbo = True
                   )

plt.tight_layout()
plt.show()
Optimized in 250 iterations. Final ELBO: -69.151
Optimum parameters:
mb = 3.03998, 0.317298
mw = 6.07996, 0.634596
sb
0.0984972	-0.00218552
-0.00218552	0.824218

sw
0.393989	-0.00874209
-0.00874209	3.29687

What are the expectations of $\sigma$, $\sigma_b$ and $\sigma_w$?

data = [[sigtrue,  sigbtrue,  sigwtrue],
        [veb1_sigma, veb1_sigmab, veb1_sigmaw],
        [veb2_sigma, veb2_sigmab, veb2_sigmaw],
        [veb3_sigma, veb3_sigmab, veb3_sigmaw],
        [veb4_sigma, veb4_sigmab, veb4_sigmaw],
        [veb5_sigma, veb5_sigmab, veb5_sigmaw]
       ]
colnames = ['s', 'sb', 'sw']
rownames = ['True', 'VEB-fix', 'VEB-full', 'VEB-fix-s', 'VEB-fix-sb-sw', 'VEB-high-s']
df = pd.DataFrame.from_records(data, columns = colnames, index = rownames)
df.style.format("{:.3f}")
                  s       sb      sw
True           15.000   2.000   4.000
VEB-fix        15.144   2.267   4.533
VEB-full       15.150   4.850   2.138
VEB-fix-s      15.138   4.853   2.138
VEB-fix-sb-sw  15.157   2.265   4.531
VEB-high-s     15.157   2.265   4.531