About

A sanity check that several ridge regression implementations (closed-form with fixed $\lambda$, EM, EBMR, scikit-learn's Ridge, SciPy's damped LSQR, and glmnet) yield similar fits on simulated data.
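
The simulated model throughout is the standard Bayesian ridge setup (my notation, matching `ridge_data` and `lmbda = np.square(sd)` below):

$$
y = Xb + \epsilon, \qquad \epsilon \sim \mathcal{N}(0, \sigma^2 I_n), \qquad b \sim \mathcal{N}(0, \sigma_b^2 I_p),
$$

so the fixed ridge penalty is $\lambda = \sigma^2/\sigma_b^2$, which reduces to $\lambda = \sigma^2$ because `btrue` is drawn with unit variance (up to the rescaling of $y$ inside `ridge_data`).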

import numpy as np
import scipy
from scipy import linalg as sc_linalg
from scipy import sparse as sc_sparse
import scipy.sparse.linalg  # sparse.linalg is not always loaded by the line above
from sklearn.linear_model import Ridge

import glmnet_python
from glmnet import glmnet
from glmnetPrint import glmnetPrint
from glmnetCoef import glmnetCoef
from glmnetPredict import glmnetPredict

import matplotlib.pyplot as plt
import sys
sys.path.append("../../utils/")
import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
def standardize(X):
    # Center each column, then scale to unit L2 norm
    # (not unit standard deviation)
    Xnorm = X - np.mean(X, axis = 0)
    Xstd = Xnorm / np.sqrt((Xnorm * Xnorm).sum(axis = 0))
    return Xstd

def ridge_data(nsample, nvar, errsigma):
    # Simulate y = X @ btrue + noise with noise sd = errsigma;
    # X is standardized, y is centered and scaled to unit variance.
    X = np.random.normal(0, 1, nsample * nvar).reshape(nsample, nvar)
    X = standardize(X)
    btrue = np.random.normal(0, 1, nvar)
    y = np.dot(X, btrue) + np.random.normal(0, errsigma, nsample)
    y = y - np.mean(y)
    y = y / np.std(y)
    return X, y, btrue
def rsquare(ytrue, ypred):
    sst = np.sum(np.square(ytrue - np.mean(ytrue)))
    sse = np.sum(np.square(ytrue - ypred))
    rsq = 1 - (sse / sst)
    return rsq
def logpdf_multivariate_gauss(x, mu, cov):
    '''
    Calculate the log multivariate normal density (pdf)

    Keyword arguments:
        x = numpy array of a "d x 1" sample vector
        mu = numpy array of a "d x 1" mean vector
        cov = numpy array of a "d x d" covariance matrix
    '''
    assert(mu.shape[0] > mu.shape[1]), 'mu must be a column vector'
    assert(x.shape[0] > x.shape[1]), 'x must be a column vector'
    assert(cov.shape[0] == cov.shape[1]), 'covariance matrix must be square'
    assert(mu.shape[0] == cov.shape[0]), 'cov and mu must have the same dimensions'
    assert(mu.shape[0] == x.shape[0]), 'mu and x must have the same dimensions'
    d = x.shape[0]
    part1 = - 0.5 * d * np.log(2. * np.pi) - 0.5 * np.linalg.slogdet(cov)[1]
    xlm = x - mu
    part2 =  - 0.5 * np.dot(xlm.T, np.dot(np.linalg.inv(cov), xlm))
    return float(part1 + part2)

def ridge_em(X, Y, s2, sb2, niter = 10):
    '''
    EM for ridge regression: y ~ N(Xb, s2 * I), b ~ N(0, sb2 * I).
    E-step: posterior mean / variance of b; M-step: update s2 and sb2.
    '''
    XTX = np.dot(X.T, X)
    XTY = np.dot(X.T, Y)
    YTY = np.dot(Y.T, Y)
    nsample = X.shape[0]
    nvar = X.shape[1]
    loglik = np.zeros(niter)
    i = 0
    while i < niter:
        V = XTX + np.eye(nvar) * (s2 / sb2)
        Vinv = sc_linalg.cho_solve(sc_linalg.cho_factor(V, lower=True), np.eye(nvar))
        # marginal likelihood of Y, for monitoring convergence
        SigmaY = sb2 * np.dot(X, X.T) + np.eye(nsample) * s2
        loglik[i] = logpdf_multivariate_gauss(Y.reshape(-1, 1), np.zeros((nsample, 1)), SigmaY)
        Sigmab = s2 * Vinv  # posterior variance of b
        mub = np.dot(Vinv, XTY)  # posterior mean of b
        b2m = np.einsum('i,j->ij', mub, mub) + Sigmab  # E[b b^T]
        s2 = (YTY + np.dot(XTX, b2m).trace() - 2 * np.dot(XTY, mub)) / nsample
        sb2 = np.sum(np.square(mub) + np.diag(Sigmab)) / nvar
        i += 1
    return s2, sb2, loglik, mub.reshape(-1), Sigmab
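
For reference, these are the updates the loop above implements (transcribed from the code):

$$
\Sigma_b = \sigma^2\Big(X^\top X + \frac{\sigma^2}{\sigma_b^2} I\Big)^{-1}, \qquad
\mu_b = \Big(X^\top X + \frac{\sigma^2}{\sigma_b^2} I\Big)^{-1} X^\top y,
$$

$$
\sigma^2 \leftarrow \frac{1}{n}\Big(y^\top y - 2\,\mu_b^\top X^\top y + \mathrm{tr}\big(X^\top X\,\mathbb{E}[bb^\top]\big)\Big), \qquad
\sigma_b^2 \leftarrow \frac{1}{p}\sum_j \big(\mu_{b,j}^2 + (\Sigma_b)_{jj}\big).
$$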
def ridge_ols(X, Y, lmbda):
    XTX = np.dot(X.T, X)
    XTY = np.dot(X.T, Y)
    nvar = X.shape[1]
    V = XTX + np.eye(nvar) * lmbda
    Vinv = sc_linalg.cho_solve(sc_linalg.cho_factor(V, lower=True), np.eye(nvar))
    bhat = np.dot(Vinv, XTY)
    return bhat
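
`ridge_ols` is the closed-form ridge estimate

$$
\hat{b} = \big(X^\top X + \lambda I\big)^{-1} X^\top y,
$$

with the inverse obtained through a Cholesky factorization rather than `np.linalg.inv`.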
def svd2XTX(svd):
    # Reconstruct X^T X from the SVD X = U S V^h as V^h.T @ S^2 @ V^h,
    # padding S^2 with zeros when X has more columns than singular values
    U = svd[0]
    S = svd[1]
    Vh = svd[2]
    nmax = max(S.shape[0], Vh.shape[0])
    Sdiag = np.zeros((nmax, nmax))
    Sdiag[np.diag_indices(S.shape[0])] = np.square(S)
    return np.dot(Vh.T, np.dot(Sdiag, Vh))
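
A quick check that `svd2XTX` reproduces $X^\top X$ for both wide and tall matrices (a minimal sketch; the test shapes are my choice):

Xchk = np.random.normal(0, 1, (5, 8))   # wide: more columns than rows
assert np.allclose(svd2XTX(sc_linalg.svd(Xchk)), np.dot(Xchk.T, Xchk))
Xchk = np.random.normal(0, 1, (8, 5))   # tall: more rows than columns
assert np.allclose(svd2XTX(sc_linalg.svd(Xchk)), np.dot(Xchk.T, Xchk))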

def c_func(nsample, s2, ElogW):
    # constant terms of the ELBO: likelihood normalization and E[log w]
    val  = - 0.5 * nsample * np.log(2. * np.pi * s2)
    val += - 0.5 * np.sum(ElogW)
    return val

def h1_func(X, Y, s2, mu, Wbar):
    # residual sum of squares at the posterior mean, plus the
    # prior quadratic penalty on mu, scaled by 1 / (2 s2)
    val = - (0.5 / s2) * (np.sum(np.square(Y - np.dot(X, mu))) \
                          + np.sum(np.square(mu) / Wbar))
    return val

def h2_func(svd, Sigma, Wbar):
    # trace and log-determinant terms involving Sigma
    XTX = svd2XTX(svd)
    (sign, logdet) = np.linalg.slogdet(Sigma)
    val = - 0.5 * np.trace(np.dot(XTX + np.diag(1 / Wbar), Sigma)) + 0.5 * logdet
    return val
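
Putting the three pieces together, the objective monitored in `ebmr` below is (transcribed from `update_elbo`, with $\bar w$ for `Wbar`):

$$
F = -\frac{n}{2}\log(2\pi\sigma^2) - \frac{1}{2}\sum_j \log \bar w_j
    - \frac{1}{2\sigma^2}\Big(\lVert y - X\mu\rVert^2 + \sum_j \frac{\mu_j^2}{\bar w_j}\Big)
    - \frac{1}{2}\,\mathrm{tr}\Big(\big(X^\top X + \mathrm{diag}(1/\bar w)\big)\,\Sigma\Big)
    + \frac{1}{2}\log\lvert\Sigma\rvert + \mathrm{KL}_w.
$$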

def ebmr_initialize(X, Y):
    nvar = X.shape[1]
    svd = sc_linalg.svd(X)
    XTY = np.dot(X.T, Y)
    mu  = np.zeros(nvar)
    Sigma = np.zeros((nvar, nvar))
    return svd, XTY, mu, Sigma

def update_Sigma(svd, Wbar, nvar):
    XTX = svd2XTX(svd)
    Sigma = sc_linalg.cho_solve(sc_linalg.cho_factor(XTX + np.diag(1 / Wbar), lower=True), np.eye(nvar))
    return Sigma

def update_mu(Sigma, XTY):
    return np.dot(Sigma, XTY)

def update_s2(X, Y, mu, Wbar, nsample):
    A = np.sum(np.square(Y - np.dot(X, mu)))
    s2 = (A + np.sum(np.square(mu) / Wbar)) / nsample
    return s2
    
def update_wg_ridge(mu, Sigma, s2, nvar):
    # ridge prior: a single shared w for all coefficients,
    # so the KL term is zero
    bj2 = np.square(mu) + np.diag(Sigma) * s2  # E[b_j^2]
    W = np.repeat(np.sum(bj2) / s2 / nvar, nvar)
    KLW = 0.
    return W, KLW

def update_elbo(X, Y, s2, mu, Sigma, Wbar, KLw, svd, nsample, nvar):
    ElogW = np.log(Wbar)
    elbo = c_func(nsample, s2, ElogW) \
           + h1_func(X, Y, s2, mu, Wbar) \
           + h2_func(svd, Sigma, Wbar) \
           + KLw
    return elbo
    
def ebmr(X, Y, niter = 10, tol = 1e-4):
    nvar = X.shape[1]
    nsample = X.shape[0]
    svdX, XTY, mu, Sigma = ebmr_initialize(X, Y)
    s2 = np.var(Y)
    Wbar = np.ones(nvar)
    elbo = -np.inf
    
    i = 0
    while i < niter:
        Sigma = update_Sigma(svdX, Wbar, nvar)
        mu    = update_mu(Sigma, XTY)
        s2    = update_s2(X, Y, mu, Wbar, nsample)
        Wbar, KLw  = update_wg_ridge(mu, Sigma, s2, nvar)
        
        elbo_new = update_elbo(X, Y, s2, mu, Sigma, Wbar, KLw, svdX, nsample, nvar)
        if elbo_new - elbo < tol: break
        elbo = elbo_new
        i += 1

    return s2, mu, Sigma, Wbar
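
The model behind this EBMR sketch, as I read the updates above, is

$$
y \sim \mathcal{N}(Xb, \sigma^2 I_n), \qquad b_j \sim \mathcal{N}(0, \sigma^2 w_j),
$$

where the ridge variant of the prior on $w$ forces a single shared value $w_j = w$, which is why `update_wg_ridge` returns a constant vector and `KLW = 0`.
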
nsample = 50
nvar = 100
nsim = 20
errsigmas = np.logspace(-0.1, 1, 5)

r2 = [None for i in errsigmas]


for i, sd in enumerate(errsigmas):
    
    lmbda = np.square(sd)
    
    r2[i] = dict()
    r2[i]['ridge_mle'] = list()
    r2[i]['ridge_em'] = list()
    r2[i]['ebmr'] = list()
    r2[i]['sklearn'] = list()
    r2[i]['sp_lsqr'] = list()
    r2[i]['glmnet'] = list()
    
    for isim in range(nsim):
        X, y, btrue = ridge_data(nsample, nvar, sd)
        
        # Ridge MLE (closed-form solution with fixed lambda)
        b_ridge_ols = ridge_ols(X, y, lmbda)
        y_ridge_ols = np.dot(X, b_ridge_ols)
        r2[i]['ridge_mle'].append(rsquare(y, y_ridge_ols))
    
        # Ridge EM
        _, _, _, b_ridge_em, _ = ridge_em(X, y, 1, 1, 500)
        r2[i]['ridge_em'].append(rsquare(y, np.dot(X, b_ridge_em)))

        # EBMR
        _, b_ebmr, _, _ = ebmr(X, y, 1000)
        y_ebmr = np.dot(X, b_ebmr)
        r2[i]['ebmr'].append(rsquare(y, y_ebmr))

        # scikit-learn Ridge with the sparse_cg solver
        clf = Ridge(alpha = lmbda, fit_intercept = False, solver = 'sparse_cg')
        clf.fit(X, y)
        b_sklearn = clf.coef_
        y_sklearn = np.dot(X, b_sklearn)
        r2[i]['sklearn'].append(rsquare(y, y_sklearn))
        
        # SciPy's damped LSQR minimizes ||Xb - y||^2 + damp^2 ||b||^2,
        # so damp = sqrt(lambda) gives the ridge penalty
        b_sp_lsqr = sc_sparse.linalg.lsqr(X, y, damp = np.sqrt(lmbda))[0]
        y_sp_lsqr = np.dot(X, b_sp_lsqr)
        r2[i]['sp_lsqr'].append(rsquare(y, y_sp_lsqr))
        
        # glmnet scales the residual sum of squares by 1/(2n) in its
        # objective, so the penalty matching lambda is lambda / n
        lmbda_glmnet = lmbda / X.shape[0]
        fit = glmnet(x = X.copy(), y = y.copy(), family = 'gaussian', alpha = 0.0,
                     intr = False, standardize = False,
                     lambdau = np.array([lmbda_glmnet, 1.0]))
        b_glmnet = glmnetCoef(fit, s = np.float64([lmbda_glmnet]), exact = False)[1:].reshape(-1)
        y_glmnet = np.dot(X, b_glmnet)
        r2[i]['glmnet'].append(rsquare(y, y_glmnet))
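
Since the loop leaves each method's coefficients from the last simulated dataset in scope, a quick coefficient-level agreement check of the fixed-$\lambda$ solvers is possible (a minimal sketch; the tolerances are my choice, and the iterative solvers may need looser ones):

print(np.allclose(b_ridge_ols, b_sklearn, atol = 1e-4))
print(np.allclose(b_ridge_ols, b_sp_lsqr, atol = 1e-4))
print(np.allclose(b_ridge_ols, b_glmnet, atol = 1e-2))
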
fig = plt.figure(figsize = (16,6))
ax1 = fig.add_subplot(111)

colors = {'ridge_em': '#2D69C4',
          'ebmr': '#CC2529',
          'sklearn': '#93AA00',
          'sp_lsqr': '#535154',
          'glmnet': '#6B4C9A',
          'ridge_mle': '#FFB300'}

facecolors = {'ridge_em': '#719ad8',
              'ebmr': '#f2888b',
              'sklearn': '#c4d64f',
              'sp_lsqr': '#a6a3a7',
              'glmnet': '#a98fd2',
              'ridge_mle': '#fbd67e'}

barwidth = 0.1
nsigma = len(errsigmas)
xpos = [(k+1)*2 for k in range(nsigma)]

plot_methods = ['ridge_mle', 'sklearn', 'sp_lsqr', 'ridge_em', 'ebmr', 'glmnet']
bxplt = [None for x in plot_methods]

for i, method in enumerate(plot_methods):
#for i, method in enumerate(['ridge_ols', 'sp_lsqr', 'glmnet']):
    #pdata = [np.hstack(r2[k][method]) for k in range(nsigma)]
    pdata = [r2[k][method] for k in range(nsigma)]
    xloc = [x + (i * barwidth) + (i * barwidth / 3) for x in xpos]
    medianprops = dict(linewidth=2, color = colors[method])
    whiskerprops = dict(linewidth=2, color = facecolors[method])
    boxprops = dict(linewidth=2, color = colors[method], facecolor = facecolors[method])
    bxplt[i] = ax1.boxplot(pdata, positions = xloc, 
                           showfliers = False, showcaps = False, widths=barwidth,
                           patch_artist=True, notch = False,
                           boxprops = boxprops,
                           medianprops = medianprops, whiskerprops = whiskerprops,
                          )

leghandles = [x["boxes"][0] for x in bxplt]
ax1.legend(leghandles, plot_methods, loc='lower left', handlelength = 1.2, labelspacing = 0.2,)
ax1.set_xticks(xpos)
ax1.set_xticklabels([f'{x:.2f}' for x in errsigmas])
ax1.set_xlim(min(xpos) - 1, max(xpos) + 1)
ax1.set_xlabel(r'Noise $\sigma$')
ax1.set_ylabel(r'$R^2$')
ax1.set_title(r'n = 50, p = 100, fixed $\lambda = \sigma^2$ from the true simulation parameters')


#plt.savefig('compare_ridge_methods.png', bbox_inches='tight', facecolor='white', transparent=True)
plt.tight_layout()
plt.show()
def jitter(arr):
    # add small positive random jitter (10% of the data range)
    # so overlapping points remain distinguishable
    stdev = .1 * (max(arr) - min(arr))
    return arr + abs(np.random.randn(len(arr)) * stdev)

fig = plt.figure(figsize = (8, 8))
ax1 = fig.add_subplot(111)

nshow = 2

for i, method in enumerate(plot_methods):
    ydata = r2[nshow][method]
    if method in ('sklearn', 'sp_lsqr', 'glmnet'):
        ydata = jitter(ydata)
    ax1.scatter(r2[nshow]['ridge_mle'], ydata, color = colors[method])

    
ax1.set_title(rf'Comparison of $R^2$ ($\sigma$ = {errsigmas[nshow]:.2f})', pad = 20)
ax1.set_xlabel('ridge_mle')
ax1.set_ylabel('All ridge regression methods')
#ax1.set_xticks([0.03, 0.04, 0.05])
ax1.set_xlim([0.25, 0.45])
ax1.set_ylim([0, 1.05])
ax1.plot([0,1],[0,1], ls = 'dashed', color = 'gray')

#plt.savefig('compare_ridge_methods_scatter.png', bbox_inches='tight', facecolor='white', transparent=True)
plt.show()