Figures for RPPR

Author

Saikat Banerjee

Published

April 12, 2024

Abstract
Plots used in the RPPR, drawn using the NYGC color palette.

Setup

Code
import numpy as np
import pandas as pd
import pickle
import sys
import os
import dsc
from dsc.query_engine import Query_Processor as dscQP
from dsc import dsc_io

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pymir import mpl_stylesheet
from pymir import mpl_utils
Code
# NYGC Color Palette
# Maps a short color name to its hex code. Other cells look colors up by
# these names (for example when assigning one color per method).
nygc_colors = {
    'brown': '#7F0814',
    'darkred': '#d42e12',
    'orange': '#F37239',
    'darkyellow': '#F79320',
    'yellow': '#FFE438',
    'darkblue': '#003059',
    'blue': '#266DB6',
    'lightblue': '#A3D5ED',
    'darkgreen': '#006838',
    'green': '#0A8A42',
    'lightgreen': '#74B74A',
    'yellowgreen': '#BAD75F',
    'darkgray': '#1A1A1A',
    'gray': '#666666',
    'lightgray': '#CCCCCC',
    'khaki': '#ADA194',
    'darkkhaki': '#5E514D',
}

# Style sheet for NYGC poster
# The palette's dark gray is passed for both the `splinecolor` and the
# `black` arguments of the pymir presentation style.
mpl_stylesheet.banskt_presentation(dpi = 300, fontsize = 28, 
    splinecolor = nygc_colors['darkgray'], black = nygc_colors['darkgray'])

Simulation

Code
# Directory holding the output of the "blockdiag" experiment (absolute,
# cluster-specific path).
dsc_output = "/gpfs/commons/groups/knowles_lab/sbanerjee/low_rank_matrix_approximation_numerical_experiments/blockdiag"
# Last path component of the output directory; reused as the file stem below.
dsc_fname  = os.path.basename(os.path.normpath(dsc_output))
# NOTE(review): presumably the DSC database file; `db` is not read anywhere
# in the visible part of this notebook -- confirm whether it is still needed.
db = os.path.join(dsc_output, dsc_fname + ".db")
# Pickled table of results; the name is derived from the output directory.
dscoutpkl = os.path.join("/gpfs/commons/home/sbanerjee/work/npd/lrma-dsc/dsc/results", dsc_fname + "_dscout.pkl")
# NOTE: unpickling can execute arbitrary code; only load files you trust.
dscout    = pd.read_pickle(dscoutpkl)
# Bare expression so that the notebook displays the table as cell output.
dscout
DSC simulate simulate.n simulate.p simulate.k simulate.h2 simulate.h2_shared_frac simulate.aq lowrankfit matfactor score.L_rmse score.F_rmse score.Z_rmse score.L_psnr score.F_psnr score.Z_psnr score.adj_MI
0 1 blockdiag 200.0 2000.0 10.0 0.2 0.5 0.6 identical truncated_svd 0.244982 0.414399 0.004264 28.429593 26.081163 23.105261 0.018100
1 2 blockdiag 200.0 2000.0 10.0 0.2 0.5 0.6 identical truncated_svd 0.264152 0.417129 0.004318 28.688465 25.536584 23.284510 0.015638
2 3 blockdiag 200.0 2000.0 10.0 0.2 0.5 0.6 identical truncated_svd 0.252526 0.419340 0.004285 28.831802 25.198688 23.760390 0.654857
3 4 blockdiag 200.0 2000.0 10.0 0.2 0.5 0.6 identical truncated_svd 0.285154 0.409911 0.004718 28.881899 25.741441 24.626305 0.561981
4 5 blockdiag 200.0 2000.0 10.0 0.2 0.5 0.6 identical truncated_svd 0.292962 0.420819 0.004262 27.246965 25.527629 23.594094 0.024386
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
895 8 blockdiag_aq 200.0 2000.0 10.0 0.2 0.5 0.8 identical factorgo 0.597619 0.720961 0.002017 20.790216 21.536737 30.477673 0.010297
896 9 blockdiag_aq 200.0 2000.0 10.0 0.2 0.5 0.4 identical factorgo 0.201241 0.400942 0.002174 30.378105 26.809769 29.938074 -0.009727
897 9 blockdiag_aq 200.0 2000.0 10.0 0.2 0.5 0.8 identical factorgo 0.509286 0.669955 0.002038 21.469111 20.631187 28.405981 0.003175
898 10 blockdiag_aq 200.0 2000.0 10.0 0.2 0.5 0.4 identical factorgo 0.270847 0.429577 0.002189 28.086705 25.500348 30.518931 0.022548
899 10 blockdiag_aq 200.0 2000.0 10.0 0.2 0.5 0.8 identical factorgo 0.626669 0.756680 0.002087 19.320562 20.277492 30.701310 -0.004610

900 rows × 17 columns

Code
def stratify_dfcol(df, colname, value):
    """
    Return the rows of `df` whose entry in column `colname` equals `value`.
    """
    row_mask = df[colname] == value
    return df.loc[row_mask]

def stratify_dfcols(df, condition_list):
    """
    Apply several equality filters in sequence.

    `condition_list` is an iterable of (column name, value) pairs; each pair
    narrows the selection further via `stratify_dfcol`. With no conditions
    the input is returned as is.
    """
    selected = df
    for condition in condition_list:
        (column, target) = condition
        selected = stratify_dfcol(selected, column, target)
    return selected

def stratify_dfcols_in_list(df, colname, values):
    """
    Return the rows of `df` whose entry in column `colname` is one of `values`.
    """
    is_member = df[colname].isin(values)
    return df.loc[is_member]

# Each method is identified by a pair of values in the results table:
# [value of the 'lowrankfit' column, value of the 'matfactor' column].
methods = {
    "rpca" : ["rpca", "truncated_svd"],
    "nnm"  : ["nnm", "truncated_svd"],
    "nnm_sparse" : ["nnm_sparse", "truncated_svd"],
    "truncated_svd" : ["identical", "truncated_svd"],
    "factorgo" : ["identical", "factorgo"],
}
# Display name of each method (same keys as `methods`), used for the legend
# and tick labels.
method_labels = {
    "rpca" : "RobustPCA",
    "nnm" : "NNM",
    "nnm_sparse" : "NNM-Sparse",
    "truncated_svd": "tSVD",
    "factorgo": "FactorGO",
}

# Plot color of each method (same keys as `methods`), taken from the NYGC
# palette; the trailing comments give the resolved hex code.
method_colors = {
    "rpca" : nygc_colors['brown'], # NYGC brown (#7F0814)
    "nnm" : nygc_colors['darkred'], # NYGC dark red (#d42e12)
    "nnm_sparse" : nygc_colors['darkyellow'], # NYGC dark yellow (#F79320)
    "truncated_svd" : nygc_colors['lightblue'], # NYGC light blue (#A3D5ED)
    "factorgo" : nygc_colors['lightgreen'], # NYGC light green (#74B74A)
}

# Previous, non-NYGC colors (kept for reference only):
# method_colors = {
#     "rpca" : '#FF6800', # Vivid Orange
#     "nnm" : '#C10020', # Vivid Red
#     "nnm_sparse" : '#803E75', # Strong Purple
#     "truncated_svd" : '#535154', # gray
#     "factorgo" : '#A6BDD7', # Very Light Blue
# }

# Base parameters
# Reference value of each simulation parameter. When one parameter is
# varied, the rows are filtered so that all others equal these values.
# Keys are the suffixes of the 'simulate.<name>' columns.
simparams = {'p': 2000, 'k': 10, 'h2': 0.2, 'h2_shared_frac': 0.5, 'aq': 0.6}
# Scores that are plotted, keyed by the suffix of the 'score.<name>' column,
# with the text used as the y-axis label of the corresponding row.
# NOTE(review): the keys are named *_rmse while the labels show a Frobenius
# norm -- confirm that the labels describe the stored metric.
score_names = {
    'L_rmse': r"$\| L - \hat{L}\|_F$",
    'F_rmse': r"$\| F - \hat{F}\|_F$",
    'Z_rmse': r"$\| LF^{T} - \hat{L}\hat{F}^{T}\|_F$",
    'adj_MI': "Adjusted MI Score",
}

def get_simulation_with_variable(df, var_name, var_values):
    """
    Select the simulations in which only `var_name` is varied.

    Every parameter of the module-level `simparams` except `var_name` is
    pinned to its base value, and the column `simulate.<var_name>` is
    restricted to `var_values`.
    """
    pinned = []
    for param, base_value in simparams.items():
        if param == var_name:
            continue
        pinned.append((f'simulate.{param}', base_value))
    at_base = stratify_dfcols(df, pinned)
    return stratify_dfcols_in_list(at_base, f'simulate.{var_name}', var_values)

def get_scores_from_dataframe(df, score_name, variable_name, variable_values, 
        methods = methods):
    """
    Collect the values of one score, grouped by method and by the value of
    the varied simulation parameter.

    Returns a dict keyed like `methods`; each entry is a list with one NumPy
    array per element of `variable_values` (in that order), holding the
    `score.<score_name>` values of the matching rows.
    """
    simdf = get_simulation_with_variable(df, variable_name, variable_values)
    variable_col = f'simulate.{variable_name}'
    score_col = f'score.{score_name}'
    scores = {}
    for method, mlist in methods.items():
        # A method is the combination of a 'lowrankfit' and a 'matfactor' value.
        mrows = stratify_dfcols(simdf, [('lowrankfit', mlist[0]), ('matfactor', mlist[1])])
        scores[method] = [
            stratify_dfcol(mrows, variable_col, value)[score_col].to_numpy()
            for value in variable_values
        ]
    return scores

def random_jitter(xvals, yvals, d = 0.1):
    """
    Scatter points horizontally around each x position.

    For every (x, y) pair taken from `xvals` and `yvals`, len(y) standard
    normal deviates are drawn from NumPy's global random state, scaled by
    `d` and added to x. Returns a list with one array per pair.
    """
    xjitter = []
    for center, group in zip(xvals, yvals):
        noise = np.random.randn(len(group))
        xjitter.append(center + noise * d)
    return xjitter
Code
# Figure layout: row 0 is reserved for the legend (hence the smaller height
# ratio), rows 1-4 hold one score each, and each column shows one varied
# simulation parameter.
fig = plt.figure(figsize = (32, 32))
# Four parameters are plotted in columns 0..3, so the grid needs four
# columns; with only three, indexing column 3 raises an IndexError.
gs = GridSpec(nrows = 5, ncols = 4, figure=fig, height_ratios=[0.4, 1, 1, 1, 1])
# Per-column results of `boxplot_col`: the axes and the boxplot artists.
# Both containers must exist before they are filled by column index.
axcol = [None for i in range(4)]
boxs = [None for i in range(4)]

def boxplot_col(fig, grid, colidx, variable, variable_values, variable_label,
        methods = methods, score_names = score_names,
        dscout = dscout, method_colors = method_colors):
    """
    Draw one column of grouped boxplots for a single varied parameter.

    The column contains one axes per score. On each axes there is one group
    of boxes per value of `variable`, and one box per method inside every
    group, overlaid with the jittered individual data points.

    Parameters
    ----------
    fig : figure to which the axes are added.
    grid : grid specification indexed as grid[row, column]. Rows
        1..len(score_names) of column `colidx` are used; row 0 is skipped.
    colidx : column of `grid` to draw into.
    variable : name of the varied simulation parameter (suffix of the
        'simulate.<name>' column).
    variable_values : values of `variable` to show, one group each.
    variable_label : label written above the topmost axes of the column.
    methods, score_names, dscout, method_colors : default to the
        module-level objects of the same name (bound at definition time).

    Returns
    -------
    axs : list of the created axes, one per score.
    boxs : dict mapping each method key to the return value of the
        `boxplot` call on the last score's axes.
    """
    nmethods = len(methods)
    nvariables = len(variable_values)
    nscores = len(score_names)

    axs = [fig.add_subplot(grid[i + 1, colidx]) for i in range(nscores)]
    boxs = {x: None for x in methods.keys()}

    for i, score_name in enumerate(score_names.keys()):
        # Forward `methods` so that the scores are computed for the same
        # methods that are iterated below (not for the module default).
        scores = get_scores_from_dataframe(dscout, score_name, variable, variable_values,
                                           methods = methods)
        for j, mkey in enumerate(methods.keys()):
            boxcolor = method_colors[mkey]
            # Same color with an appended alpha channel (hex 80).
            boxface = f'#{boxcolor[1:]}80'
            # Zero line width hides the median line.
            medianprops = dict(linewidth=0, color = boxcolor)
            whiskerprops = dict(linewidth=2, color = boxcolor)
            boxprops = dict(linewidth=2, color = boxcolor, facecolor = boxface)
            flierprops = dict(marker='o', markerfacecolor=boxface, markersize=3, markeredgecolor = boxcolor)

            # Each group spans nmethods slots plus one empty separator slot;
            # method j sits at offset j inside its group.
            xpos = [x * (nmethods + 1) + j for x in range(nvariables)]
            boxs[mkey] = axs[i].boxplot(scores[mkey], positions = xpos,
                showcaps = False, showfliers = False,
                widths = 0.7, patch_artist = True, notch = False,
                flierprops = flierprops, boxprops = boxprops,
                medianprops = medianprops, whiskerprops = whiskerprops)

            # Overlay the individual points. The per-group arrays are
            # flattened explicitly so that groups of unequal size are
            # handled as well.
            xjitter = random_jitter(xpos, scores[mkey])
            if xjitter:
                axs[i].scatter(np.concatenate(xjitter), np.concatenate(scores[mkey]),
                               edgecolor = boxcolor, facecolor = boxface, linewidths = 1,
                               s = 10)

        # One tick per group, centered under its boxes.
        xcenter = [x * (nmethods + 1) + (nmethods - 1) / 2 for x in range(nvariables)]
        axs[i].set_xticks(xcenter)
        axs[i].set_xticklabels(variable_values)
        if i == 0:
            # Only the topmost axes carries the column title, placed above it.
            axs[i].set_xlabel(variable_label, labelpad = 30)
            axs[i].xaxis.set_label_position('top')
        # Leave a margin of (nvariables - 1) / 2 before the first and after
        # the last box position; the upper limit is the simplified form of
        # (nvariables - 1) * (nmethods + 1) + (nmethods - 1) + (nvariables - 1) / 2.
        xlim_low = 0 - (nvariables - 1) / 2
        xlim_high = (nmethods + 1.5) * nvariables - 2.5
        axs[i].set_xlim( xlim_low, xlim_high )

    plt.tight_layout()
    return axs, boxs

# One entry per column of the figure, in plotting order:
# (varied parameter, values to show, column title).
# For a quick test run, shorten the value lists.
column_specs = [
    ('p',  [500, 1000, 2000, 5000, 10000], r'No. of variants ($p$)'),
    ('k',  [2, 5, 10, 15, 20],             r'No. of factors ($k$)'),
    ('h2', [0.05, 0.1, 0.2, 0.3, 0.4],     r'Heritability ($h^2$)'),
    ('aq', [0.4, 0.6, 0.8],                r'Strength of correlation ($\alpha_q$)'),
]

for colidx, (variable, variable_values, variable_label) in enumerate(column_specs):
    axcol[colidx], boxs[colidx] = boxplot_col(fig, gs, colidx, variable, variable_values, variable_label)


# Legend
# Drawn in its own axes spanning the top grid row, with the box patches of
# column 0 as handles.
axleg = fig.add_subplot(gs[0, :])
handles = [boxs[0][mkey]["boxes"][0] for mkey in methods.keys()]
labels  = [method_labels[mkey] for mkey in methods.keys()]
axleg.legend(
    handles = handles,
    labels = labels,
    loc = 'upper left',
    frameon = False,
    handlelength = 4,
    ncol = 2,
)

# Only the legend itself should be visible: hide ticks, tick labels and frame.
axleg.tick_params(
    bottom = False, top = False, left = False, right = False,
    labelbottom = False, labeltop = False, labelleft = False, labelright = False,
)
for side, border in axleg.spines.items():
    border.set_visible(False)


# Score labels for each row
# A transparent axes is laid over every score row (all columns) solely to
# carry the y-axis label shared by that row.
for i, (score_name, score_label) in enumerate(score_names.items()):
    ax = fig.add_subplot(gs[i + 1, :])
    # Hide frame, ticks and tick labels of the overlay.
    for side, border in ax.spines.items():
        border.set_visible(False)
    ax.tick_params(
        bottom = False, top = False, left = False, right = False,
        labelbottom = False, labeltop = False, labelleft = False, labelright = False,
    )
    # No background, so the boxplots underneath stay visible.
    ax.patch.set_facecolor("None")
    ax.set_ylabel(score_label, labelpad = 120)

# Save as PNG and as PDF, then display the figure.
plt.savefig('../plots/rppr-2024/numerical_experiments.png', bbox_inches='tight')
plt.savefig('../plots/rppr-2024/numerical_experiments.pdf', bbox_inches='tight')
plt.show()

Code
def get_allsim_scores_from_dataframe(df, score_name,  methods = methods):
    """
    Collect one score for every method over all rows of `df`, without
    restricting the simulation parameters.

    Parameters
    ----------
    df : results table with the columns 'lowrankfit', 'matfactor' and
        'score.<score_name>'.
    score_name : suffix of the score column to extract.
    methods : dict mapping a method key to its
        ['lowrankfit' value, 'matfactor' value] pair.

    Returns
    -------
    dict mapping each method key to a NumPy array of its score values.
    """
    score_col = f'score.{score_name}'
    scores = {}
    for method, mlist in methods.items():
        # Filter the table that was passed in, not the module-level `dscout`.
        mrows = stratify_dfcols(df, [('lowrankfit', mlist[0]), ('matfactor', mlist[1])])
        scores[method] = mrows[score_col].to_numpy()
    return scores

# Adjusted MI score of every method over all rows of the results table;
# `scores` maps a method key to an array of values.
scores = get_allsim_scores_from_dataframe(dscout, 'adj_MI')
Code
# Single-panel figure: one box per method showing the values stored in the
# module-level `scores` dict, overlaid with the jittered individual points.
fig = plt.figure(figsize = (8, 8))
ax1 = fig.add_subplot(111)

nmethods = len(methods)
# Return value of the `boxplot` call for each method.
boxs = {x: None for x in methods.keys()}

for j, mkey in enumerate(methods.keys()):
    boxcolor = method_colors[mkey]
    # Same color with an appended alpha channel (hex 80).
    boxface = f'#{boxcolor[1:]}80'
    # Zero line width hides the median line.
    medianprops = dict(linewidth=0, color = boxcolor)
    whiskerprops = dict(linewidth=2, color = boxcolor)
    boxprops = dict(linewidth=2, color = boxcolor, facecolor = boxface)
    # Passed to boxplot below, where showfliers = False.
    flierprops = dict(marker='o', markerfacecolor=boxface, markersize=3, markeredgecolor = boxcolor)

    # Methods are placed at x = 1, 2, ..., in the order of `methods`.
    xpos = j + 1
    boxs[mkey] = ax1.boxplot([scores[mkey],], 
        positions = [xpos,],
        showcaps = False, showfliers = False,
        widths = 0.7, patch_artist = True, notch = False,
        flierprops = flierprops, boxprops = boxprops,
        medianprops = medianprops, whiskerprops = whiskerprops)

    # Single-element lists because random_jitter works on lists of groups.
    ax1.scatter(random_jitter([xpos,], [scores[mkey],]), [scores[mkey],], 
                   edgecolor = boxcolor, facecolor = boxface, linewidths = 1, 
                   s = 10)
# NOTE(review): no explicit set_xticks call; presumably this relies on the
# boxplot calls having placed one tick per box position -- confirm.
ax1.set_xticklabels([method_labels[m] for m in methods.keys()], rotation = 90)
ax1.set_ylabel("Adjusted MI Score")

# Save as PNG and as PDF, then display the figure.
plt.savefig('../plots/rppr-2024/allsim_adjusted_MI.png', bbox_inches='tight')
plt.savefig('../plots/rppr-2024/allsim_adjusted_MI.pdf', bbox_inches='tight')
plt.show()