Figures for RPPR

Author

Saikat Banerjee

Published

April 12, 2024

Abstract

Plots used in the RPPR, drawn with the NYGC color palette.

Setup

Code

import numpy as np
import pandas as pd
import pickle
import sys
import os
import dsc
from dsc.query_engine import Query_Processor as dscQP
from dsc import dsc_io

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pymir import mpl_stylesheet
from pymir import mpl_utils

Code

# NYGC Color Palette
# Maps a short color name to its hex code. Colors are looked up by name
# (e.g. nygc_colors['darkgray']) rather than hard-coding hex strings.
nygc_colors = {
    'brown': '#7F0814',
    'darkred': '#d42e12',
    'orange': '#F37239',
    'darkyellow': '#F79320',
    'yellow': '#FFE438',
    'darkblue': '#003059',
    'blue': '#266DB6',
    'lightblue': '#A3D5ED',
    'darkgreen': '#006838',
    'green': '#0A8A42',
    'lightgreen': '#74B74A',
    'yellowgreen': '#BAD75F',
    'darkgray': '#1A1A1A',
    'gray': '#666666',
    'lightgray': '#CCCCCC',
    'khaki': '#ADA194',
    'darkkhaki': '#5E514D',
}

# Style sheet for NYGC poster
# The palette's dark gray is passed as both the spline color and the
# style sheet's `black` argument.
mpl_stylesheet.banskt_presentation(dpi = 300, fontsize = 28, 
    splinecolor = nygc_colors['darkgray'], black = nygc_colors['darkgray'])

Simulation

Code

# Directory holding the benchmark output; its last path component
# ("blockdiag") is reused below as the base name of the result files.
dsc_output = "/gpfs/commons/groups/knowles_lab/sbanerjee/low_rank_matrix_approximation_numerical_experiments/blockdiag"
dsc_fname  = os.path.basename(os.path.normpath(dsc_output))
# <dsc_output>/<dsc_fname>.db -- presumably the DSC database file; it is not
# read anywhere in the code shown here.
db = os.path.join(dsc_output, dsc_fname + ".db")
# Pickled results table, <dsc_fname>_dscout.pkl, stored in a separate results directory.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
dscoutpkl = os.path.join("/gpfs/commons/home/sbanerjee/work/npd/lrma-dsc/dsc/results", dsc_fname + "_dscout.pkl")
dscout    = pd.read_pickle(dscoutpkl)
# Bare expression: shows the loaded table when this runs as a notebook cell.
dscout

	DSC	simulate	simulate.n	simulate.p	simulate.k	simulate.h2	simulate.h2_shared_frac	simulate.aq	lowrankfit	matfactor	score.L_rmse	score.F_rmse	score.Z_rmse	score.L_psnr	score.F_psnr	score.Z_psnr	score.adj_MI
0	1	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.244982	0.414399	0.004264	28.429593	26.081163	23.105261	0.018100
1	2	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.264152	0.417129	0.004318	28.688465	25.536584	23.284510	0.015638
2	3	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.252526	0.419340	0.004285	28.831802	25.198688	23.760390	0.654857
3	4	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.285154	0.409911	0.004718	28.881899	25.741441	24.626305	0.561981
4	5	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.292962	0.420819	0.004262	27.246965	25.527629	23.594094	0.024386
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
895	8	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.8	identical	factorgo	0.597619	0.720961	0.002017	20.790216	21.536737	30.477673	0.010297
896	9	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.4	identical	factorgo	0.201241	0.400942	0.002174	30.378105	26.809769	29.938074	-0.009727
897	9	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.8	identical	factorgo	0.509286	0.669955	0.002038	21.469111	20.631187	28.405981	0.003175
898	10	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.4	identical	factorgo	0.270847	0.429577	0.002189	28.086705	25.500348	30.518931	0.022548
899	10	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.8	identical	factorgo	0.626669	0.756680	0.002087	19.320562	20.277492	30.701310	-0.004610

900 rows × 17 columns

Code

def stratify_dfcol(df, colname, value):
    """Return the rows of `df` whose entry in column `colname` equals `value`."""
    row_mask = df[colname] == value
    return df.loc[row_mask]

def stratify_dfcols(df, condition_list):
    """
    Filter `df` by several equality conditions in turn.

    `condition_list` is an iterable of (column name, value) pairs; each pair
    narrows the result of the previous one, so the returned rows satisfy
    every condition. An empty list returns `df` unchanged.
    """
    subset = df
    for colname, value in condition_list:
        subset = stratify_dfcol(subset, colname, value)
    return subset

def stratify_dfcols_in_list(df, colname, values):
    """Return the rows of `df` whose entry in column `colname` is one of `values`."""
    keep = df[colname].isin(values)
    return df.loc[keep]

# Methods compared in the plots. Each key is a short method id; the value is
# the pair [lowrankfit, matfactor] used to select that method's rows from the
# results table (matched against the 'lowrankfit' and 'matfactor' columns).
# The insertion order of this dict sets the order of the boxes and legend entries.
methods = {
    "rpca" : ["rpca", "truncated_svd"],
    "nnm"  : ["nnm", "truncated_svd"],
    "nnm_sparse" : ["nnm_sparse", "truncated_svd"],
    "truncated_svd" : ["identical", "truncated_svd"],
    "factorgo" : ["identical", "factorgo"],
}
# Display name of each method (same keys as `methods`), used for legend and tick labels.
method_labels = {
    "rpca" : "RobustPCA",
    "nnm" : "NNM",
    "nnm_sparse" : "NNM-Sparse",
    "truncated_svd": "tSVD",
    "factorgo": "FactorGO",
}

# Plot color of each method (same keys as `methods`), taken from the NYGC palette.
method_colors = {
    "rpca" : nygc_colors['brown'], # #7F0814
    "nnm" : nygc_colors['darkred'], # #d42e12
    "nnm_sparse" : nygc_colors['darkyellow'], # #F79320
    "truncated_svd" : nygc_colors['lightblue'], # #A3D5ED
    "factorgo" : nygc_colors['lightgreen'], # #74B74A
}

# Alternative (non-NYGC) palette, kept for reference:
# method_colors = {
#     "rpca" : '#FF6800', # Vivid Orange
#     "nnm" : '#C10020', # Vivid Red
#     "nnm_sparse" : '#803E75', # Strong Purple
#     "truncated_svd" : '#535154', # gray
#     "factorgo" : '#A6BDD7', # Very Light Blue
# }

# Base parameters
# Baseline value of each simulation parameter. A key `x` corresponds to the
# results column 'simulate.x'. When one parameter is varied, all the others
# are held at these values. 'simulate.n' is not listed, so it is never filtered on.
simparams = {'p': 2000, 'k': 10, 'h2': 0.2, 'h2_shared_frac': 0.5, 'aq': 0.6}
# Scores to plot: key `s` corresponds to the results column 'score.s'; the
# value is the axis label. One row of panels is drawn per entry, in this order.
score_names = {
    'L_rmse': r"$\| L - \hat{L}\|_F$",
    'F_rmse': r"$\| F - \hat{F}\|_F$",
    'Z_rmse': r"$\| LF^{T} - \hat{L}\hat{F}^{T}\|_F$",
    'adj_MI': "Adjusted MI Score",
}

def get_simulation_with_variable(df, var_name, var_values):
    """
    Select the simulations in which only `var_name` departs from the baseline.

    Every parameter in `simparams` other than `var_name` is fixed at its
    baseline value; of those rows, only the ones whose 'simulate.<var_name>'
    entry is in `var_values` are returned.
    """
    fixed_conditions = []
    for param, base_value in simparams.items():
        if param == var_name:
            continue
        fixed_conditions.append((f'simulate.{param}', base_value))
    at_baseline = stratify_dfcols(df, fixed_conditions)
    return stratify_dfcols_in_list(at_baseline, f'simulate.{var_name}', var_values)

def get_scores_from_dataframe(df, score_name, variable_name, variable_values, 
        methods = methods):
    """
    Collect one score per method and per value of the varied parameter.

    Returns a dict keyed like `methods`. Each entry is a list with one numpy
    array per element of `variable_values` (in that order), holding the
    'score.<score_name>' values of the matching rows.
    """
    simdf = get_simulation_with_variable(df, variable_name, variable_values)
    score_col = f'score.{score_name}'
    variable_col = f'simulate.{variable_name}'
    scores = {}
    for method, mlist in methods.items():
        # mlist = [lowrankfit, matfactor] identifies the rows of this method.
        selection = [('lowrankfit', mlist[0]), ('matfactor', mlist[1])]
        method_rows = stratify_dfcols(simdf, selection)
        scores[method] = [
            stratify_dfcol(method_rows, variable_col, value)[score_col].to_numpy()
            for value in variable_values
        ]
    return scores

def random_jitter(xvals, yvals, d = 0.1):
    """
    Scatter x-coordinates around each group's center.

    For every (x, y) pair from `xvals` and `yvals`, draw len(y) standard
    normal numbers, scale them by `d` and add them to x. Returns a list with
    one array of jittered x-coordinates per group.
    """
    jittered = []
    for center, group in zip(xvals, yvals):
        noise = np.random.randn(len(group)) * d
        jittered.append(center + noise)
    return jittered

Code

# Figure layout: one grid column per varied simulation parameter
# (p, k, h2, aq). Row 0 is a short strip reserved for the legend and
# rows 1-4 hold one panel per score.
ncols = 4
fig = plt.figure(figsize = (32, 32))
# The grid needs as many columns as parameters are plotted; with fewer,
# indexing the last column raises an IndexError.
gs = GridSpec(nrows = 5, ncols = ncols, figure=fig, height_ratios=[0.4, 1, 1, 1, 1])
# Per-column results of boxplot_col(): the list of axes and the dict of
# box-plot artists. Both must exist before they are filled in by index.
axcol = [None for i in range(ncols)]
boxs = [None for i in range(ncols)]

def boxplot_col(fig, grid, colidx, variable, variable_values, variable_label,
        methods = methods, score_names = score_names,
        dscout = dscout, method_colors = method_colors):
    """
    Draw one column of grouped box plots, one panel per score.

    Within each panel, the boxes are grouped by the values of the varied
    simulation parameter; inside each group there is one box per method,
    overlaid with the jittered individual data points.

    Parameters
    ----------
    fig : matplotlib Figure that receives the new axes.
    grid : GridSpec indexed as grid[row, column]; the panels are placed in
        rows 1 .. len(score_names) of column `colidx` (row 0 is left free).
    colidx : column of `grid` to draw into.
    variable : name of the varied simulation parameter (without 'simulate.').
    variable_values : values of `variable` to show; also used as tick labels.
    variable_label : x-axis label, drawn above the top panel only.
    methods, score_names, dscout, method_colors : method selection, scores,
        results table and colors; they default to the module-level objects.

    Returns
    -------
    axs : list of the created axes, one per score.
    boxs : dict mapping each method key to the dict returned by
        Axes.boxplot for the last score panel.
    """
    nmethods = len(methods)
    nvariables = len(variable_values)

    axs = [fig.add_subplot(grid[i + 1, colidx]) for i in range(len(score_names))]
    boxs = {x: None for x in methods.keys()}

    # Each group of boxes occupies nmethods consecutive integer positions,
    # followed by one empty position as a gap; ticks sit at the group centers.
    xcenter = [x * (nmethods + 1) + (nmethods - 1) / 2 for x in range(nvariables)]
    # The left margin before the first box (position 0) is (nvariables - 1) / 2.
    # The upper limit is the last box position,
    # (nvariables - 1) * (nmethods + 1) + (nmethods - 1), plus the same margin.
    xlim_low = 0 - (nvariables - 1) / 2
    xlim_high = (nmethods + 1.5) * nvariables - 2.5

    for i, score_name in enumerate(score_names):
        # Forward `methods` so that a caller-supplied selection is honored
        # (otherwise the scores are collected for the default methods only).
        scores = get_scores_from_dataframe(dscout, score_name, variable, variable_values,
                                           methods = methods)
        for j, mkey in enumerate(methods.keys()):
            boxcolor = method_colors[mkey]
            # Same color with alpha 0x80 appended (#RRGGBBAA) for the fill.
            boxface = f'#{boxcolor[1:]}80'
            medianprops = dict(linewidth=0, color = boxcolor)
            whiskerprops = dict(linewidth=2, color = boxcolor)
            boxprops = dict(linewidth=2, color = boxcolor, facecolor = boxface)
            flierprops = dict(marker='o', markerfacecolor=boxface, markersize=3, markeredgecolor = boxcolor)

            xpos = [x * (nmethods + 1) + j for x in range(nvariables)]
            boxs[mkey] = axs[i].boxplot(scores[mkey], positions = xpos,
                showcaps = False, showfliers = False,
                widths = 0.7, patch_artist = True, notch = False,
                flierprops = flierprops, boxprops = boxprops,
                medianprops = medianprops, whiskerprops = whiskerprops)

            # Overlay the individual points. The per-group arrays are
            # flattened explicitly so that groups of different size work too.
            xjitter = random_jitter(xpos, scores[mkey])
            if xjitter:
                axs[i].scatter(np.concatenate(xjitter), np.concatenate(scores[mkey]),
                               edgecolor = boxcolor, facecolor = boxface, linewidths = 1,
                               s = 10)

        axs[i].set_xticks(xcenter)
        axs[i].set_xticklabels(variable_values)
        if i == 0:
            axs[i].set_xlabel(variable_label, labelpad = 30)
            axs[i].xaxis.set_label_position('top')
        axs[i].set_xlim( xlim_low, xlim_high )

    # Lay out the figure that was passed in, not whichever figure is current.
    fig.tight_layout()
    return axs, boxs

# One column of panels per varied simulation parameter:
# (parameter name, values to show, x-axis label).
# For a quick test run, shorten each value list to its first two entries.
column_specs = [
    ('p',  [500, 1000, 2000, 5000, 10000], r'No. of variants ($p$)'),
    ('k',  [2, 5, 10, 15, 20],             r'No. of factors ($k$)'),
    ('h2', [0.05, 0.1, 0.2, 0.3, 0.4],     r'Heritability ($h^2$)'),
    ('aq', [0.4, 0.6, 0.8],                r'Strength of correlation ($\alpha_q$)'),
]
for colidx, (variable, variable_values, variable_label) in enumerate(column_specs):
    axcol[colidx], boxs[colidx] = boxplot_col(fig, gs, colidx, variable, variable_values, variable_label)

# Keyword arguments that switch off all ticks and tick labels of an axes.
hide_ticks = dict(bottom = False, top = False, left = False, right = False,
                  labelbottom = False, labeltop = False, labelleft = False, labelright = False)

# Legend
# Drawn in the top row across all columns, using the box patches of the
# first column as handles.
axleg = fig.add_subplot(gs[0, :])
handles = []
labels = []
for mkey in methods:
    handles.append(boxs[0][mkey]["boxes"][0])
    labels.append(method_labels[mkey])
axleg.legend(handles = handles, labels = labels, loc = 'upper left', frameon = False, handlelength = 4, ncol = 2)

for border in axleg.spines.values():
    border.set_visible(False)
axleg.tick_params(**hide_ticks)

# Score labels for each row
# A transparent, frameless axes spanning the whole row carries the y-label
# shared by all panels of that row.
for i, score_label in enumerate(score_names.values()):
    ax = fig.add_subplot(gs[i + 1, :])
    ax.patch.set_facecolor("None")
    ax.set_ylabel(score_label, labelpad = 120)
    ax.tick_params(**hide_ticks)
    for border in ax.spines.values():
        border.set_visible(False)

for outfile in ('../plots/rppr-2024/numerical_experiments.png',
                '../plots/rppr-2024/numerical_experiments.pdf'):
    plt.savefig(outfile, bbox_inches='tight')
plt.show()

Code

def get_allsim_scores_from_dataframe(df, score_name,  methods = methods):
    """
    Collect one score per method over all simulations in `df`.

    Parameters
    ----------
    df : results table with 'lowrankfit', 'matfactor' and
        'score.<score_name>' columns.
    score_name : score to extract (without the 'score.' prefix).
    methods : dict mapping a method key to its [lowrankfit, matfactor] pair.

    Returns
    -------
    dict mapping each method key to a numpy array with the score of every
    row of `df` that belongs to that method.
    """
    scores = {key: None for key in methods.keys()}
    for method, mlist in methods.items():
        # Filter the table that was passed in, not the global `dscout`.
        mrows = stratify_dfcols(df, [('lowrankfit', mlist[0]), ('matfactor', mlist[1])])
        scores[method] = mrows[f'score.{score_name}'].to_numpy()
    return scores

# Adjusted MI score of every method, pooled over all simulation settings.
scores = get_allsim_scores_from_dataframe(dscout, 'adj_MI')

Code

# Single panel: one box per method showing the adjusted MI score pooled over
# all simulations, overlaid with the jittered individual points.
fig = plt.figure(figsize = (8, 8))
ax1 = fig.add_subplot(111)

nmethods = len(methods)
boxs = dict.fromkeys(methods)

# Boxes are placed at x = 1, 2, ... in the order of `methods`.
for xpos, mkey in enumerate(methods, start = 1):
    boxcolor = method_colors[mkey]
    # Same color with alpha 0x80 appended (#RRGGBBAA) for the fill.
    boxface = f'#{boxcolor[1:]}80'
    method_scores = [scores[mkey]]

    boxs[mkey] = ax1.boxplot(
        method_scores,
        positions = [xpos],
        showcaps = False, showfliers = False,
        widths = 0.7, patch_artist = True, notch = False,
        flierprops = {'marker': 'o', 'markerfacecolor': boxface,
                      'markersize': 3, 'markeredgecolor': boxcolor},
        boxprops = {'linewidth': 2, 'color': boxcolor, 'facecolor': boxface},
        medianprops = {'linewidth': 0, 'color': boxcolor},
        whiskerprops = {'linewidth': 2, 'color': boxcolor},
    )

    ax1.scatter(random_jitter([xpos], method_scores), method_scores,
                edgecolor = boxcolor, facecolor = boxface, linewidths = 1,
                s = 10)

tick_labels = [method_labels[mkey] for mkey in methods]
ax1.set_xticklabels(tick_labels, rotation = 90)
ax1.set_ylabel("Adjusted MI Score")

for outfile in ('../plots/rppr-2024/allsim_adjusted_MI.png',
                '../plots/rppr-2024/allsim_adjusted_MI.pdf'):
    plt.savefig(outfile, bbox_inches='tight')
plt.show()