Comparison of different methods using numerical experiments

Author

Saikat Banerjee

Published

December 29, 2023

Abstract

We perform numerical experiments to compare the power and efficiency of different methods in detecting the hidden genetics factors contributing to disease pleiotropy.

Getting Setup

Code

import numpy as np
import pandas as pd
import pickle
import sys
import os
import dsc
from dsc.query_engine import Query_Processor as dscQP
from dsc import dsc_io

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120)

Code

def stratify_dfcol(df, colname, value):
    #return pd_utils.select_dfrows(df, [f"$({colname}) == {value}"])
    return df.loc[df[colname] == value]

def stratify_dfcols(df, condition_list):
    for (colname, value) in condition_list:
        df = stratify_dfcol(df, colname, value)
    return df

def stratify_dfcols_in_list(df, colname, values):
    return df.loc[df[colname].isin(values)]

Read Simulation Results

Code

dsc_output = "/gpfs/commons/groups/knowles_lab/sbanerjee/low_rank_matrix_approximation_numerical_experiments/blockdiag"
dsc_fname  = os.path.basename(os.path.normpath(dsc_output))
db = os.path.join(dsc_output, dsc_fname + ".db")
dscoutpkl = os.path.join("/gpfs/commons/home/sbanerjee/work/npd/lrma-dsc/dsc/results", dsc_fname + "_dscout.pkl")
dscout    = pd.read_pickle(dscoutpkl)
dscout

	DSC	simulate	simulate.n	simulate.p	simulate.k	simulate.h2	simulate.h2_shared_frac	simulate.aq	lowrankfit	matfactor	score.L_rmse	score.F_rmse	score.Z_rmse	score.L_psnr	score.F_psnr	score.Z_psnr	score.adj_MI
0	1	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.244982	0.485097	0.005098	28.429593	24.712971	21.552452	0.018100
1	2	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.264152	0.483567	0.005141	28.688465	24.252872	21.767697	0.015638
2	3	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.252526	0.499034	0.005091	28.831802	23.687408	22.263407	0.654857
3	4	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.285154	0.484862	0.005550	28.881899	24.282865	23.215231	0.561981
4	5	blockdiag	200.0	2000.0	10.0	0.2	0.5	0.6	identical	truncated_svd	0.292962	0.474069	0.005080	27.246965	24.492696	22.070442	0.024386
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
895	8	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.8	identical	factorgo	0.597619	0.720961	0.002017	20.790216	21.536737	30.477673	0.010297
896	9	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.4	identical	factorgo	0.201241	0.400942	0.002174	30.378105	26.809769	29.938074	-0.009727
897	9	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.8	identical	factorgo	0.509286	0.669955	0.002038	21.469111	20.631187	28.405981	0.003175
898	10	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.4	identical	factorgo	0.270847	0.429577	0.002189	28.086705	25.500348	30.518931	0.022548
899	10	blockdiag_aq	200.0	2000.0	10.0	0.2	0.5	0.8	identical	factorgo	0.626669	0.756680	0.002087	19.320562	20.277492	30.701310	-0.004610

900 rows × 17 columns

Compare Methods

We use several metrics to compare the different methods.

methods = {
    "rpca" : ["rpca", "truncated_svd"],
    "nnm"  : ["nnm", "truncated_svd"],
    "nnm_sparse" : ["nnm_sparse", "truncated_svd"],
    "truncated_svd" : ["identical", "truncated_svd"],
    "factorgo" : ["identical", "factorgo"],
}
method_labels = {
    "rpca" : "RPCA-IALM",
    "nnm" : "NN-FW",
    "nnm_sparse" : "NN-Sparse-FW",
    "truncated_svd": "tSVD",
    "factorgo": "FactorGO",
}

method_colors = {
    "rpca" : '#FF6800', # Vivid Orange
    "nnm" : '#C10020', # Vivid Red
    "nnm_sparse" : '#803E75', # Strong Purple
    "truncated_svd" : '#535154', # gray
    "factorgo" : '#A6BDD7', # Very Light Blue
}

# Base parameters
simparams = {'p': 2000, 'k': 10, 'h2': 0.2, 'h2_shared_frac': 0.5, 'aq': 0.6}
score_names = {
    'L_rmse': r"$\| L - \hat{L}\|_F$",
    'F_rmse': r"$\| F - \hat{F}\|_F$",
    'Z_rmse': r"$\| LF^{T} - \hat{L}\hat{F}^{T}\|_F$",
    'adj_MI': "Adjusted Mutual Information Score",
}

# score_names = {
#     'L_psnr': r"$\| L - \hat{L}\|_F$",
#     'F_psnr': r"$\| F - \hat{F}\|_F$",
#     'Z_psnr': r"$\| LF^{T} - \hat{L}\hat{F}^{T}\|_F$",
#     'adj_MI': "Adjusted Mutual Information Score",
# }

Code

def get_simulation_with_variable(df, var_name, var_values):
    condition = [(f'simulate.{k}', v) for k, v in simparams.items() if k != var_name]
    df1 = stratify_dfcols(df, condition)
    df2 = stratify_dfcols_in_list(df1, f'simulate.{var_name}', var_values)
    return df2

def get_scores_from_dataframe(df, score_name, variable_name, variable_values, 
        methods = methods):
    simdf = get_simulation_with_variable(df, variable_name, variable_values)
    scores = {key: list() for key in methods.keys()}
    for method, mlist in methods.items():
        mrows = stratify_dfcols(simdf, [('lowrankfit', mlist[0]), ('matfactor', mlist[1])])
        for value in variable_values:
            vrows = stratify_dfcol(mrows, f'simulate.{variable_name}', value)
            scores[method].append(vrows[f'score.{score_name}'].to_numpy())
    return scores

Code

def random_jitter(xvals, yvals, d = 0.1):
    xjitter = [x + np.random.randn(len(y)) * d for x, y in zip(xvals, yvals)]
    return xjitter

def boxplot_scores(variable, variable_values, 
        methods = methods, score_names = score_names,
        dscout = dscout, method_colors = method_colors):
    
    nmethods = len(methods)
    nvariables = len(variable_values)
    nscores = len(score_names)
    
    figh = 6
    figw = (nscores * figh) + (nscores - 1)
    fig = plt.figure(figsize = (figw, figh))
    axs = [fig.add_subplot(1, nscores, x+1) for x in range(nscores)]
    boxs = {x: None for x in methods.keys()}
    
    for i, (score_name, score_label) in enumerate(score_names.items()):
        scores = get_scores_from_dataframe(dscout, score_name, variable, variable_values)
        for j, mkey in enumerate(methods.keys()):
            boxcolor = method_colors[mkey]
            boxface = f'#{boxcolor[1:]}80'
            medianprops = dict(linewidth=0, color = boxcolor)
            whiskerprops = dict(linewidth=2, color = boxcolor)
            boxprops = dict(linewidth=2, color = boxcolor, facecolor = boxface)
            flierprops = dict(marker='o', markerfacecolor=boxface, markersize=3, markeredgecolor = boxcolor)

            xpos = [x * (nmethods + 1) + j for x in range(nvariables)]
            boxs[mkey] = axs[i].boxplot(scores[mkey], positions = xpos,
                showcaps = False, showfliers = False,
                widths = 0.7, patch_artist = True, notch = False,
                flierprops = flierprops, boxprops = boxprops,
                medianprops = medianprops, whiskerprops = whiskerprops)
            
            axs[i].scatter(random_jitter(xpos, scores[mkey]), scores[mkey], 
                           edgecolor = boxcolor, facecolor = boxface, linewidths = 1, 
                           s = 10)

        xcenter = [x * (nmethods + 1) + (nmethods - 1) / 2 for x in range(nvariables)]
        axs[i].set_xticks(xcenter)
        axs[i].set_xticklabels(variable_values)
        axs[i].set_xlabel(variable)
        axs[i].set_ylabel(score_label)
        xlim_low = 0 - (nvariables - 1) / 2
        #xlim_high = (nvariables - 1) * (nmethods + 1) + (nmethods - 1) + (nvariables - 1) / 2
        xlim_high = (nmethods + 1.5) * nvariables - 2.5
        axs[i].set_xlim( xlim_low, xlim_high )

    plt.tight_layout()
    return axs, boxs

variable = 'p'
variable_values = [500, 1000, 2000, 5000, 10000]

axs, boxs = boxplot_scores(variable, variable_values)
plt.show()

Code

fig = plt.figure(figsize = (6, 2))
ax1 = fig.add_subplot(111)
handles = [boxs[mkey]["boxes"][0] for mkey in methods.keys()]
labels = [method_labels[mkey] for mkey in methods.keys()]
ax1.legend(handles = handles, labels = labels, loc = 'upper left', frameon = False, handlelength = 4, ncol = 2)

for side, border in ax1.spines.items():
    border.set_visible(False)
ax1.tick_params(bottom = False, top = False, left = False, right = False,
                   labelbottom = False, labeltop = False, labelleft = False, labelright = False)


plt.show()

Code

variable = 'k'
variable_values = [2,5,10,15,20]
boxplot_scores(variable, variable_values)
plt.show()

Code

variable = 'h2'
variable_values = [0.05, 0.1, 0.2, 0.3, 0.4]
boxplot_scores(variable, variable_values)
plt.show()

Code

variable = 'h2_shared_frac'
variable_values = [0.2, 0.5, 0.8, 1.0]
boxplot_scores(variable, variable_values)
plt.show()

Code

variable = 'aq'
variable_values = [0.4, 0.6, 0.8]
boxplot_scores(variable, variable_values)
plt.show()

PSNR

Instead of the Frobenius norm, we can also use the Peak Signal-to-Noise Ratio (PSNR) for comparing the methods.

Code

psnr_score_names = {
    'L_psnr': r"PSNR ($L$)",
    'F_psnr': r"PSNR ($F$)",
    'Z_psnr': r"PSNR ($LF^T$)",
}

variable = 'p'
variable_values = [500, 1000, 2000, 5000, 10000]

axs, boxs = boxplot_scores(variable, variable_values, score_names = psnr_score_names)
plt.show()