CloriNN manuscript figures - softImpute benchmark

Author

Saikat Banerjee

Published

February 16, 2026

Abstract

High-quality plots for the softImpute benchmark figures, drawn with the manuscript color palette.

Code

import numpy as np
import pandas as pd
import pickle
import sys
import os
import dsc
from dsc.query_engine import Query_Processor as dscQP
from dsc import dsc_io

import matplotlib
import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils

Code

# import matplotlib.font_manager as mpl_fm
# font_path = '/gpfs/commons/home/sbanerjee/nygc/Futura'
# mpl_fm.fontManager.addfont(font_path + '/FuturaStd-Book.otf') # Loads "Futura Std"

# mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 300)
# futura_book = FontProperties(fname='/gpfs/commons/home/sbanerjee/nygc/Futura/FuturaStd-Book.otf')

# Manuscript color palette: color name -> '#RRGGBB' hex string.
# boxplot_scores appends an alpha byte to these strings, so every entry
# must stay in 6-digit hex form.
manuscript_colors = {
    'brown': '#7F180D',
    'darkred': '#C10020',
    'darkyellow': '#FF6800',
    'blue': '#00538A',
    'green': '#0A8A42',
    'lightgreen': '#74B74A',    
    'yellowgreen': '#93AA00',
    'lightblue': '#A6BDD7',
    'purple': '#803E75',
    'olive': '#232C16',
    'khaki': '#CEA262',
    'darkgray': '#1A1A1A',
    'orange': '#F37239',
}

# Style sheet for manuscript: apply the pymir presentation style at 300 dpi
# with fontsize 22, substituting the palette's dark gray for both
# `splinecolor` and `black`.
# NOTE(review): presumably these control axis-spine and default text/line
# colors -- confirm against pymir.mpl_stylesheet.
mpl_stylesheet.banskt_presentation(dpi = 300, fontsize = 22, 
    splinecolor = manuscript_colors['darkgray'], black = manuscript_colors['darkgray'])
# plt.rcParams['font.family'] = 'Futura Std'

Code

# DSC output directory; its last path component ("mc_benchmark") is reused
# as the base name of the database and of the pickled results table.
dsc_output = "/gpfs/commons/groups/knowles_lab/sbanerjee/low_rank_matrix_approximation_numerical_experiments/mc_benchmark"
dsc_fname  = os.path.basename(os.path.normpath(dsc_output))
# Path of the DSC database file (not referenced again in the visible code).
db = os.path.join(dsc_output, dsc_fname + ".db")
# Pre-extracted results table, stored as a pickled DataFrame.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
dscoutpkl = os.path.join("/gpfs/commons/home/sbanerjee/work/npd/lrma-dsc/dsc/results", dsc_fname + "_dscout.pkl")
dscout    = pd.read_pickle(dscoutpkl)

# Move the per-method iteration count and run time into the "score."
# namespace so they can be looked up as f'score.{name}', the same way as
# score.test_rmse.
dscout = dscout.rename(columns={
    "mcmethods.n_iter": "score.n_iter",
    "mcmethods.time_sec": "score.time_sec"
})

# Derived score: seconds spent per iteration.
# NOTE(review): a row with n_iter == 0 would yield inf/NaN here -- confirm
# that every run reports at least one iteration.
dscout["score.time_per_iter"] = dscout["score.time_sec"] / dscout["score.n_iter"]

# Notebook cell output: display the resulting table.
dscout

	DSC	input_with_nan.n	input_with_nan.p	input_with_nan.k	input_with_nan.h2	input_with_nan.h2_shared_frac	input_with_nan.aq	input_with_nan.nsample_minmax	input_with_nan.missing_ratio	mcmethods	score.n_iter	score.time_sec	score.test_rmse	score.time_per_iter
0	1	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.05	frankwolfe	576	130.586818	1.755165	0.226713
1	1	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.10	frankwolfe	597	133.042829	1.752237	0.222852
2	1	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.20	frankwolfe	714	143.325775	1.779257	0.200736
3	1	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.40	frankwolfe	761	164.217400	1.743727	0.215792
4	2	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.05	frankwolfe	622	143.336019	1.721939	0.230444
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
75	9	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.40	softimpute	3	0.705388	1.740518	0.235129
76	10	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.05	softimpute	8	1.190660	1.717270	0.148833
77	10	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.10	softimpute	6	0.440634	1.829446	0.073439
78	10	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.20	softimpute	20	1.916495	1.802666	0.095825
79	10	200	2000	10	0.2	0.4	0.6	(10000,40000)	0.40	softimpute	3	0.714266	1.810311	0.238089

80 rows × 14 columns

Code

def stratify_dfcol(df, colname, value):
    """
    Return the rows of `df` whose column `colname` matches `value`.

    A `value` of None selects the rows where the column is null;
    any other value is compared with `==`.
    """
    column = df[colname]
    mask = column.isnull() if value is None else (column == value)
    return df.loc[mask]

def stratify_dfcols(df, condition_list):
    """
    Apply a sequence of (colname, value) filters to `df`, one after another.

    Each pair is passed to `stratify_dfcol`, so the result contains only the
    rows that satisfy every condition. An empty `condition_list` returns
    `df` unchanged.
    """
    filtered = df
    for condition in condition_list:
        colname, value = condition
        filtered = stratify_dfcol(filtered, colname, value)
    return filtered

def stratify_dfcols_in_list(df, colname, values):
    """Return the rows of `df` whose column `colname` is one of `values`."""
    keep = df[colname].isin(values)
    return df.loc[keep]


# Legend text for each method; the keys are the values matched against the
# 'mcmethods' column when scores are collected.
method_labels = {
    "frankwolfe" : "Clorinn (NNM)",
    "softimpute": "SoftImpute",
}

# Box / marker color for each method, taken from the manuscript palette.
method_colors = {
    "frankwolfe" : manuscript_colors['brown'],
    "softimpute" : manuscript_colors['blue'],
}

# Base parameters
# get_simulation_with_variable pins every 'input_with_nan.<key>' column to
# these values, except for the key that is being varied.
simparams = {'p': 2000, 'k': 10, 'h2': 0.2, 'h2_shared_frac': 0.4, 'aq': 0.6}
# Score column suffix (looked up as 'score.<key>') -> y-axis label.
# The dict order is the left-to-right order of the panels.
score_names = {
    'test_rmse': "Test RMSE",
    'time_per_iter': "Time (sec) per iteration",
    'n_iter': "No. of iterations"
}
# Text drawn above the top-left corner of each panel, in panel order.
panel_labels = ["a)", "b)", "c)"]

def get_simulation_with_variable(df, var_name, var_values):
    """
    Select the simulations in which only `var_name` is allowed to vary.

    Every parameter listed in `simparams`, other than `var_name`, is pinned
    to its base value in the matching 'input_with_nan.<param>' column. The
    remaining rows are then restricted to those whose
    'input_with_nan.<var_name>' value is one of `var_values`.
    """
    fixed_conditions = []
    for param, base_value in simparams.items():
        if param == var_name:
            continue
        fixed_conditions.append((f'input_with_nan.{param}', base_value))
    base_rows = stratify_dfcols(df, fixed_conditions)
    return stratify_dfcols_in_list(base_rows, f'input_with_nan.{var_name}', var_values)

def get_scores_from_dataframe(df, score_name, variable_name, variable_values, 
        methods = method_labels.keys()):
    """
    Collect one score per method and per value of the varied parameter.

    Returns a dict keyed by method. Each entry is a list with one numpy
    array per element of `variable_values` (in that order), holding the
    'score.<score_name>' values of the rows where 'mcmethods' equals the
    method and 'input_with_nan.<variable_name>' equals that value.
    """
    variable_col = f'input_with_nan.{variable_name}'
    score_col = f'score.{score_name}'

    simdf = get_simulation_with_variable(df, variable_name, variable_values)
    scores = {method: [] for method in methods}
    for method in methods:
        method_rows = stratify_dfcols(simdf, [('mcmethods', method)])
        scores[method].extend(
            stratify_dfcol(method_rows, variable_col, value)[score_col].to_numpy()
            for value in variable_values
        )
    return scores

def random_jitter(xvals, yvals, d = 0.1, rng = None):
    """
    Spread the points of each group horizontally around the group's x position.

    Parameters
    ----------
    xvals : sequence of numbers
        Center x position of each group.
    yvals : sequence of sized collections
        One collection of y values per group; only its length is used.
    d : float
        Standard deviation of the normal noise added to each x position.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of the noise. When None (default) the global NumPy random
        state is used, exactly as before; pass a seeded generator to get
        reproducible jitter without touching global state.

    Returns
    -------
    list of numpy.ndarray
        One array per (x, y) pair, of length len(y).
    """
    # Both callables take the number of samples and return standard normals.
    draw = np.random.randn if rng is None else rng.standard_normal
    return [x + draw(len(y)) * d for x, y in zip(xvals, yvals)]

def boxplot_scores(variable, variable_values, 
        methods = method_labels.keys(), score_names = score_names,
        dscout = dscout, method_colors = method_colors,
        panel_labels = None,
        custom_font = 'Futura Std', xlabel = "Fraction of missing values"):
    """
    Draw one panel per score, with grouped box plots of every method.

    Within a panel, each entry of `variable_values` is a group on the x-axis
    and each method gets one box in that group; the individual data points
    are overlaid with random horizontal jitter.

    Parameters
    ----------
    variable : str
        Name of the varied simulation parameter (column
        'input_with_nan.<variable>').
    variable_values : sequence
        Values of `variable` to show; also used as the x tick labels.
    methods : iterable of str
        Methods to plot; each must be a key of `method_colors`.
    score_names : dict
        Score column suffix -> y-axis label; one panel per entry.
    dscout : pandas.DataFrame
        Results table. The default is bound when the function is defined.
    method_colors : dict
        Method -> '#RRGGBB' color string.
    panel_labels : sequence of str, optional
        Label drawn above each panel; skipped when None.
    custom_font : str
        Currently unused; kept so existing callers keep working.
    xlabel : str
        Label of the x-axis of every panel.

    Returns
    -------
    axs : list of matplotlib Axes
        One per score, in `score_names` order.
    boxs : dict
        Method -> the dict returned by `Axes.boxplot` for that method in the
        LAST panel (earlier panels are overwritten).
    """
    # Materialize once so sizing, positioning and score lookup all use the
    # same method sequence.
    methods = list(methods)
    nmethods = len(methods)
    nvariables = len(variable_values)
    nscores = len(score_names)

    figh = 8
    figw = (nscores * figh) + (nscores - 1)
    fig = plt.figure(figsize = (figw, figh + 1))
    axs = [fig.add_subplot(1, nscores, x+1) for x in range(nscores)]
    boxs = {x: None for x in methods}

    # Every variable value owns (nmethods + 1) x-slots: one per method plus
    # one empty slot separating it from the next group.
    group_stride = nmethods + 1

    for i, (score_name, score_label) in enumerate(score_names.items()):
        # Forward `methods` so that a caller-supplied selection is honored;
        # without it the scores were always collected for the default
        # methods, and any other method raised KeyError below.
        scores = get_scores_from_dataframe(dscout, score_name, variable, variable_values,
                                           methods = methods)
        for j, mkey in enumerate(methods):
            boxcolor = method_colors[mkey]
            # Same color at ~50% opacity: append alpha byte 0x80 to '#RRGGBB'.
            boxface = f'#{boxcolor[1:]}80'
            # Zero-width median line: the median is deliberately not drawn.
            medianprops = dict(linewidth=0, color = boxcolor)
            whiskerprops = dict(linewidth=2, color = boxcolor)
            boxprops = dict(linewidth=2, color = boxcolor, facecolor = boxface)
            flierprops = dict(marker='o', markerfacecolor=boxface, markersize=3, markeredgecolor = boxcolor)

            xpos = [x * group_stride + j for x in range(nvariables)]
            # Fliers are hidden because every point is drawn by the scatter below.
            boxs[mkey] = axs[i].boxplot(scores[mkey], positions = xpos,
                showcaps = False, showfliers = False,
                widths = 0.7, patch_artist = True, notch = False,
                flierprops = flierprops, boxprops = boxprops,
                medianprops = medianprops, whiskerprops = whiskerprops)

            axs[i].scatter(random_jitter(xpos, scores[mkey]), scores[mkey], 
                           edgecolor = boxcolor, facecolor = boxface, linewidths = 1, 
                           s = 10)

        # One tick per group, centered over that group's boxes.
        xcenter = [x * group_stride + (nmethods - 1) / 2 for x in range(nvariables)]
        axs[i].set_xticks(xcenter)
        axs[i].set_xticklabels(variable_values)
        axs[i].set_xlabel(xlabel)
        axs[i].set_ylabel(score_label)
        # Leave a margin of (nvariables - 1) / 2 on both sides. The upper
        # limit equals the last box position,
        # (nvariables - 1) * (nmethods + 1) + (nmethods - 1), plus that margin.
        xlim_low = 0 - (nvariables - 1) / 2
        xlim_high = (nmethods + 1.5) * nvariables - 2.5
        axs[i].set_xlim( xlim_low, xlim_high )

        if panel_labels is not None:
            axs[i].text(0, 1.1, panel_labels[i], transform=axs[i].transAxes, 
                        fontweight='bold', fontsize = 28)

    plt.tight_layout()
    return axs, boxs

# Compare the methods across the fraction of missing entries; these are the
# values of the 'input_with_nan.missing_ratio' column.
variable = 'missing_ratio'
variable_values = [0.05, 0.1, 0.2, 0.4]

axs, boxs = boxplot_scores(variable, variable_values, panel_labels = panel_labels)

# Legend entries: the first box patch of each method serves as its handle.
# The legend is attached to the first panel only.
handles = [boxs[mkey]["boxes"][0] for mkey in method_labels.keys()]
labels = [method_labels[mkey] for mkey in method_labels.keys()]
axs[0].legend(handles = handles, labels = labels, 
           loc = 'upper left', frameon = False, handlelength = 2, ncol = 1)

# plt.tight_layout()
# plt.savefig('../plots/softimpute_benchmark_results.png', bbox_inches='tight')
# Write the figure as a PDF; the path is relative to the working directory.
# NOTE(review): presumably '../plots' must already exist -- confirm.
plt.savefig('../plots/softimpute_benchmark_results.pdf', bbox_inches='tight')
plt.show()