def stratify_dfcol(df, colname, value):
#return pd_utils.select_dfrows(df, [f"$({colname}) == {value}"])
return df.loc[df[colname] == value]
def stratify_dfcols(df, condition_list):
for (colname, value) in condition_list:
df = stratify_dfcol(df, colname, value)
return df
def stratify_dfcols_in_list(df, colname, values):
return df.loc[df[colname].isin(values)]
methods = {
"rpca" : ["rpca", "truncated_svd"],
"nnm" : ["nnm", "truncated_svd"],
"nnm_sparse" : ["nnm_sparse", "truncated_svd"],
"truncated_svd" : ["identical", "truncated_svd"],
"factorgo" : ["identical", "factorgo"],
}
method_labels = {
"rpca" : "RobustPCA",
"nnm" : "NNM",
"nnm_sparse" : "NNM-Sparse",
"truncated_svd": "tSVD",
"factorgo": "FactorGO",
}
method_colors = {
"rpca" : nygc_colors['brown'], # Vivid Orange
"nnm" : nygc_colors['darkred'], # Vivid Red
"nnm_sparse" : nygc_colors['darkyellow'], # Strong Purple
"truncated_svd" : nygc_colors['lightblue'], # gray
"factorgo" : nygc_colors['lightgreen'], # Very Light Blue
}
# method_colors = {
# "rpca" : '#FF6800', # Vivid Orange
# "nnm" : '#C10020', # Vivid Red
# "nnm_sparse" : '#803E75', # Strong Purple
# "truncated_svd" : '#535154', # gray
# "factorgo" : '#A6BDD7', # Very Light Blue
# }
# Base parameters
simparams = {'p': 2000, 'k': 10, 'h2': 0.2, 'h2_shared_frac': 0.5, 'aq': 0.6}
score_names = {
'adj_MI': "Adjusted MI Score",
'L_rmse': r"$\| L - \hat{L}\|_F$",
'F_rmse': r"$\| F - \hat{F}\|_F$",
'Z_rmse': r"$\| LF^{T} - \hat{L}\hat{F}^{T}\|_F$",
}
def get_simulation_with_variable(df, var_name, var_values):
condition = [(f'simulate.{k}', v) for k, v in simparams.items() if k != var_name]
df1 = stratify_dfcols(df, condition)
df2 = stratify_dfcols_in_list(df1, f'simulate.{var_name}', var_values)
return df2
def get_scores_from_dataframe(df, score_name, variable_name, variable_values,
methods = methods):
simdf = get_simulation_with_variable(df, variable_name, variable_values)
scores = {key: list() for key in methods.keys()}
for method, mlist in methods.items():
mrows = stratify_dfcols(simdf, [('lowrankfit', mlist[0]), ('matfactor', mlist[1])])
for value in variable_values:
vrows = stratify_dfcol(mrows, f'simulate.{variable_name}', value)
scores[method].append(vrows[f'score.{score_name}'].to_numpy())
return scores
def random_jitter(xvals, yvals, d = 0.1):
xjitter = [x + np.random.randn(len(y)) * d for x, y in zip(xvals, yvals)]
return xjitter
def boxplot_scores(variable, variable_values,
methods = methods, score_names = score_names,
dscout = dscout, method_colors = method_colors,
custom_font = 'Futura Std', xlabel = "No. of variants"):
nmethods = len(methods)
nvariables = len(variable_values)
nscores = len(score_names)
figh = 8
figw = (nscores * figh) + (nscores - 1)
fig = plt.figure(figsize = (figw, figh))
axs = [fig.add_subplot(1, nscores, x+1) for x in range(nscores)]
boxs = {x: None for x in methods.keys()}
for i, (score_name, score_label) in enumerate(score_names.items()):
scores = get_scores_from_dataframe(dscout, score_name, variable, variable_values)
for j, mkey in enumerate(methods.keys()):
boxcolor = method_colors[mkey]
boxface = f'#{boxcolor[1:]}80'
medianprops = dict(linewidth=0, color = boxcolor)
whiskerprops = dict(linewidth=2, color = boxcolor)
boxprops = dict(linewidth=2, color = boxcolor, facecolor = boxface)
flierprops = dict(marker='o', markerfacecolor=boxface, markersize=3, markeredgecolor = boxcolor)
xpos = [x * (nmethods + 1) + j for x in range(nvariables)]
boxs[mkey] = axs[i].boxplot(scores[mkey], positions = xpos,
showcaps = False, showfliers = False,
widths = 0.7, patch_artist = True, notch = False,
flierprops = flierprops, boxprops = boxprops,
medianprops = medianprops, whiskerprops = whiskerprops)
axs[i].scatter(random_jitter(xpos, scores[mkey]), scores[mkey],
edgecolor = boxcolor, facecolor = boxface, linewidths = 1,
s = 10)
xcenter = [x * (nmethods + 1) + (nmethods - 1) / 2 for x in range(nvariables)]
axs[i].set_xticks([])
#axs[i].set_xticklabels(variable_values)
#axs[i].set_xlabel(xlabel)
axs[i].set_ylabel(score_label)
#xlim_low = 0 - (nvariables - 1) / 2
#xlim_high = (nvariables - 1) * (nmethods + 1) + (nmethods - 1) + (nvariables - 1) / 2
#xlim_high = (nmethods + 1.5) * nvariables - 2.5
axs[i].set_xlim( -1, 5 )
plt.tight_layout()
return axs, boxs
variable = 'p'
variable_values = [2000,]
axs, boxs = boxplot_scores(variable, variable_values)
plt.savefig('../plots/colormann-manuscript/numerical_experiments_noisy.pdf', bbox_inches='tight')
plt.show()