About
Here, I check whether the simulation benchmark behaves sensibly on simple examples. The eventual goal is to run large-scale simulations through automated pipelines; before that, I want to sanity-check each method on a small simulated dataset.
Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymir import mpl_stylesheet
from pymir import mpl_utils
mpl_stylesheet.banskt_presentation(splinecolor='black', dpi=120, colors='kelly')

import sys
sys.path.append("../utils/")
import histogram as mpy_histogram
import simulate as mpy_simulate
import plot_functions as mpy_plotfn

from nnwmf.optimize import IALM
from nnwmf.optimize import FrankWolfe_CV
from nnwmf.optimize import FrankWolfe

ntrait = 4    # number of trait categories (classes)
ngwas  = 500  # number of GWAS samples (N)
nsnp   = 1000 # number of SNPs (P)
nfctr  = 40   # number of latent factors (K)
Code
# sample_dict maps each trait class to the indices of its samples.
sample_dict = mpy_simulate.get_sample_indices(ntrait, ngwas, shuffle=False)
sample_indices = [x for _, x in sample_dict.items()]
unique_labels = [k for k, _ in sample_dict.items()]

# Invert the mapping to obtain a class label for every sample.
class_labels = [None for _ in range(ngwas)]
for k, x in sample_dict.items():
    for i in x:
        class_labels[i] = k
Code
Y, Y_true, L, F, mean, noise_var = mpy_simulate.simulate(
    ngwas, nsnp, ntrait, nfctr, sample_groups=sample_indices, std=1.0)
Y_cent = mpy_simulate.do_standardize(Y, scale=False)  # center only, do not scale

fig = plt.figure(figsize=(8, 8))
ax1 = fig.add_subplot(111)
mpy_plotfn.plot_covariance_heatmap(ax1, L)
plt.tight_layout()
plt.show()
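Assuming the simulator follows the usual low-rank factor model (my reading of the returned values; see simulate.py for the exact implementation), the data are generated as

$$ Y = LF + \epsilon, \qquad L \in \mathbb{R}^{N \times K}, \quad F \in \mathbb{R}^{K \times P}, \quad \epsilon_{ij} \sim \mathcal{N}(0, \sigma_j^2), $$

where Y_true = LF is the noiseless signal and noise_var holds the per-SNP noise variances. A mean offset is also returned, but centering with do_standardize removes it.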
True components
Code
axmain, axs = mpy_plotfn.plot_principal_components(L, class_labels, unique_labels)
plt.show()
Truncated SVD
Code
def truncated_SVD(X, thres=0.9):
    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    # Keep the smallest k at which the cumulative fraction of singular values reaches the threshold.
    k = np.where(np.cumsum(S / np.sum(S)) >= thres)[0][0]
    pcomps = U[:, :k] @ np.diag(S[:k])
    return U, S, Vt, pcomps

_, S_tsvd, _, pcomps_tsvd = truncated_SVD(Y_cent)
axmain, axs = mpy_plotfn.plot_principal_components(pcomps_tsvd, class_labels, unique_labels)
plt.show()
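Note that truncated_SVD thresholds the cumulative fraction of the singular values themselves, whereas the scree plot below uses their squares, which is the usual explained-variance convention. A variant matching the scree plot would look like this (a sketch; truncated_SVD_var is not used elsewhere):

Code
def truncated_SVD_var(X, thres=0.9):
    # Same as truncated_SVD, but k is chosen by cumulative explained variance.
    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    S2 = np.square(S)
    k = np.where(np.cumsum(S2 / np.sum(S2)) >= thres)[0][0]
    return U, S, Vt, U[:, :k] @ np.diag(S[:k])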
Code
# Scree plot: cumulative explained variance of the centered data.
S2 = np.square(S_tsvd)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(np.arange(S2.shape[0]), np.cumsum(S2 / np.sum(S2)), 'o-')
plt.show()
Nuclear Norm Minimization using Frank-Wolfe algorithm
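The Frank-Wolfe solver minimizes the squared reconstruction error subject to a nuclear norm constraint (my reading of the nnwmf interface, where the radius $r$ is the second argument to fit):

$$ \min_{X} \; \lVert Y - X \rVert_F^2 \quad \text{subject to} \quad \lVert X \rVert_* \le r. $$

The cross-validation wrapper scans a path of radii and records the held-out error in each fold.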
Code
nnmcv = FrankWolfe_CV(chain_init=True, reverse_path=False, kfolds=2)
nnmcv.fit(Y_cent)
Code
# Test error in each fold, against the nuclear norm constraint (log scale).
fig = plt.figure()
ax1 = fig.add_subplot(111)
for k in range(2):
    #ax1.plot(np.log10(list(nnmcv.training_error.keys())), [x[k] for x in nnmcv.training_error.values()], 'o-')
    ax1.plot(np.log10(list(nnmcv.test_error.keys())), [x[k] for x in nnmcv.test_error.values()], 'o-')
mpl_utils.set_xticks(ax1, scale='log10', spacing='log2')
plt.show()
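The constraint radius for the final fit, r_opt = 32.0 below, is presumably read off this curve. Assuming test_error maps each candidate radius to its per-fold errors, as the plotting code treats it, the choice could also be automated:

Code
# Pick the radius with the lowest mean test error across folds (hypothetical helper).
mean_test_error = {r: np.mean(errs) for r, errs in nnmcv.test_error.items()}
r_opt = min(mean_test_error, key=mean_test_error.get)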
Code
r_opt = 32.0
nnm = FrankWolfe(show_progress=True, svd_max_iter=50, debug=True, suppress_warnings=True)
nnm.fit(Y_cent, r_opt)

# Center the recovered matrix and compute its principal components.
Y_nnm_cent = mpy_simulate.do_standardize(nnm.X, scale=False)
U_nnm, S_nnm, Vt_nnm = np.linalg.svd(Y_nnm_cent, full_matrices=False)
pcomps_nnm = U_nnm @ np.diag(S_nnm)
2023-07-28 11:16:31,147 | nnwmf.optimize.frankwolfe | INFO | Iteration 0. Step size 1.000. Duality Gap 2742.83
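The logged duality gap is the standard Frank-Wolfe convergence certificate. With $f(X) = \lVert Y - X \rVert_F^2$ and feasible set $\mathcal{C} = \{X : \lVert X \rVert_* \le r\}$, the gap at iterate $X_t$ is

$$ g(X_t) = \langle \nabla f(X_t),\, X_t - S_t \rangle, \qquad S_t = \arg\min_{S \in \mathcal{C}} \langle \nabla f(X_t),\, S \rangle = -r\, u_1 v_1^\top, $$

where $u_1, v_1$ are the top singular vectors of $\nabla f(X_t)$. The gap upper-bounds the suboptimality $f(X_t) - f(X^\star)$, so a small gap certifies convergence.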
Code
axmain, axs = mpy_plotfn.plot_principal_components(pcomps_nnm, class_labels, unique_labels)
plt.show()
Weighted Nuclear Norm Minimization
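The weighted variant minimizes an elementwise weighted residual (my assumption from the weight argument below), with each SNP weighted by the inverse of its noise standard deviation so that noisier SNPs contribute less:

$$ \min_{X} \; \lVert W \circ (Y - X) \rVert_F^2 \quad \text{subject to} \quad \lVert X \rVert_* \le r, \qquad W_{ij} = 1 / \sigma_j. $$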
Code
# Weight each SNP by its inverse noise standard deviation, replicated
# across all GWAS rows to match the shape of Y (ngwas x nsnp).
snp_weights = 1 / np.sqrt(noise_var)
weight = np.column_stack([snp_weights for _ in range(ngwas)]).T
wnnmcv = FrankWolfe_CV(chain_init=True, reverse_path=False, kfolds=5, debug=True)
wnnmcv.fit(Y_cent, weight=weight)
2023-07-28 11:16:55,580 | nnwmf.optimize.frankwolfe_cv | DEBUG | Cross-validation over 15 ranks.
2023-07-28 11:16:55,604 | nnwmf.optimize.frankwolfe_cv | DEBUG | Fold 1 ...
2023-07-28 11:17:13,426 | nnwmf.optimize.frankwolfe_cv | DEBUG | Fold 2 ...
2023-07-28 11:17:35,075 | nnwmf.optimize.frankwolfe_cv | DEBUG | Fold 3 ...
2023-07-28 11:17:57,035 | nnwmf.optimize.frankwolfe_cv | DEBUG | Fold 4 ...
2023-07-28 11:18:16,387 | nnwmf.optimize.frankwolfe_cv | DEBUG | Fold 5 ...
Code
fig = plt.figure()
ax1 = fig.add_subplot(111)
for k in range(5):
    #ax1.plot(np.log10(list(wnnmcv.training_error.keys())), [x[k] for x in wnnmcv.training_error.values()], 'o-')
    ax1.plot(np.log10(list(wnnmcv.test_error.keys())), [x[k] for x in wnnmcv.test_error.values()], 'o-')
mpl_utils.set_xticks(ax1, scale='log10', spacing='log2')
plt.show()
Code
r_opt = 32.0
wnnm = FrankWolfe(show_progress=True, svd_max_iter=50, debug=True)
wnnm.fit(Y_cent, r_opt, weight=weight)

Y_wnnm_cent = mpy_simulate.do_standardize(wnnm.X, scale=False)
U_wnnm, S_wnnm, Vt_wnnm = np.linalg.svd(Y_wnnm_cent, full_matrices=False)
pcomps_wnnm = U_wnnm @ np.diag(S_wnnm)
2023-07-28 11:18:52,909 | nnwmf.optimize.frankwolfe | INFO | Iteration 0. Step size 0.154. Duality Gap 7.2392e+06
2023-07-28 11:18:55,519 | nnwmf.optimize.frankwolfe | INFO | Iteration 100. Step size 0.003. Duality Gap 100806
2023-07-28 11:18:58,015 | nnwmf.optimize.frankwolfe | INFO | Iteration 200. Step size 0.003. Duality Gap 42452.7
2023-07-28 11:19:00,500 | nnwmf.optimize.frankwolfe | INFO | Iteration 300. Step size 0.002. Duality Gap 31460.9
2023-07-28 11:19:02,997 | nnwmf.optimize.frankwolfe | INFO | Iteration 400. Step size 0.005. Duality Gap 20861.9
2023-07-28 11:19:05,500 | nnwmf.optimize.frankwolfe | INFO | Iteration 500. Step size 0.004. Duality Gap 16232.8
Code
axmain, axs = mpy_plotfn.plot_principal_components(pcomps_wnnm, class_labels, unique_labels)
plt.show()
Robust PCA
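IALM is the inexact augmented Lagrange multiplier method (Lin et al., 2010) for robust PCA, which decomposes the data into a low-rank part and a sparse part:

$$ \min_{L,\, S} \; \lVert L \rVert_* + \lambda \lVert S \rVert_1 \quad \text{subject to} \quad Y = L + S. $$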
Code
rpca = IALM()
rpca.fit(Y_cent)
np.linalg.matrix_rank(rpca.L_)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 1
----> 1 rpca = IALM()
      2 rpca.fit(Y_cent)
      3 np.linalg.matrix_rank(rpca.L_)

File ~/work/nnwmf/src/nnwmf/optimize/inexact_alm.py:85, in IALM.__init__(self, rho, tau, mu_update_method, max_iter, primal_tol, dual_tol)
     82 assert rho > 1
     83 assert tau > 1
---> 85 self.lmb_ = lmb
     86 self.rho_ = rho
     87 self.max_iter_ = max_iter

NameError: name 'lmb' is not defined
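This is a bug in the library itself: IALM.__init__ assigns self.lmb_ = lmb, but lmb is neither a constructor argument nor defined locally (the standard robust PCA default would be λ = 1/√max(n, p), following Candès et al., 2011). The cells below assume the fit succeeds after patching the library; the low-rank component is then exposed as rpca.L_, as used in the cell above, so the L_rpca referenced next would be bound as:

Code
# Bind the low-rank component recovered by IALM under the name used below.
L_rpca = rpca.L_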
Code
Y_rpca_cent = mpy_simulate.do_standardize(L_rpca, scale=False)
U_rpca, S_rpca, Vt_rpca = np.linalg.svd(Y_rpca_cent, full_matrices=False)
pcomps_rpca = U_rpca @ np.diag(S_rpca)
axmain, axs = mpy_plotfn.plot_principal_components(pcomps_rpca, class_labels, unique_labels)
plt.show()
Code
def get_rmse(original, recovered, mask=None):
    if mask is None:
        mask = np.ones(original.shape)
    n = np.sum(mask)
    mse = np.sum(np.square((original - recovered) * mask)) / n
    return np.sqrt(mse)

Y_true_cent = mpy_simulate.do_standardize(Y_true, scale=False)
Y_rpca_cent = mpy_simulate.do_standardize(L_rpca, scale=False)

rmse_nnm  = get_rmse(Y_true_cent, Y_nnm_cent)
rmse_wnnm = get_rmse(Y_true_cent, Y_wnnm_cent)
rmse_rpca = get_rmse(Y_true_cent, Y_rpca_cent)

print(f"{rmse_nnm:.4f}\tNuclear Norm Minimization")
print(f"{rmse_wnnm:.4f}\tWeighted Nuclear Norm Minimization")
print(f"{rmse_rpca:.4f}\tRobust PCA")
0.1448 Nuclear Norm Minimization
0.1427 Weighted Nuclear Norm Minimization
0.1054 Robust PCA
Code
def get_psnr(original, recovered):
    # Peak signal-to-noise ratio, using the dynamic range of the original signal.
    n, p = original.shape
    maxsig2 = np.square(np.max(original) - np.min(original))
    mse = np.sum(np.square(recovered - original)) / (n * p)
    return 10 * np.log10(maxsig2 / mse)

psnr_nnm  = get_psnr(Y_true_cent, Y_nnm_cent)
psnr_wnnm = get_psnr(Y_true_cent, Y_wnnm_cent)
psnr_rpca = get_psnr(Y_true_cent, Y_rpca_cent)

print(f"{psnr_nnm:.4f}\tNuclear Norm Minimization")
print(f"{psnr_wnnm:.4f}\tWeighted Nuclear Norm Minimization")
print(f"{psnr_rpca:.4f}\tRobust PCA")
21.2372 Nuclear Norm Minimization
21.3642 Weighted Nuclear Norm Minimization
23.9949 Robust PCA
Code
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics as skmetrics

def get_adjusted_MI_score(pcomp, class_labels):
    # Cluster the samples by their principal components and compare with the true labels.
    distance_matrix = skmetrics.pairwise.pairwise_distances(pcomp, metric='euclidean')
    model = AgglomerativeClustering(n_clusters=4, linkage='average', metric='precomputed')
    class_pred = model.fit_predict(distance_matrix)
    return skmetrics.adjusted_mutual_info_score(class_labels, class_pred)

adjusted_mi_nnm  = get_adjusted_MI_score(pcomps_nnm, class_labels)
adjusted_mi_wnnm = get_adjusted_MI_score(pcomps_wnnm, class_labels)
adjusted_mi_rpca = get_adjusted_MI_score(pcomps_rpca, class_labels)

print(f"{adjusted_mi_nnm:.4f}\tNuclear Norm Minimization")
print(f"{adjusted_mi_wnnm:.4f}\tWeighted Nuclear Norm Minimization")
print(f"{adjusted_mi_rpca:.4f}\tRobust PCA")
-0.0006 Nuclear Norm Minimization
1.0000 Weighted Nuclear Norm Minimization
1.0000 Robust PCA