Singular value spectrum of low rank matrices obtained from PGC data

Author

Saikat Banerjee

Published

June 4, 2026

Abstract

This notebook analyzes the singular values from the low rank matrices of PGC data.

Background

From the subspace stability cross-validation, we observed that the NNM-Corr’s correlated-noise whitening converts NNM’s all-at-once spectral collapse into an ordered factor-entry path. Click here to view the diagnostics notebook. Here, we check whether the same behavior persists for the full PGC data.

We show the results for 3 groups of traits: 1. All 107 traits 2. Excluding “Biomarkers” traits 3. Excluding “Biomarkers” and “Neurite density” traits

Code

import re
import json
import pickle
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils
mpl_stylesheet.banskt_presentation(splinecolor='black', dpi=300, colors='kelly')

from matplotlib import colormaps as mpl_cmaps
import matplotlib.colors as mpl_colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

Code

trait_dirs = {
    "107 traits": Path("/gpfs/commons/home/sbanerjee/pgc/analysis/clorinn/low_rank_fit"),
    "78 traits":  Path("/gpfs/commons/home/sbanerjee/pgc/analysis/v1.1/low_rank_fit"),
    "51 traits":  Path("/gpfs/commons/home/sbanerjee/pgc/analysis/v1.2/low_rank_fit"),
}

Code

def load_svd_npz(svd_dir, prefix):
    svd_dir = Path(svd_dir)
    rows = []
    for path in svd_dir.glob(f"{prefix}_r*.npz"):
        with np.load(path, allow_pickle=True) as data:
            fit_params = data["fit_params"].item()
            nucnorm = fit_params["nucnorm"]
            s = data["s"]
            n_factors = len(s)
            row = {"nucnorm": nucnorm,}
            row.update({f"s{k}": s[k] for k in range(n_factors)})
            rows.append(row)
    df = (pd.DataFrame(rows)
              .sort_values(["nucnorm"])
              .reset_index(drop=True))
    return df

def get_singular_value_metrics(df, k_list=None, metric="value"):
    s_cols = sorted(
                [c for c in df.columns if re.match(r"^s\d+$", c)],
                key=lambda x: int(x[1:])
             )
    k_obs  = np.asarray([int(x[1:]) + 1 for x in s_cols]) # observed k values, idx + 1
    k_req  = k_obs if k_list is None else np.asarray(k_list)
    k_idx  = np.searchsorted(k_obs, k_req)
    
    if metric in ("gap_ratio", "relative_gap"):
        # requires k_next 
        valid = (k_idx >= 0) & (k_idx < len(k_obs) - 1)
        if not np.all(valid):
            raise ValueError(f"Some requested k are not available: {k_req[~valid]}")

    S     = df[s_cols].to_numpy() # nucnorms x k_obs
    s_1   = S[:, k_obs == 1] # first eigenvalue (max eigenvalue) for all r/x
    s_sum = np.sum(S, axis = 1, keepdims = True)
    
    y = S[:, k_idx]
    if metric == "norm_value":
        y /= s_1
    elif metric == "gap_ratio":
        y /= S[:, k_idx + 1]
    elif metric == "relative_gap":
        y = (y - S[:, k_idx + 1]) / np.where(y > 0, y, np.nan)
        
    return y


def get_effective_rank_at_nucnorm(df, nucnorm, *, thres=0.9, method="energy"):
    S_arr = get_singular_value_metrics(df, metric="value")
    norms = df["nucnorm"].to_numpy(dtype=float).ravel()
    r_idx = np.searchsorted(norms, nucnorm)
    s = S_arr[r_idx, :].ravel()

    if method == "energy":
        cum = np.cumsum(s) / np.sum(s)
        idx = np.searchsorted(cum, thres)
        return int(idx)
    elif method == "participation_ratio":
        s2 = np.sum(s**2)
        idx = float((np.sum(s)**2) / s2) if s2 > 0 else 0.0
        return idx
    elif method == "count":
        t = thres * np.max(s)
        return np.sum(s > t)
    elif method == "spectral_gap":
        gap = s[:-1] - s[1:]
        rel_gap = gap / np.where(s[:-1] > 0, s[:-1], np.nan)
        return int(np.argmax(rel_gap)) + 1
    elif method == "spectral_gap_loose":
        gap = s[:-1] - s[1:]
        rel_gap = gap / np.where(s[:-1] > 0, s[:-1], np.nan)
        gap_thres = thres * np.max(rel_gap)
        idx = np.max(np.where(rel_gap > gap_thres))
        return int(idx) + 1
    else:
        raise ValueError("Method undefined.")
        
def get_effective_ranks(df, nucnorms, *, thres=0.9, method="energy"):
    ranks= (
        [get_effective_rank_at_nucnorm(df, r, method=method, thres=thres) 
         for r in nucnorms]
    )
    return np.asarray(ranks)

Choose the ranks to show in each plot

Code

def r_at_target_rank(df, k_target, *, method="energy", thres=0.95):
    nucnorms = df["nucnorm"].to_numpy(float)
    ranks = get_effective_ranks(df, nucnorms, method=method, thres=thres).astype(float)
    order = np.argsort(nucnorms)
    r, g = nucnorms[order], ranks[order]
    hit = np.where(g >= k_target)[0]
    if len(hit) == 0:
        return np.nan                      # k_target not reached on this grid
    j = hit[0]
    if j == 0:
        return r[0]
    # snap to whichever bracketing grid point is closer in rank
    return r[j] if abs(g[j] - k_target) <= abs(g[j-1] - k_target) else r[j-1]


def choose_rank_for_one_subset(svd_out_dir, k_star):
    for k_target in [k_star]:
        r_corr = r_at_target_rank(load_svd_npz(svd_out_dir, "pgd_fw_nnm_corr"), k_target)
        r_nnm  = r_at_target_rank(load_svd_npz(svd_out_dir, "pgd_fw_nnm"),      k_target)

    return {
        "k": k_star,
        "NNM": r_nnm,
        "NNM-Corr": r_corr,
    }


# k_choose = {
#     "107 traits": 30,
#     "78 traits":  25,
#     "51 traits":  10,
# }

k_choose = {
    "107 traits": 20,
    "78 traits":  15,
    "51 traits":  6,
}
   
rows = {
    data_name: choose_rank_for_one_subset(
        Path(data_root) / "svd", 
        k_choose[data_name]) for data_name, data_root in trait_dirs.items()
}

rank_df = pd.DataFrame(rows).T
    
from IPython.display import display, HTML
# display(rank_df.style.format({"Effective Rank": "{:g}"}))
display(HTML(rank_df.to_html()))

	k	NNM	NNM-Corr
107 traits	20.0	8192.0	1290.0
78 traits	15.0	3251.0	406.0
51 traits	6.0	1290.0	406.0

Code

def get_r_scaled(x, scale="log2"):
    x = np.asarray(x, dtype=float)
    powers = np.arange(np.ceil(np.log2(x.min())), np.floor(np.log2(x.max())) + 1)
    xlabels = 2 ** powers
    
    if scale == "log10":
        xscale = np.log10(x)
        xticks = np.log10(xlabels)
    elif scale == "log2":
        xscale = np.log2(x)
        xticks = np.log2(xlabels)
    else:
        xscale = x
        xticks = xlabels

    xlabel_str = [f"{r:g}" for r in xlabels]
    return xscale, xticks, xlabel_str

def style_blank_axis(ax):
    ax.tick_params(bottom = False, top = False, left = False, right = False,
        labelbottom = False, labeltop = False, labelleft = False, labelright = False)
    

def style_colorbar_yticks(cbar, cmap, norm):
    for tick, value in zip(cbar.ax.yaxis.get_major_ticks(), cbar.ax.get_yticks()):
        color = cmap(norm(value))
        tick.tick1line.set_markeredgecolor(color)
        tick.tick2line.set_markeredgecolor(color)
        tick.tick1line.set_color("none")
        tick.tick2line.set_color("none")
    # for tick_label, value in zip(cbar.ax.get_yticklabels(), cbar.ax.get_yticks()):
        # tick_label.set_color(cmap(norm(value)))


def make_line_plots_with_colorbar_inset(ax, x, y, ycols,
        ylabel="", cbar_label="", cbar_xpos="", show_cbar=True,
        vcenter=None, cmap=None, norm=None):
    """
    Create K = len(ycols) line plots, color maps to values in ycols
    x : ndarray(N,)
    y : ndarray(N, K), columns of y correspond to values in k_list
    
    Returns cmap, norm for future use.
    """
    
    from matplotlib.cm import ScalarMappable

    if cmap is None:
        cmap = mpl_cmaps.get_cmap("viridis").copy()
    if norm is None:
        vmin = min(ycols)
        vmax = max(ycols)
        if vcenter is None: vcenter = (vmin + vmax) / 2.
        norm = mpl_colors.TwoSlopeNorm(vmin=vmin, vcenter=vcenter, vmax=vmax)
    
    for i, k in enumerate(ycols):
        ax.plot(x, y[:, i], color=cmap(norm(i)))
        
    ax.set_ylabel(ylabel)

    if show_cbar:
        cax = ax.inset_axes([cbar_xpos, 0.15, 0.02, 0.7]) 
        sm = ScalarMappable(norm=norm, cmap=cmap)
        cbar = plt.colorbar(sm, cax=cax, fraction = 0.1)

        # Colorbar style
        style_blank_axis(cbar.ax)
        cbar.ax.tick_params(right=True, labelright=True, pad=5, width=2, length=5)
        cbar.set_label(cbar_label, labelpad = 10)
        cbar.ax.yaxis.set_label_position('left')
        style_colorbar_yticks(cbar, cmap, norm)
        for side, border in list(cbar.ax.spines.items()):
            border.set_visible(False)

    return cmap, norm
    
        
def make_singular_value_plots(ax1, ax2, ax3, prefix, 
        svd_out_dir, r_scale='log10', show_yaxis=True, k_max=65,
        k_highlight=20, r_choose=None):
    # load data 
    df          = load_svd_npz(svd_out_dir, prefix)   
    nucnorms    = df["nucnorm"].to_numpy(dtype=float)
    
    # common x-axis for all plots
    x, xticks, xlabels  = get_r_scaled(nucnorms, scale=r_scale)
    ylabel=""
    
    # -- ax1: fan of normalized singular values --
    k_list = np.arange(2, k_max)
    k_center = 15
    y = get_singular_value_metrics(df, k_list, metric="norm_value")
    show_cbar = True
    if show_yaxis: 
        # show_cbar = True
        ylabel=r"Normalized eigenvalue $s_k / s_1$"
    cmap, norm = make_line_plots_with_colorbar_inset(
        ax1, x, y, k_list,
        show_cbar=show_cbar,
        ylabel=ylabel, cbar_label=r"Rank $k$", cbar_xpos=0.1,
        vcenter=k_center)
    ax1.plot(x, y[:, k_highlight-1], color='orangered', lw=3, label=f"k={k_highlight:d}")
    ax1.legend(frameon = False, handlelength = 2, loc='upper center')
    
    # -- ax2: effective ranks --
    rank_methods = {
        "participation_ratio": 
            {"thres": None, "label": "Participation ratio"}, 
        "energy": 
            {"thres": 0.95, "label": f"Cumulative singular values > 0.95"}, 
        "spectral_gap": 
            {"thres": None, "label": "k at which maximum spectral gap (gmax) sits"},
        "count": 
            {"thres": 1e-4, "label": "Number of eigenvalues > 1e-4"},
        "spectral_gap_loose":
            {"thres": 0.7,  "label": "Max k at which spectral gap > 70% gmax"},
    }
    ranks = {}
    rank_lines = {}
    for m, mdict in rank_methods.items():
        ranks[m] = get_effective_ranks(df, nucnorms, method=m, thres=mdict["thres"])
        if m != "spectral_gap_loose":
            rank_lines[m], = ax2.plot(x, ranks[m],  'o-', label=mdict["label"])

    if show_yaxis: 
        ylabel="Effective rank"
    ax2.legend(frameon = False, handlelength = 2)
    ax2.set_ylabel(ylabel)
    
    # -- ax3: relative spectral gap --
    # k_list = [8, 10, 12, 15, 20, 25, 30, 40]
    y = get_singular_value_metrics(df, k_list, metric="relative_gap")       
    for i, k in enumerate(k_list):
        # ax3.plot(x, y[:, i], 'o-', label=f"{k}")
        ax3.plot(x, y[:, i], color=cmap(norm(i)))
    ax3.plot(x, y[:, k_highlight-1], color='orangered', lw=3, label=f"k={k_highlight:d}")
    if show_yaxis:
        ylabel=r"Relative spectral gap"
    ax3.legend(frameon = False, handlelength = 2)
    ax3.set_ylabel(ylabel)
    
    # -- mark the chosen rank --
    if r_choose is not None:
        r_choose_idx = np.argwhere(nucnorms == r_choose)
        for ax in [ax1, ax2, ax3]:
            ax.axvline(x=x[r_choose_idx], color='grey', linestyle='dotted')

    
    # -- mark the x-axis --
    ax3.set_xticks(xticks)
    ax3.set_xticklabels(xlabels, rotation=90)
    ax3.set_xlabel("Nuclear norm constraint r")
    return

Code

def make_svd_spectrum_figure(svd_out_dir,
        nnm_r_choose=8192,
        nnm_corr_r_choose=2580,
        k_highlight=30, k_max=65, save_figure=True,
        file_out_suffix=""):

    fig = plt.figure(figsize=(20, 16))
    gs = fig.add_gridspec(nrows=3, ncols=2, wspace=0, hspace=0)

    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[1, 0])
    ax3 = fig.add_subplot(gs[2, 0])
    ax4 = fig.add_subplot(gs[0, 1], sharey=ax1)
    ax5 = fig.add_subplot(gs[1, 1], sharey=ax2)
    ax6 = fig.add_subplot(gs[2, 1], sharey=ax3)

    for ax in (ax1, ax2, ax3, ax4, ax5, ax6):
        style_blank_axis(ax)
        ax.patch.set_alpha(0.0)
    for ax in (ax1, ax2, ax3):
        ax.tick_params(left=True, labelleft=True)
    for ax in (ax3, ax6):
        ax.tick_params(bottom=True, labelbottom=True)

    prefix1 = "pgd_fw_nnm_corr"
    make_singular_value_plots(
        ax1, ax2, ax3, 
        prefix1, svd_out_dir, 
        r_scale='log2', 
        k_highlight=k_highlight,
        r_choose=nnm_corr_r_choose,
        k_max=k_max,
    )
    ax1.set_title("NNM-Corr / PGD+FW", pad=20)

    prefix2 = "pgd_fw_nnm"
    make_singular_value_plots(
        ax4, ax5, ax6, 
        prefix2, svd_out_dir, 
        r_scale='log2', show_yaxis=False, 
        k_highlight=k_highlight,
        r_choose=nnm_r_choose,
        k_max=k_max,
    )
    ax4.set_title("NNM / PGD+FW", pad=20)

    outfile_name=f"figures/pgc_singular_value_spectrum_{file_out_suffix}"
    # plt.savefig(f"{outfile_name}.pdf", bbox_inches='tight')
    # plt.savefig(f"{outfile_name}.png", bbox_inches='tight')
    plt.show()

107 traits

Code

data_name="107 traits"
data_root=trait_dirs[data_name]

make_svd_spectrum_figure(Path(data_root) / "svd",
        k_highlight=int(rank_df.loc[data_name]["k"]),
        nnm_r_choose=rank_df.loc[data_name]["NNM"],
        nnm_corr_r_choose=rank_df.loc[data_name]["NNM-Corr"],
        k_max=65, save_figure=True,
        file_out_suffix="v1_0")

78 traits (excluding “Biomarkers”)

Code

data_name="78 traits"
data_root=trait_dirs[data_name]

make_svd_spectrum_figure(Path(data_root) / "svd",
        k_highlight=int(rank_df.loc[data_name]["k"]),
        nnm_r_choose=rank_df.loc[data_name]["NNM"],
        nnm_corr_r_choose=rank_df.loc[data_name]["NNM-Corr"],
        k_max=65, save_figure=True,
        file_out_suffix="v1_1")

51 traits (excluding “Biomarker” and “Neurite density” traits)

Code

data_name="51 traits"
data_root=trait_dirs[data_name]

make_svd_spectrum_figure(Path(data_root) / "svd",
        k_highlight=int(rank_df.loc[data_name]["k"]),
        nnm_r_choose=rank_df.loc[data_name]["NNM"],
        nnm_corr_r_choose=rank_df.loc[data_name]["NNM-Corr"],
        k_max=50, save_figure=True,
        file_out_suffix="v1_2")

Code

prefix   = "pgd_fw_nnm_corr"

rank_methods = {
    "participation_ratio": 
        {"thres": None, "label": "Participation ratio"}, 
    "energy": 
        {"thres": 0.95, "label": f"Cumulative singular values > 0.95"}, 
    "spectral_gap": 
        {"thres": None, "label": "k at which maximum spectral gap (gmax) sits"},
    "count": 
        {"thres": 1e-4, "label": "Number of eigenvalues > 1e-4"},
    "spectral_gap_loose":
        {"thres": 0.7,  "label": "Max k at which spectral gap > 70% gmax"},
}

from IPython.display import display, HTML

for data_name, data_root in trait_dirs.items():
    svd_out_dir = Path(data_root) / "svd"
    df = load_svd_npz(svd_out_dir, prefix)
    r_choose = rank_df.loc[data_name]["NNM-Corr"]

    rows = dict()
    for m, mdict in rank_methods.items():
        rank = get_effective_rank_at_nucnorm(df, r_choose, thres=mdict["thres"], method=m)
        label = f"{mdict['label']}"
        rows[label] = {"Effective Rank": rank}
    print(f"{data_name} | Threshold r = {r_choose:g}")
    display(pd.DataFrame(rows).T.style.format({"Effective Rank": "{:g}"}))
    # display(HTML(pd.DataFrame(rows).T.to_html()))

107 traits | Threshold r = 1290

	Effective Rank
Participation ratio	14.9819
Cumulative singular values > 0.95	21
k at which maximum spectral gap (gmax) sits	28
Number of eigenvalues > 1e-4	29
Max k at which spectral gap > 70% gmax	31

78 traits | Threshold r = 406

	Effective Rank
Participation ratio	16.7555
Cumulative singular values > 0.95	16
k at which maximum spectral gap (gmax) sits	20
Number of eigenvalues > 1e-4	20
Max k at which spectral gap > 70% gmax	23

51 traits | Threshold r = 406

	Effective Rank
Participation ratio	4.37612
Cumulative singular values > 0.95	6
k at which maximum spectral gap (gmax) sits	9
Number of eigenvalues > 1e-4	11
Max k at which spectral gap > 70% gmax	11