Choose parameters for UMAP plot

Author

Saikat Banerjee

Published

April 3, 2024

Abstract

We look at the UMAP plots with different parameters

Code

import os
import numpy as np
import pandas as pd
import pickle
import re

import umap
from bokeh.plotting import figure as bokeh_figure
from bokeh.plotting import show as bokeh_show
from bokeh.layouts import column as bokeh_column
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import CategoricalColorMapper

output_notebook()

Loading BokehJS ...

Code

data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
result_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/nnsparsh/full/"

zscore_df = pd.read_pickle(os.path.join(data_dir, f"modselect/zscore_all.pkl"))
trait_df  = pd.read_pickle(os.path.join(data_dir, f"modselect/traits_all_with_desc.pkl"))

variant_filename = f"{data_dir}/allvar.pruned.closesttss.hugo"
variant_df       = pd.read_csv(variant_filename, sep = '\t')

method = 'nnm_sparse'

res_filename = os.path.join(result_dir, f"{method}_model.pkl")
with (open(res_filename, "rb")) as fh:
    lowrank_model = pickle.load(fh)

X = np.array(zscore_df.drop(labels = ['rsid'], axis = 1).values.T)
X_cent = X - np.mean(X, axis = 0, keepdims = True)
lowX = lowrank_model['X_']
lowX_cent = lowX - np.mean(lowX, axis = 0, keepdims = True)
lowX_std = lowX_cent / np.sqrt(np.prod(lowX_cent.shape))

print ("Nuclear Norms")
print (f"Low rank model: {np.linalg.norm(lowX, ord = 'nuc'):.3f}")
print (f"Input data: {np.linalg.norm(X, ord = 'nuc'):.3f}")
print (f"Input data (mean centered): {np.linalg.norm(X_cent, ord = 'nuc'):.3f}")

Nuclear Norms
Low rank model: 8050.750
Input data: 496751.155
Input data (mean centered): 495872.387

Code

def compute_loadings_factors(X, k = None):
    #X_cent = mpy_simulate.do_standardize(X, scale = False)
    #X_cent /= np.sqrt(np.prod(X_cent.shape))
    U, S, Vt = np.linalg.svd(X, full_matrices = False)
    S2 = np.square(S)
    explained_variance = S2 / np.sum(S2)
    if k is None:
        k = np.where(explained_variance < 1e-12)[0][0] - 1
    U_low = U[:, :k]
    S_low = S[:k]
    factors = Vt[:k, :].T
    loadings = U_low @ np.diag(S_low)
    return U_low, S_low, loadings, factors

U0, S0, loadings0, factors0 = compute_loadings_factors(lowX, k = 100)
U1, S1, loadings1, factors1 = compute_loadings_factors(lowX_std, k = 100)
U2, S2, loadings2, factors2 = compute_loadings_factors(X_cent)

nnm_loadings  = loadings0.copy()
tsvd_loadings = U2[:, :loadings0.shape[1]] @ np.diag(S2[:loadings0.shape[1]])

Code

hex_colors = [
    '#2D69C4', # blue
    '#CC2529', # red
    '#93AA00', # Vivid Yellowish Green
    '#535154', # gray
    '#6B4C9A', # purple
    '#FFB300', # Vivid Yellow
    '#922428', # dark brown
    '#948B3D', # olive
]

hex_colors_40 = [
    "#3c3c3c0d",
    "#084609",
    "#ff4ff4",
    "#01d94a",
    "#b700ce",
    "#91c900",
    "#5f42ed",
    "#5fa200",
    "#8d6dff",
    "#c9f06b",
    "#0132a7",
    "#ffbb1f",
    "#0080ed",
    "#f56600",
    "#3afaf5",
    "#c10001",
    "#01e698",
    "#a20096",
    "#00e2c1",
    "#ff5ac8",
    "#008143",
    "#cd0057",
    "#4aeeff",
    "#8c001a",
    "#b5f2a2",
    "#5d177d",
    "#a99900",
    "#e299ff",
    "#5b6b00",
    "#96aeff",
    "#a46f00",
    "#007acb",
    "#ff9757",
    "#00a8e0",
    "#ff708e",
    "#baefc7",
    "#622b25",
    "#c8c797",
    "#885162",
    "#ffb7a5",
    "#ffa3c3"]

trait_type_unique = trait_df['trait_type'].unique().tolist()
trait_type_dict = {
    trait: color for trait, color in zip(
        trait_type_unique, 
        hex_colors[:len(trait_type_unique)])
    }

llm_methods = [
    "ls-da3m0ns/bge_large_medical",
    "medicalai/ClinicalBERT",
    "emilyalsentzer/Bio_ClinicalBERT",
]

llm_ctypes = ["community", "kmeans"]

llm_clusters = {method : { x : None for x in llm_ctypes } for method in llm_methods}
llm_outdir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/llm"

for method in llm_methods:
    for ctype in llm_ctypes:
        m_filename = os.path.join(llm_outdir, f"{method}/{ctype}_clusters.pkl")
        with open(m_filename, "rb") as fh:
            llm_clusters[method][ctype] = pickle.load(fh)
            
def get_llm_cluster_index(selectidx, method, ctype):
    clusteridx = np.full([selectidx.shape[0],], -1)
    for i, ccomps in enumerate(llm_clusters[method][ctype]):
        for idx in ccomps:
            clusteridx[idx] = i
    return clusteridx

Code

umap_metric_names = [
    "euclidean",
    "manhattan",
    "chebyshev",
    "minkowski",
    "canberra",
    "braycurtis",
    "mahalanobis",
    "wminkowski",
    "cosine",
    "correlation",
]

umap_embeddings = dict()
for metric in umap_metric_names:
    if metric not in umap_embeddings.keys():
        reducer = umap.UMAP(n_neighbors = 15, metric = metric, min_dist = 0.4)
        umap_embeddings[metric] = reducer.fit_transform(X)

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out

Code

umap_embeddings_filepath = os.path.join("/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/umap",
                                        "X_embeddings_nbr_15_dist_04_metric_various.pkl")

def save_dict_to_pickle(filepath, data):
    dirname = os.path.dirname(filepath)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filepath, "wb") as fh:
        pickle.dump(clusters, fh, protocol=pickle.HIGHEST_PROTOCOL)
        
save_dict_to_pickle(umap_embeddings_filepath, umap_embeddings)

Code

def get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, umap_string, color_palette = hex_colors_40):
    
    plot_dict = dict(
        x = embedding[selectidx, 0],
        y = embedding[selectidx, 1],
        trait_type_code = [f"{x}" for x in clusteridx],
        h2_alpha = [min(0.6, 10 * x) for x in trait_df['estimates.final.h2_observed'].fillna(1e-6).tolist()],
        fulldesc = [f"{i} | {trait_df.loc[i, 'short_description']} | {trait_df.loc[i, 'estimates.final.h2_observed']:.3f} | {trait_df.loc[i, 'Neff']:.2f}" for i in selectidx],
    )

    color_mapping = CategoricalColorMapper(factors = [f"{x}" for x in np.unique(clusteridx)], palette = color_palette)

    plot_tooltips = [
        ("Desc", "@fulldesc"),
    ]

    ax = bokeh_figure(
        width = 800, height = 800, 
        tooltips = plot_tooltips,
        title = umap_string ,
    )

    ax.circle('x', 'y', size = 10, 
        source = ColumnDataSource(plot_dict), 
        color = dict(field='trait_type_code', transform = color_mapping),
        line_alpha = dict(field='h2_alpha'),
        fill_alpha = dict(field='h2_alpha'),
    )
    ax.title.text_font_size = '20pt'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax

Code

selectidx  = np.array(trait_df.index)
clusteridx = get_llm_cluster_index(selectidx, "ls-da3m0ns/bge_large_medical", "kmeans")

axlist = [get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, metric) for metric, embedding in umap_embeddings.items()]


# put all the plots in a VBox
p = bokeh_column(*axlist)

# show the results
bokeh_show(p)

Code

# plot_dict = dict(
#     x = tsvd_embedding[selectidx, 0],
#     y = tsvd_embedding[selectidx, 1],
#     trait_type_code = trait_df['trait_type'].tolist(),
#     desc = trait_df['description'].tolist(),
#     h2   = [f"{x:.3f}" for x in trait_df['estimates.final.h2_observed'].tolist()],
#     Neff = [f"{x:.2f}" for x in trait_df['Neff'].tolist()],
#     tidx = list(selectidx),
#     fulldesc = [f"{trait_df.loc[i, 'short_description']} | {trait_df.loc[i, 'estimates.final.h2_observed']:.3f} | {trait_df.loc[i, 'Neff']:.2f}" for i in selectidx]
# )

# plot_tooltips = [
#     ("index", "$index"),
#     ("Desc", "@desc"),
#     ("h2", "@h2"),
#     ("N_eff", "@Neff"),
# ]
# color_mapping = CategoricalColorMapper(factors = trait_type_unique, palette = hex_colors)



selectidx = np.array(trait_df.index)

axlist = list()
for llm_method in llm_methods:
    for llm_ctype in llm_ctypes:
        clusteridx = get_llm_cluster_index(selectidx, llm_method, llm_ctype)
        alpha_factor = 10 if llm_ctype == "kmeans" else 100
        plot_title = f"{llm_method} + {llm_ctype} clustering"
        axlist.append(get_bokeh_plot(X_embedding_cosine_30_01, selectidx, clusteridx, trait_df, plot_title, alpha_factor = alpha_factor))

# put all the plots in a VBox
p = bokeh_column(*axlist)

# show the results
bokeh_show(p)

Figure 1