Choose parameters for UMAP plot

Author

Saikat Banerjee

Published

April 3, 2024

Abstract
We look at the UMAP plots with different parameters
Code
import os
import numpy as np
import pandas as pd
import pickle
import re

import umap
from bokeh.plotting import figure as bokeh_figure
from bokeh.plotting import show as bokeh_show
from bokeh.layouts import column as bokeh_column
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import CategoricalColorMapper

output_notebook()
Loading BokehJS ...
Code
data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
result_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/nnsparsh/full/"

zscore_df = pd.read_pickle(os.path.join(data_dir, f"modselect/zscore_all.pkl"))
trait_df  = pd.read_pickle(os.path.join(data_dir, f"modselect/traits_all_with_desc.pkl"))

variant_filename = f"{data_dir}/allvar.pruned.closesttss.hugo"
variant_df       = pd.read_csv(variant_filename, sep = '\t')

method = 'nnm_sparse'

res_filename = os.path.join(result_dir, f"{method}_model.pkl")
with (open(res_filename, "rb")) as fh:
    lowrank_model = pickle.load(fh)

X = np.array(zscore_df.drop(labels = ['rsid'], axis = 1).values.T)
X_cent = X - np.mean(X, axis = 0, keepdims = True)
lowX = lowrank_model['X_']
lowX_cent = lowX - np.mean(lowX, axis = 0, keepdims = True)
lowX_std = lowX_cent / np.sqrt(np.prod(lowX_cent.shape))

print ("Nuclear Norms")
print (f"Low rank model: {np.linalg.norm(lowX, ord = 'nuc'):.3f}")
print (f"Input data: {np.linalg.norm(X, ord = 'nuc'):.3f}")
print (f"Input data (mean centered): {np.linalg.norm(X_cent, ord = 'nuc'):.3f}")
Nuclear Norms
Low rank model: 8050.750
Input data: 496751.155
Input data (mean centered): 495872.387
Code
def compute_loadings_factors(X, k = None):
    #X_cent = mpy_simulate.do_standardize(X, scale = False)
    #X_cent /= np.sqrt(np.prod(X_cent.shape))
    U, S, Vt = np.linalg.svd(X, full_matrices = False)
    S2 = np.square(S)
    explained_variance = S2 / np.sum(S2)
    if k is None:
        k = np.where(explained_variance < 1e-12)[0][0] - 1
    U_low = U[:, :k]
    S_low = S[:k]
    factors = Vt[:k, :].T
    loadings = U_low @ np.diag(S_low)
    return U_low, S_low, loadings, factors

U0, S0, loadings0, factors0 = compute_loadings_factors(lowX, k = 100)
U1, S1, loadings1, factors1 = compute_loadings_factors(lowX_std, k = 100)
U2, S2, loadings2, factors2 = compute_loadings_factors(X_cent)

nnm_loadings  = loadings0.copy()
tsvd_loadings = U2[:, :loadings0.shape[1]] @ np.diag(S2[:loadings0.shape[1]])
Code
hex_colors = [
    '#2D69C4', # blue
    '#CC2529', # red
    '#93AA00', # Vivid Yellowish Green
    '#535154', # gray
    '#6B4C9A', # purple
    '#FFB300', # Vivid Yellow
    '#922428', # dark brown
    '#948B3D', # olive
]

hex_colors_40 = [
    "#3c3c3c0d",
    "#084609",
    "#ff4ff4",
    "#01d94a",
    "#b700ce",
    "#91c900",
    "#5f42ed",
    "#5fa200",
    "#8d6dff",
    "#c9f06b",
    "#0132a7",
    "#ffbb1f",
    "#0080ed",
    "#f56600",
    "#3afaf5",
    "#c10001",
    "#01e698",
    "#a20096",
    "#00e2c1",
    "#ff5ac8",
    "#008143",
    "#cd0057",
    "#4aeeff",
    "#8c001a",
    "#b5f2a2",
    "#5d177d",
    "#a99900",
    "#e299ff",
    "#5b6b00",
    "#96aeff",
    "#a46f00",
    "#007acb",
    "#ff9757",
    "#00a8e0",
    "#ff708e",
    "#baefc7",
    "#622b25",
    "#c8c797",
    "#885162",
    "#ffb7a5",
    "#ffa3c3"]

trait_type_unique = trait_df['trait_type'].unique().tolist()
trait_type_dict = {
    trait: color for trait, color in zip(
        trait_type_unique, 
        hex_colors[:len(trait_type_unique)])
    }

llm_methods = [
    "ls-da3m0ns/bge_large_medical",
    "medicalai/ClinicalBERT",
    "emilyalsentzer/Bio_ClinicalBERT",
]

llm_ctypes = ["community", "kmeans"]

llm_clusters = {method : { x : None for x in llm_ctypes } for method in llm_methods}
llm_outdir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/llm"

for method in llm_methods:
    for ctype in llm_ctypes:
        m_filename = os.path.join(llm_outdir, f"{method}/{ctype}_clusters.pkl")
        with open(m_filename, "rb") as fh:
            llm_clusters[method][ctype] = pickle.load(fh)
            
def get_llm_cluster_index(selectidx, method, ctype):
    clusteridx = np.full([selectidx.shape[0],], -1)
    for i, ccomps in enumerate(llm_clusters[method][ctype]):
        for idx in ccomps:
            clusteridx[idx] = i
    return clusteridx
Code
umap_metric_names = [
    "euclidean",
    "manhattan",
    "chebyshev",
    "minkowski",
    "canberra",
    "braycurtis",
    "mahalanobis",
    "wminkowski",
    "cosine",
    "correlation",
]

umap_embeddings = dict()
for metric in umap_metric_names:
    if metric not in umap_embeddings.keys():
        reducer = umap.UMAP(n_neighbors = 15, metric = metric, min_dist = 0.4)
        umap_embeddings[metric] = reducer.fit_transform(X)
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
Code
umap_embeddings_filepath = os.path.join("/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/umap",
                                        "X_embeddings_nbr_15_dist_04_metric_various.pkl")

def save_dict_to_pickle(filepath, data):
    dirname = os.path.dirname(filepath)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filepath, "wb") as fh:
        pickle.dump(clusters, fh, protocol=pickle.HIGHEST_PROTOCOL)
        
save_dict_to_pickle(umap_embeddings_filepath, umap_embeddings)
Code
def get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, umap_string, color_palette = hex_colors_40):
    
    plot_dict = dict(
        x = embedding[selectidx, 0],
        y = embedding[selectidx, 1],
        trait_type_code = [f"{x}" for x in clusteridx],
        h2_alpha = [min(0.6, 10 * x) for x in trait_df['estimates.final.h2_observed'].fillna(1e-6).tolist()],
        fulldesc = [f"{i} | {trait_df.loc[i, 'short_description']} | {trait_df.loc[i, 'estimates.final.h2_observed']:.3f} | {trait_df.loc[i, 'Neff']:.2f}" for i in selectidx],
    )

    color_mapping = CategoricalColorMapper(factors = [f"{x}" for x in np.unique(clusteridx)], palette = color_palette)

    plot_tooltips = [
        ("Desc", "@fulldesc"),
    ]

    ax = bokeh_figure(
        width = 800, height = 800, 
        tooltips = plot_tooltips,
        title = umap_string ,
    )

    ax.circle('x', 'y', size = 10, 
        source = ColumnDataSource(plot_dict), 
        color = dict(field='trait_type_code', transform = color_mapping),
        line_alpha = dict(field='h2_alpha'),
        fill_alpha = dict(field='h2_alpha'),
    )
    ax.title.text_font_size = '20pt'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax
Code
selectidx  = np.array(trait_df.index)
clusteridx = get_llm_cluster_index(selectidx, "ls-da3m0ns/bge_large_medical", "kmeans")

axlist = [get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, metric) for metric, embedding in umap_embeddings.items()]


# put all the plots in a VBox
p = bokeh_column(*axlist)

# show the results
bokeh_show(p)
Code
# plot_dict = dict(
#     x = tsvd_embedding[selectidx, 0],
#     y = tsvd_embedding[selectidx, 1],
#     trait_type_code = trait_df['trait_type'].tolist(),
#     desc = trait_df['description'].tolist(),
#     h2   = [f"{x:.3f}" for x in trait_df['estimates.final.h2_observed'].tolist()],
#     Neff = [f"{x:.2f}" for x in trait_df['Neff'].tolist()],
#     tidx = list(selectidx),
#     fulldesc = [f"{trait_df.loc[i, 'short_description']} | {trait_df.loc[i, 'estimates.final.h2_observed']:.3f} | {trait_df.loc[i, 'Neff']:.2f}" for i in selectidx]
# )

# plot_tooltips = [
#     ("index", "$index"),
#     ("Desc", "@desc"),
#     ("h2", "@h2"),
#     ("N_eff", "@Neff"),
# ]
# color_mapping = CategoricalColorMapper(factors = trait_type_unique, palette = hex_colors)



selectidx = np.array(trait_df.index)

axlist = list()
for llm_method in llm_methods:
    for llm_ctype in llm_ctypes:
        clusteridx = get_llm_cluster_index(selectidx, llm_method, llm_ctype)
        alpha_factor = 10 if llm_ctype == "kmeans" else 100
        plot_title = f"{llm_method} + {llm_ctype} clustering"
        axlist.append(get_bokeh_plot(X_embedding_cosine_30_01, selectidx, clusteridx, trait_df, plot_title, alpha_factor = alpha_factor))

# put all the plots in a VBox
p = bokeh_column(*axlist)

# show the results
bokeh_show(p)
Figure 1