Interactive UMAP plot from PanUKB data

Author

Saikat Banerjee

Published

March 27, 2024

Abstract

To understand the disease network, we plot an interactive UMAP and compare with neighbors obtained from LLM.

Setup

We have to first load a bunch of useful tools including UMAP and Bokeh.

Code

import os
import numpy as np
import pandas as pd
import pickle
import re

import umap
from bokeh.plotting import figure as bokeh_figure
from bokeh.plotting import show as bokeh_show
from bokeh.layouts import column as bokeh_column
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import CategoricalColorMapper

output_notebook()

Loading BokehJS ...

Load data and results

Here, we explore the low rank model from nuclear norm minimization with the sparse matrix.

Code

data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
result_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/nnsparsh/full/"

zscore_df = pd.read_pickle(os.path.join(data_dir, f"modselect/zscore_all.pkl"))
trait_df  = pd.read_pickle(os.path.join(data_dir, f"modselect/traits_all_with_desc.pkl"))

variant_filename = f"{data_dir}/allvar.pruned.closesttss.hugo"
variant_df       = pd.read_csv(variant_filename, sep = '\t')

method = 'nnm_sparse'

res_filename = os.path.join(result_dir, f"{method}_model.pkl")
with (open(res_filename, "rb")) as fh:
    lowrank_model = pickle.load(fh)

X = np.array(zscore_df.drop(labels = ['rsid'], axis = 1).values.T)
X_cent = X - np.mean(X, axis = 0, keepdims = True)
lowX = lowrank_model['X_']
lowX_cent = lowX - np.mean(lowX, axis = 0, keepdims = True)
lowX_std = lowX_cent / np.sqrt(np.prod(lowX_cent.shape))

print ("Nuclear Norms")
print (f"Low rank model: {np.linalg.norm(lowX, ord = 'nuc'):.3f}")
print (f"Input data: {np.linalg.norm(X, ord = 'nuc'):.3f}")
print (f"Input data (mean centered): {np.linalg.norm(X_cent, ord = 'nuc'):.3f}")

Nuclear Norms
Low rank model: 8050.750
Input data: 496751.155
Input data (mean centered): 495872.387

Apply UMAP

We apply UMAP on the original data.

Code

X_reducer_cosine_30_01   = umap.UMAP(n_neighbors = 30, metric = 'cosine', min_dist = 0.1)
X_embedding_cosine_30_01 = X_reducer_cosine_30_01.fit_transform(X)

X_reducer_euclidean_15_01   = umap.UMAP(n_neighbors = 15, metric = 'euclidean', min_dist = 0.1)
X_embedding_euclidean_15_01 = X_reducer_euclidean_15_01.fit_transform(X)

X_reducer_cosine_15_01   = umap.UMAP(n_neighbors = 15, metric = 'cosine', min_dist = 0.1)
X_embedding_cosine_15_01 = X_reducer_cosine_15_01.fit_transform(X)

Interactive Plots

Each point is a PanUKB phenotype, colored according to the clusters identified by LLM models from the trait descriptions.

Code

hex_colors = [
    '#2D69C4', # blue
    '#CC2529', # red
    '#93AA00', # Vivid Yellowish Green
    '#535154', # gray
    '#6B4C9A', # purple
    '#FFB300', # Vivid Yellow
    '#922428', # dark brown
    '#948B3D', # olive
]

hex_colors_40 = [
    "#3c3c3c0d",
    "#084609",
    "#ff4ff4",
    "#01d94a",
    "#b700ce",
    "#91c900",
    "#5f42ed",
    "#5fa200",
    "#8d6dff",
    "#c9f06b",
    "#0132a7",
    "#ffbb1f",
    "#0080ed",
    "#f56600",
    "#3afaf5",
    "#c10001",
    "#01e698",
    "#a20096",
    "#00e2c1",
    "#ff5ac8",
    "#008143",
    "#cd0057",
    "#4aeeff",
    "#8c001a",
    "#b5f2a2",
    "#5d177d",
    "#a99900",
    "#e299ff",
    "#5b6b00",
    "#96aeff",
    "#a46f00",
    "#007acb",
    "#ff9757",
    "#00a8e0",
    "#ff708e",
    "#baefc7",
    "#622b25",
    "#c8c797",
    "#885162",
    "#ffb7a5",
    "#ffa3c3"]

trait_type_unique = trait_df['trait_type'].unique().tolist()
trait_type_dict = {
    trait: color for trait, color in zip(
        trait_type_unique, 
        hex_colors[:len(trait_type_unique)])
    }

llm_methods = [
    "ls-da3m0ns/bge_large_medical",
    "medicalai/ClinicalBERT",
    "emilyalsentzer/Bio_ClinicalBERT",
]

llm_ctypes = ["community", "kmeans"]

llm_clusters = {method : { x : None for x in llm_ctypes } for method in llm_methods}
llm_outdir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/llm"

for method in llm_methods:
    for ctype in llm_ctypes:
        m_filename = os.path.join(llm_outdir, f"{method}/{ctype}_clusters.pkl")
        with open(m_filename, "rb") as fh:
            llm_clusters[method][ctype] = pickle.load(fh)
            
def get_llm_cluster_index(selectidx, method, ctype):
    clusteridx = np.full([selectidx.shape[0],], -1)
    for i, ccomps in enumerate(llm_clusters[method][ctype]):
        for idx in ccomps:
            clusteridx[idx] = i
    return clusteridx

Code

def get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, umap_string, color_palette = hex_colors_40, alpha_factor = 10):
    
    plot_dict = dict(
        x = embedding[selectidx, 0],
        y = embedding[selectidx, 1],
        trait_type_code = [f"{x}" for x in clusteridx],
        h2_fill_alpha = [min(0.6, alpha_factor * x) for x in trait_df['estimates.final.h2_observed'].fillna(1e-6).tolist()],
        h2_line_alpha = [min(0.8, 1.3 * alpha_factor * x) for x in trait_df['estimates.final.h2_observed'].fillna(1e-6).tolist()],
        fulldesc = [f"{i} | {trait_df.loc[i, 'short_description']} | {trait_df.loc[i, 'estimates.final.h2_observed']:.3f} | {trait_df.loc[i, 'Neff']:.2f}" for i in selectidx],
    )

    color_mapping = CategoricalColorMapper(factors = [f"{x}" for x in np.unique(clusteridx)], palette = color_palette)

    plot_tooltips = [
        ("Desc", "@fulldesc"),
    ]

    ax = bokeh_figure(
        width = 800, height = 800, 
        tooltips = plot_tooltips,
        title = umap_string ,
    )

    ax.circle('x', 'y', size = 10, 
        source = ColumnDataSource(plot_dict), 
        color = dict(field='trait_type_code', transform = color_mapping),
        line_alpha = dict(field='h2_line_alpha'),
        fill_alpha = dict(field='h2_fill_alpha'),
    )
    ax.title.text_font_size = '20pt'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax

Code

selectidx = np.array(trait_df.index)

axlist = list()
for llm_method in llm_methods:
    for llm_ctype in llm_ctypes:
        clusteridx = get_llm_cluster_index(selectidx, llm_method, llm_ctype)
        alpha_factor = 10 if llm_ctype == "kmeans" else 100
        plot_title = f"{llm_method} + {llm_ctype} clustering"
        axlist.append(get_bokeh_plot(X_embedding_cosine_30_01, selectidx, clusteridx, trait_df, plot_title, alpha_factor = alpha_factor))

# put all the plots in a VBox
p = bokeh_column(*axlist)

# show the results
bokeh_show(p)

(a) Visualization of UMAP embeddings of the original data. Each point is a PanUKB phenotype, colored according to the clusters identified by LLM models from the trait descriptions. The opaciy of each point is proportional to the estimated heritability of the trait.

(b)

Figure 1