Interactive UMAP plot from PanUKB data


Saikat Banerjee


March 27, 2024

To understand the disease network, we plot an interactive UMAP embedding and compare it with the neighbors obtained from an LLM.


We have to first load a bunch of useful tools including UMAP and Bokeh.

import os
import numpy as np
import pandas as pd
import pickle
import re

import umap
from bokeh.plotting import figure as bokeh_figure
from bokeh.plotting import show as bokeh_show
from bokeh.layouts import column as bokeh_column
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import CategoricalColorMapper

Load data and results

Here, we explore the low rank model from nuclear norm minimization with the sparse matrix.

# Directories holding the input data and the fitted low-rank model.
data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
result_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/nnsparsh/full/"

# Both tables are read from pickled pandas objects under <data_dir>/modselect.
zscore_df = pd.read_pickle(os.path.join(data_dir, "modselect/zscore_all.pkl"))
trait_df  = pd.read_pickle(os.path.join(data_dir, "modselect/traits_all_with_desc.pkl"))

# Tab-separated variant table.
variant_filename = f"{data_dir}/allvar.pruned.closesttss.hugo"
variant_df       = pd.read_csv(variant_filename, sep = '\t')

# Name of the method whose model file is loaded below.
method = 'nnm_sparse'

res_filename = os.path.join(result_dir, f"{method}_model.pkl")
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(res_filename, "rb") as fh:
    lowrank_model = pickle.load(fh)

# Input matrix: z-score table without the 'rsid' column, transposed
# (presumably one row per trait after the transpose -- confirm).
X = np.array(zscore_df.drop(labels = ['rsid'], axis = 1).values.T)
# Remove the per-column mean (mean taken over axis 0).
X_cent = X - np.mean(X, axis = 0, keepdims = True)
# Matrix stored under the 'X_' key of the loaded low-rank model, and its column-centered version.
lowX = lowrank_model['X_']
lowX_cent = lowX - np.mean(lowX, axis = 0, keepdims = True)
# NOTE(review): the next statement is cut off in this source (the np.sqrt call is never
# closed); the divisor cannot be determined from here -- restore it from the original notebook.
lowX_std = lowX_cent / np.sqrt(

# Print the nuclear norm (ord = 'nuc') of the low-rank matrix, the input and the centered input.
print ("Nuclear Norms")
print (f"Low rank model: {np.linalg.norm(lowX, ord = 'nuc'):.3f}")
print (f"Input data: {np.linalg.norm(X, ord = 'nuc'):.3f}")
print (f"Input data (mean centered): {np.linalg.norm(X_cent, ord = 'nuc'):.3f}")
Nuclear Norms
Low rank model: 8050.750
Input data: 496751.155
Input data (mean centered): 495872.387

Apply UMAP

We apply UMAP on the original data.

def _fit_umap(data, metric, n_neighbors, min_dist = 0.1):
    """Create a UMAP reducer with the given settings, fit it on `data`,
    and return the pair (reducer, embedding)."""
    reducer = umap.UMAP(n_neighbors = n_neighbors, metric = metric, min_dist = min_dist)
    return reducer, reducer.fit_transform(data)


# Three UMAP settings, each fitted on the input matrix X (min_dist = 0.1 throughout).
X_reducer_cosine_30_01, X_embedding_cosine_30_01 = _fit_umap(X, 'cosine', 30)

X_reducer_euclidean_15_01, X_embedding_euclidean_15_01 = _fit_umap(X, 'euclidean', 15)

X_reducer_cosine_15_01, X_embedding_cosine_15_01 = _fit_umap(X, 'cosine', 15)

Interactive Plots

Each point is a PanUKB phenotype, colored according to the clusters identified by LLM models from the trait descriptions.

# Base color palette (hex codes).
# NOTE(review): this list literal is never closed in the visible source; any further
# entries and the closing bracket are missing here.
hex_colors = [
    '#2D69C4', # blue
    '#CC2529', # red
    '#93AA00', # Vivid Yellowish Green
    '#535154', # gray
    '#6B4C9A', # purple
    '#FFB300', # Vivid Yellow
    '#922428', # dark brown
    '#948B3D', # olive

# Palette used as the default `color_palette` of get_bokeh_plot.
# NOTE(review): contents are missing in the visible source.
hex_colors_40 = [

# Distinct values of the 'trait_type' column, and a trait -> color mapping built with zip().
# NOTE(review): the zip() arguments are cut off in the visible source.
trait_type_unique = trait_df['trait_type'].unique().tolist()
trait_type_dict = {
    trait: color for trait, color in zip(

# Names of the LLM methods; each is used as a sub-directory name when the cluster files are loaded.
# NOTE(review): contents are missing in the visible source.
llm_methods = [

# Clustering types available for every LLM method.
llm_ctypes = ["community", "kmeans"]

llm_outdir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/llm"

# llm_clusters[llm_method][llm_ctype] holds the object unpickled from
# <llm_outdir>/<llm_method>/<llm_ctype>_clusters.pkl; entries start as None.
llm_clusters = {llm_method : dict.fromkeys(llm_ctypes) for llm_method in llm_methods}

# The loop variables are named llm_method / llm_ctype so that the module-level
# `method` (the low-rank model name set earlier) is not overwritten by this loop.
for llm_method in llm_methods:
    for llm_ctype in llm_ctypes:
        m_filename = os.path.join(llm_outdir, f"{llm_method}/{llm_ctype}_clusters.pkl")
        # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
        with open(m_filename, "rb") as fh:
            llm_clusters[llm_method][llm_ctype] = pickle.load(fh)
def get_llm_cluster_index(selectidx, method, ctype):
    """Return one cluster label per entry of `selectidx`.

    The clusters are read from the module-level `llm_clusters[method][ctype]`,
    an iterable of index collections. Position `idx` of the result is set to
    the number of the cluster that lists `idx`; if several clusters list the
    same index, the last one wins. Positions listed by no cluster keep -1.

    Only the length of `selectidx` (`selectidx.shape[0]`) is used.
    """
    n_points = selectidx.shape[0]
    labels = np.full([n_points,], -1)
    groups = llm_clusters[method][ctype]
    for label, members in enumerate(groups):
        for member in members:
            labels[member] = label
    return labels
# Build one Bokeh figure of a 2-D embedding and return it.
#
# Arguments, as used in the visible body:
#   embedding     : array read as embedding[selectidx, 0] (x) and embedding[selectidx, 1] (y).
#   selectidx     : indices of the points to plot; also used as row labels in trait_df.loc[i, ...].
#   clusteridx    : one cluster label per point; converted to strings and used as the color category.
#   trait_df      : table providing 'short_description', 'Neff' and one more column whose name
#                   is blank in this source (presumably the heritability estimate, going by the
#                   figure caption -- confirm).
#   umap_string   : text used as the figure title.
#   color_palette : palette handed to CategoricalColorMapper.
#   alpha_factor  : multiplier applied to the blank-named column to obtain the glyph opacities.
#
# NOTE(review): this function is garbled in the visible source -- the dict(...) and tooltip list
# are never closed, the column name in trait_df[''] is empty, and the glyph call that should
# receive the 'x', 'y', size = 10, ... arguments is missing. Restore it from the original notebook.
def get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, umap_string, color_palette = hex_colors_40, alpha_factor = 10):
    # Per-point columns for the plot's data source.
    plot_dict = dict(
        x = embedding[selectidx, 0],
        y = embedding[selectidx, 1],
        trait_type_code = [f"{x}" for x in clusteridx],
        # Opacities: missing values are replaced by 1e-6, scaled by alpha_factor, and capped
        # at 0.6 (fill) and 0.8 (line).
        # NOTE(review): these lists run over every row of trait_df, while x / y use selectidx;
        # the lengths only agree when selectidx covers the whole table -- confirm.
        h2_fill_alpha = [min(0.6, alpha_factor * x) for x in trait_df[''].fillna(1e-6).tolist()],
        h2_line_alpha = [min(0.8, 1.3 * alpha_factor * x) for x in trait_df[''].fillna(1e-6).tolist()],
        # Hover text: index | short description | blank-named column (3 decimals) | Neff (2 decimals).
        fulldesc = [f"{i} | {trait_df.loc[i, 'short_description']} | {trait_df.loc[i, '']:.3f} | {trait_df.loc[i, 'Neff']:.2f}" for i in selectidx],

    # Map each distinct cluster label (as a string) to a color from the palette.
    color_mapping = CategoricalColorMapper(factors = [f"{x}" for x in np.unique(clusteridx)], palette = color_palette)

    # Tooltip shows the 'fulldesc' column of the data source.
    plot_tooltips = [
        ("Desc", "@fulldesc"),

    # 800 x 800 figure titled with umap_string.
    ax = bokeh_figure(
        width = 800, height = 800, 
        tooltips = plot_tooltips,
        title = umap_string ,
    )'x', 'y', size = 10, 
        source = ColumnDataSource(plot_dict), 
        color = dict(field='trait_type_code', transform = color_mapping),
        line_alpha = dict(field='h2_line_alpha'),
        fill_alpha = dict(field='h2_fill_alpha'),
    # Cosmetics: larger fonts, thicker axis and tick lines, no grid.
    ax.title.text_font_size = '20pt'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax
# Select every row of trait_df for plotting.
selectidx = np.array(trait_df.index)

# One figure per (LLM method, clustering type) pair, all drawn on the
# cosine / 30-neighbor embedding of the original data.
axlist = []
for llm_method in llm_methods:
    for llm_ctype in llm_ctypes:
        clusteridx = get_llm_cluster_index(selectidx, llm_method, llm_ctype)
        # kmeans panels use an opacity multiplier of 10, every other clustering type uses 100
        if llm_ctype == "kmeans":
            alpha_factor = 10
        else:
            alpha_factor = 100
        plot_title = f"{llm_method} + {llm_ctype} clustering"
        panel = get_bokeh_plot(
            X_embedding_cosine_30_01, selectidx, clusteridx, trait_df, plot_title,
            alpha_factor = alpha_factor,
        )
        axlist.append(panel)

# arrange all the figures in a single column layout
p = bokeh_column(*axlist)

# show the results
(a) Visualization of UMAP embeddings of the original data. Each point is a PanUKB phenotype, colored according to the clusters identified by LLM models from the trait descriptions. The opacity of each point is proportional to the estimated heritability of the trait.
Figure 1