ColormaNN manuscript figures - PanUKB

Author

Saikat Banerjee

Published

December 12, 2024

Abstract
High-quality plots used for the PanUKB results.
Code
import os
import numpy as np
import pandas as pd
import pickle
import re

import matplotlib
import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils

mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120)

import umap
from sklearn.neighbors import kneighbors_graph
from sklearn.manifold import SpectralEmbedding, TSNE, LocallyLinearEmbedding, Isomap, MDS
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

from sentence_transformers import util as st_util
Code
import matplotlib.font_manager as mpl_fm

# Register the Boehringer typeface with matplotlib so that it can be
# selected by its family name once the style sheet has been applied.
font_path = '/gpfs/commons/home/sbanerjee/nygc/Boehringer_Forward_Latin_Cycrillic_Greek/Text'
mpl_fm.fontManager.addfont(f'{font_path}/BoehringerForwardText.ttf') # Loads "Boehringer Forward Text"

# NYGC Color Palette
nygc_colors = dict(
    brown = '#7F0814',
    darkred = '#d42e12',
    orange = '#F37239',
    darkyellow = '#F79320',
    yellow = '#FFE438',
    darkblue = '#003059',
    blue = '#266DB6',
    lightblue = '#A3D5ED',
    darkgreen = '#006838',
    green = '#0A8A42',
    lightgreen = '#74B74A',
    yellowgreen = '#BAD75F',
    darkgray = '#1A1A1A',
    gray = '#666666',
    lightgray = '#CCCCCC',
    khaki = '#ADA194',
    darkkhaki = '#5E514D',
)

# Boehringer Color Palette
boehringer_colors = dict(
    darkgreen = '#08312A',
    accentgreen = '#00E47C',
    warmgray = '#E5E3DE',
    lightgray = '#F6F5F3',
    lightyellow = '#fbf9aa',
    yellow = '#ffe667',
    mediumyellow = '#ffd03d',
    mediumred = '#ee6541',
    mediumblue = '#6ad2e2',
    lightviolet = '#e0e1f6',
    violet = '#c5c3ee',
    mediumviolet = '#928bde',
)

# # Style sheet for manuscript
# mpl_stylesheet.banskt_presentation(dpi = 300, fontsize = 28, 
#     splinecolor = nygc_colors['darkgray'], black = nygc_colors['darkgray'])

# Style sheet for Boehringer: spines and "black" both use the dark green
# of the Boehringer palette, and text is rendered in the registered font.
mpl_stylesheet.banskt_presentation(
    dpi = 300,
    fontsize = 22,
    splinecolor = boehringer_colors['darkgreen'],
    black = boehringer_colors['darkgreen'],
)
plt.rcParams['font.family'] = 'Boehringer Forward Text'
Code
# Locations of the PanUKB inputs and of the ColormaNN-SVD results.
data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
result_dir = "/gpfs/commons/home/sbanerjee/npddata/panukb/results/colormann-svd"

# Z-score table and trait annotations. Prescription traits are removed from
# the annotations (presumably to match the "noRx" z-score table -- confirm).
zscore_df = pd.read_pickle(os.path.join(data_dir, "modselect/zscore_noRx.pkl"))
trait_df  = pd.read_pickle(os.path.join(data_dir, "modselect/traits_all_with_desc.pkl"))
trait_df  = trait_df.query('trait_type != "prescriptions"')

method = 'nnm-sparse'
method_resdir = os.path.join(result_dir, method, "noRx")

# NOTE: pickle.load can execute arbitrary code stored in the file.
# Only load result files that were produced by trusted pipelines.
with open(os.path.join(method_resdir, "mf_comps_k200.pkl"), "rb") as fh:
    (loadings, factors, cos2_pheno, cos2_variant,
     contribution_pheno, contribution_variant) = pickle.load(fh)
with open(os.path.join(method_resdir, "pca_comps.pkl"), "rb") as fh:
    U, S, V = pickle.load(fh)
with open(os.path.join(method_resdir, "sparseX.pkl"), "rb") as fh:
    sparseM = pickle.load(fh)


# Input matrix: transpose of the z-score table, plus a copy with the
# mean of every column (axis 0) removed.
X = np.array(zscore_df.values.T)
X_cent = X - np.mean(X, axis = 0, keepdims = True)

# Fitted low-rank model; the recovered matrix is stored under the key 'X_'.
res_filename = os.path.join(method_resdir, "nnm_sparse_model_r155872_iter1000.pkl")
with open(res_filename, "rb") as fh:
    lowrank_model = pickle.load(fh)
lowX = lowrank_model['X_']
lowX_cent = lowX - np.mean(lowX, axis = 0, keepdims = True)
# Column-centred matrix divided by sqrt(number of entries).
lowX_std = lowX_cent / np.sqrt(np.prod(lowX_cent.shape))

Properties of the low rank matrix

Code
# Nuclear norm of the low-rank estimate and of the input matrix,
# each reported before and after removing the column means.
print ("Nuclear Norms")
for label, matrix in (
    ("Low rank model", lowX),
    ("Low rank model (mean centered)", lowX_cent),
    ("Input data", X),
    ("Input data (mean centered)", X_cent),
):
    print (f"{label}: {np.linalg.norm(matrix, ord = 'nuc'):.3f}")
Nuclear Norms
Low rank model: 106525.779
Low rank model (mean centered): 106525.766
Input data: 431988.255
Input data (mean centered): 431234.720
Code
# Frobenius norm of the low-rank estimate and of the input matrix,
# each reported before and after removing the column means.
print ("Frobenius Norms")
for label, matrix in (
    ("Low rank model", lowX),
    ("Low rank model (mean centered)", lowX_cent),
    ("Input data", X),
    ("Input data (mean centered)", X_cent),
):
    print (f"{label}: {np.linalg.norm(matrix, ord = 'fro'):.3f}")
Frobenius Norms
Low rank model: 7166.610
Low rank model (mean centered): 7166.610
Input data: 12539.901
Input data (mean centered): 12462.378
Code
# def compute_r2(X, Y):
#     """
#     Model: Y = X @ B
#     """
#     # 1. Solve for B using least squares (Moore-Penrose pseudoinverse via np.linalg.lstsq)
#     B, residuals, rank, s = np.linalg.lstsq(X, Y, rcond=None)  # B will be 51368 x 51368

#     # 2. Compute Sum of Squared Errors (SSE) and Total Sum of Squares (SST)
#     # If residuals is empty (underdetermined case), compute SSE manually:
#     if residuals.size == 0:
#         # Calculate SSE by summing (Y - Y_pred)^2 over all entries
#         Y_pred = X.dot(B)
#         ss_res = np.sum((Y - Y_pred)**2)
#     else:
#         # residuals array contains SSE for each output column; sum them up
#         ss_res = residuals.sum()

#     # Calculate SST by summing (Y - Y_mean)^2 over all entries
#     Y_mean = Y.mean(axis=0)               # mean of each column (should be ~0 if centered)
#     ss_tot = np.sum((Y - Y_mean)**2)      # total variance across all outputs

#     # 3. Compute R^2 as 1 - SSE/SST
#     R2 = 1 - (ss_res / ss_tot)

#     return R2

# compute_r2(X_cent, lowX_cent)
Code
# Show which fields the pickled low-rank model dictionary provides.
lowrank_model.keys()
dict_keys(['max_iter_', 'model_', 'svd_method_', 'svd_max_iter_', 'simplex_method_', 'stop_criteria_', 'tol_', 'step_size_tol_', 'fxrel_tol_', 'show_progress_', 'prog_step_skip_', 'suppress_warnings_', 'is_benchmark_', 'benchmark_method_', 'Y_', 'mask_', 'weight_', 'weight_mask_', 'rank_', 'l1_thres_', 'fm_list_', 'fl_list_', 'fx_list_', 'dg_list_', 'st_list_', 'cpu_time_', 'convergence_msg_', 'X_', 'M_'])
Code
# Show the value stored under 'l1_thres_' in the fitted model.
lowrank_model['l1_thres_']
1770.3268026594692
Code
# Show the value stored under 'rank_' in the fitted model.
# NOTE(review): this matches the `r155872` tag in the model file name and is
# presumably a norm constraint rather than a matrix rank -- confirm.
lowrank_model['rank_']
155872.0
Code
# Numerical rank of the low-rank estimate and of the input matrix
# (the latter with and without column mean-centering).
print ("Matrix ranks")
for label, matrix in (
    ("Low rank model", lowX),
    ("Input data", X),
    ("Input data (mean centered)", X_cent),
):
    print (f"{label}: {np.linalg.matrix_rank(matrix):.3f}")
Matrix ranks
Low rank model: 560.000
Input data: 2110.000
Input data (mean centered): 2109.000
Code
# Sparse graph holding, for every row of `loadings`, the cosine
# distances to its 100 nearest neighbours.
dist_matrix = kneighbors_graph(
    loadings,
    n_neighbors = 100,
    mode = 'distance',
    metric = 'cosine',
)

# Two-dimensional t-SNE computed from those precomputed distances.
# The random seed is fixed so that the layout is reproducible.
tsne_embedding = TSNE(
    n_components = 2,
    init = "random",
    perplexity = 20,
    early_exaggeration = 12,
    learning_rate = 'auto',
    random_state = 42,
    metric = 'precomputed',
)
loadings_2d = tsne_embedding.fit_transform(dist_matrix)
Code
# Palette of 40 hex colours. The plotting cells index it with the
# cluster label of each trait, so entry k is the colour of cluster k.
hex_colors_40 = [
    "#084609", 
    "#ff4ff4",
    "#01d94a",
    "#b700ce",
    "#91c900",
    "#5f42ed",
    "#5fa200",
    "#8d6dff",
    "#c9f06b",
    "#0132a7",
    "#ffbb1f",
    "#0080ed",
    "#f56600",
    "#3afaf5",
    "#c10001",
    "#01e698",
    "#a20096",
    "#00e2c1",
    "#ff5ac8",
    "#008143",
    "#cd0057",
    "#4aeeff",
    "#8c001a",
    "#b5f2a2",
    "#5d177d",
    "#a99900",
    "#e299ff",
    "#5b6b00",
    "#96aeff",
    "#a46f00",
    "#007acb",
    "#ff9757",
    "#00a8e0",
    "#ff708e",
    "#baefc7",
    "#622b25",
    "#c8c797",
    "#885162",
    "#ffb7a5",
    "#ffa3c3"]

# Sentence-embedding models whose trait clusterings are available on disk.
llm_methods = [
    "ls-da3m0ns/bge_large_medical",
    "medicalai/ClinicalBERT",
    "emilyalsentzer/Bio_ClinicalBERT",
]

# Clustering algorithms that were run on top of each embedding model.
llm_ctypes = [
    "community", 
    "kmeans", 
    "agglomerative"]

llm_outdir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/llm"

# llm_clusters[model][clustering] -> unpickled cluster object.
# The loop variable is deliberately NOT called `method`: a module-level
# `for method in ...` would overwrite the matrix-factorization `method`
# ('nnm-sparse') that is defined when the results are loaded.
# NOTE: pickle.load can execute arbitrary code; load trusted files only.
llm_clusters = {llm_name : { x : None for x in llm_ctypes } for llm_name in llm_methods}
for llm_name in llm_methods:
    for ctype in llm_ctypes:
        m_filename = os.path.join(llm_outdir, f"{llm_name}/{ctype}_clusters.pkl")
        with open(m_filename, "rb") as fh:
            llm_clusters[llm_name][ctype] = pickle.load(fh)
            
def get_llm_cluster_labels(trait_idx, llm_cluster):
    """
    Assign to every trait the index of the cluster that contains it.

    Parameters
    ----------
    trait_idx : array-like, 1-D
        Identifiers of the traits to label.
    llm_cluster : iterable of iterables
        One collection of trait identifiers per cluster; the position of
        a collection in `llm_cluster` is its cluster index.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per element of `trait_idx`: the index
        of the cluster containing that trait, or -1 if no cluster does.
        If an identifier occurs in several clusters, the last one wins.
    """
    trait_idx = np.asarray(trait_idx)
    clusteridx = np.full([trait_idx.shape[0],], -1)
    for i, ccomps in enumerate(llm_cluster):
        members = list(ccomps)
        if not members:
            continue
        # One vectorized membership test per cluster instead of two
        # linear scans of `trait_idx` for every cluster member.
        clusteridx[np.isin(trait_idx, members)] = i
    return clusteridx
Code
# Clustering that drives the colours and grouping in the figures below.
llm_method = "ls-da3m0ns/bge_large_medical"
llm_ctype = "agglomerative"
data_labels = get_llm_cluster_labels(
    trait_df.index.to_numpy(),
    llm_clusters[llm_method][llm_ctype],
)

# Display names of the clusters, keyed 1..30 in cluster order.
# Cluster labels in `data_labels` are 0-based, so label k uses key k + 1.
llm_categories = dict(enumerate([
    "Sleep, speech, lifestyle and environment",
    "Cardiovascular system and hematologic disorders",
    "Reproductive health and related disorders",
    "Diabetes and lipid metabolism",
    "Body composition, BMI, obesity and nutrition",
    "Gastrointestinal disorders and related conditions",
    "Respiratory disorders and infectious diseases",
    "Neurological and musculoskeletal conditions",
    "Medical examinations, follow-ups, and family history",
    "Medications and allergies",
    "Vision and refractive metrics",
    "Ear, nose, throat, and dental health",
    "Toxicology and supplemental medications",
    "Dietary intake",
    "Respiratory disorders and associated medications",
    "Hematological metrics",
    "Urinary system disorders",
    "Alcohol consumption and related behavior",
    "Demographics and life events",
    "Ocular health",
    "Self-reported health issues",
    "Skin disorders and infections",
    "Tobacco use and exposure",
    "Blood pressure metrics",
    "Carotid intima-media thickness (IMT)",
    "Neoplasms and cancers",
    "Metabolic and nutritional disorders",
    "Thyroid-related conditions",
    "Injuries and trauma",
    "Mental health and emotional well-being",
], start = 1))
Code
# Number of members in every cluster of the selected clustering, in cluster order.
llm_categories_count = list(map(len, llm_clusters[llm_method][llm_ctype]))
Code
# Figure: 2-D t-SNE embedding of the traits (left panel) next to a legend
# listing the text-based categories and their sizes (right panel).
from matplotlib import colors as mpl_colors
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

# Two side-by-side panels; the legend panel is half as wide as the scatter.
fig = plt.figure(figsize = (22, 9))
gs = fig.add_gridspec(1, 2, width_ratios=(1, 0.5), wspace=0, hspace=0)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax1.set_aspect(1.0)


# Marker opacity grows with the observed heritability of the trait.
# Missing values and values below 1e-6 are set to 1e-6; the fill alpha is
# capped at 0.6 and the outline alpha at 1.0.
alpha_factor = 30
h2_list = [max(1e-6, x) for x in trait_df['estimates.final.h2_observed'].fillna(1e-6).tolist()]
fill_alpha_list = [min(0.6, alpha_factor * x) for x in h2_list]
line_alpha_list = [min(1.0, 1.8 * alpha_factor * x) for x in h2_list]
# NOTE(review): a trait that belongs to no cluster has label -1 and would
# silently pick hex_colors_40[-1], the last palette colour -- confirm that
# every trait is assigned.
face_colors = [mpl_colors.to_rgba(hex_colors_40[x], fill_alpha_list[i]) 
               for i, x in enumerate(data_labels)]
edge_colors = [mpl_colors.to_rgba(hex_colors_40[x], line_alpha_list[i]) 
               for i, x in enumerate(data_labels)]

ax1.scatter(loadings_2d[:,0], loadings_2d[:,1], color = face_colors,
            s = 100,
            edgecolors = edge_colors)

# One legend entry per category. llm_categories is keyed from 1, hence the
# `i-1` when indexing the palette and the per-cluster counts.
handles = [mlines.Line2D([], [], color = hex_colors_40[i-1], marker='o',
                          markerfacecolor = hex_colors_40[i-1], 
                          markersize = 15, label = f'{cname} ({llm_categories_count[i-1]})')
           for i, cname in llm_categories.items()]
# NOTE(review): the number in the title is hard-coded; keep it in sync with
# len(llm_categories).
legend = ax2.legend(handles = handles, ncol = 2, loc = 'center left',
           labelspacing = 1, title = "30 text-based disease categories",
           prop={'size': 12})
# Private matplotlib attribute; presumably widens the gap between the legend
# title and its entries -- may break across matplotlib versions.
legend._legend_box.sep = 30

# Remove ticks, tick labels and spines from both panels.
for ax in [ax1, ax2]:
    ax.tick_params(bottom = False, top = False, left = False, right = False,
                   labelbottom = False, labeltop = False, labelleft = False, labelright = False)
for side, border in list(ax1.spines.items()) + list(ax2.spines.items()):
    border.set_visible(False)
    
gs.tight_layout(fig)
# plt.savefig('../plots/colormann-manuscript/panukb_nnmsparse_embedding.pdf', bbox_inches='tight')
plt.savefig('../plots/bi-2026/panukb_nnmsparse_embedding.png', transparent = True, bbox_inches='tight')
plt.show()

Select top 200 traits using heritability and look at their loadings. After selecting, sort them according to their data_labels.

Code
# h2_array = np.array(h2_list)
# top_phenotype_idx = np.argsort(h2_array)[::-1][:200]
# top_phenotype_idx_sorted = top_phenotype_idx[np.argsort(data_labels[top_phenotype_idx])]
# top_loadings = loadings[top_phenotype_idx_sorted, :]

Select the top 10 traits (ranked by observed heritability) in each category, then remove traits whose heritability is not above the cutoff of 0.01.

Code
# For every category keep the most heritable traits, then discard the
# traits whose heritability does not exceed the cutoff.
n_top_per_category = 10    # number of traits kept per category
h2_cutoff = 0.01           # minimum observed heritability (exclusive)

h2_array = np.array(h2_list)
select_phenotype_idx = list()
for i in llm_categories:
    # llm_categories is keyed from 1 while the cluster labels start at 0.
    pidx = np.flatnonzero(data_labels == i - 1)
    # Order the members by decreasing heritability and keep the leading ones.
    pidx_sorted_clipped = pidx[np.argsort(h2_array[pidx])[::-1]][:n_top_per_category]
    select_phenotype_idx.extend(pidx_sorted_clipped.tolist())

# Explicit integer dtype: an empty selection would otherwise become a
# float array, which cannot be used as an index.
select_phenotype_idx = np.array(select_phenotype_idx, dtype = int)
top_phenotype_idx = select_phenotype_idx[h2_array[select_phenotype_idx] > h2_cutoff]
# top_phenotype_idx_sorted = top_phenotype_idx[np.argsort(data_labels[top_phenotype_idx])]
top_loadings = loadings[top_phenotype_idx, :]
Code
# Short descriptions of the selected traits, looked up by position.
# The column is converted to a list once rather than once per selected trait.
short_descriptions = trait_df['short_description'].to_list()
top_phenotype_names = [short_descriptions[i] for i in top_phenotype_idx]
Code
# Figure: heat map of the loadings of the selected traits (rows) on the
# leading factors (columns), with one tick label per trait coloured by
# its category.
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.preprocessing import normalize as sk_normalize
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 300)

# NOTE(review): this slice keeps 101 columns (indices 0..100), whereas the
# category-level heat map further down uses `[:, :100]` -- confirm which
# one is intended.
plt_data = top_loadings[:,:101]
# Rescale every column (axis = 0) to unit norm so that factors are comparable.
plt_data_norm = sk_normalize(plt_data, axis = 0)
# plt_data_labels = data_labels[top_phenotype_idx]
plt_cluster_idx = data_labels[top_phenotype_idx]

fig = plt.figure(figsize = (6, 8))
ax1 = fig.add_subplot(111)
# Reserve a narrow axis to the right of the heat map for the colour bar.
divider = make_axes_locatable(ax1)
cax = divider.append_axes('right', size='5%', pad=0.05)

im = ax1.imshow(plt_data_norm, cmap='bwr')
fig.colorbar(im, cax = cax, orientation='vertical', label = 'Loadings')


# Tick positions are zero-based column indices.
ax1.set_xticks([0,25,50,75,100])
ax1.set_xlabel("Factors")

# One (very small) label per trait, drawn in the colour of its category.
ax1.set_yticks(np.arange(top_phenotype_idx.shape[0]))
ax1.set_yticklabels(top_phenotype_names, size = 1.5)
ax1.tick_params(left=False)
ax1.yaxis.set_tick_params(pad = 0)
for i, ytick in enumerate(ax1.get_yticklabels()):
    plt.setp(ytick, color = hex_colors_40[plt_cluster_idx[i]])

# plt.savefig('../plots/colormann-manuscript/panukb_nnmsparse_loadings.pdf', bbox_inches='tight')
# Show the plot
plt.show()

Code
# List every selected trait next to the name of its category.
# Cluster labels are 0-based while llm_categories is keyed from 1.
# Iterating with zip avoids an unused loop variable named `pidx`, which
# would overwrite the index array of the same name from the selection cell.
for cluster_label, trait_name in zip(plt_cluster_idx, top_phenotype_names):
    print (f"{llm_categories[cluster_label + 1]}\t|\t{trait_name}")
Sleep, speech, lifestyle and environment    |   FEV1 (predict)
Sleep, speech, lifestyle and environment    |   FVC (best)
Sleep, speech, lifestyle and environment    |   Fluid intelligence score
Sleep, speech, lifestyle and environment    |   FEV1/FVC ratio
Sleep, speech, lifestyle and environment    |   FEV1 (best)
Sleep, speech, lifestyle and environment    |   FEV1, predicted percentage
Sleep, speech, lifestyle and environment    |   FVC
Sleep, speech, lifestyle and environment    |   FEV1
Sleep, speech, lifestyle and environment    |   P duration
Sleep, speech, lifestyle and environment    |   QRS duration
Cardiovascular system and hematologic disorders |   Age DVT diagnosed
Cardiovascular system and hematologic disorders |   Pulse rate (during BP measurement)
Cardiovascular system and hematologic disorders |   Pulse rate, automated reading
Cardiovascular system and hematologic disorders |   Pulse rate
Cardiovascular system and hematologic disorders |   Ventricular rate
Cardiovascular system and hematologic disorders |   Hypertension
Cardiovascular system and hematologic disorders |   Essential hypertension
Cardiovascular system and hematologic disorders |   ECG, load
Cardiovascular system and hematologic disorders |   ECG, heart rate
Cardiovascular system and hematologic disorders |   ECG, number of stages in a phase
Reproductive health and related disorders   |   Number of pregnancy terminations
Reproductive health and related disorders   |   Number of spontaneous miscarriages
Reproductive health and related disorders   |   O72 Postpartum haemorrhage
Reproductive health and related disorders   |   Hyperplasia of prostate
Reproductive health and related disorders   |   Cancer of prostate
Reproductive health and related disorders   |   Genital prolapse
Reproductive health and related disorders   |   Prolapse of vaginal walls
Reproductive health and related disorders   |   Number of stillbirths
Reproductive health and related disorders   |   Irregular menstrual cycle/bleeding
Reproductive health and related disorders   |   Excessive or frequent menstruation
Diabetes and lipid metabolism   |   IGF-1
Diabetes and lipid metabolism   |   HDL cholesterol
Diabetes and lipid metabolism   |   Glycated haemoglobin (HbA1c)
Diabetes and lipid metabolism   |   Apolipoprotein A
Diabetes and lipid metabolism   |   Triglycerides
Diabetes and lipid metabolism   |   LDL direct, adjusted by medication
Diabetes and lipid metabolism   |   Cholesterol
Diabetes and lipid metabolism   |   Apolipoprotein B
Diabetes and lipid metabolism   |   LDL direct
Diabetes and lipid metabolism   |   Glucose
Body composition, BMI, obesity and nutrition    |   Standing height
Body composition, BMI, obesity and nutrition    |   Sitting height
Body composition, BMI, obesity and nutrition    |   Whole body fat-free mass
Body composition, BMI, obesity and nutrition    |   Whole body water mass
Body composition, BMI, obesity and nutrition    |   Trunk predicted mass
Body composition, BMI, obesity and nutrition    |   Trunk fat-free mass
Body composition, BMI, obesity and nutrition    |   Ankle spacing width
Body composition, BMI, obesity and nutrition    |   Leg fat-free mass (left)
Body composition, BMI, obesity and nutrition    |   Leg predicted mass (right)
Body composition, BMI, obesity and nutrition    |   Leg fat-free mass (right)
Gastrointestinal disorders and related conditions   |   Total bilirubin
Gastrointestinal disorders and related conditions   |   Indirect bilirubin
Gastrointestinal disorders and related conditions   |   Direct bilirubin
Gastrointestinal disorders and related conditions   |   Diverticulosis and diverticulitis
Gastrointestinal disorders and related conditions   |   Diverticulosis
Gastrointestinal disorders and related conditions   |   K57 Diverticular disease of intestine
Gastrointestinal disorders and related conditions   |   Intestinal malabsorption (non-celiac)
Gastrointestinal disorders and related conditions   |   Malabsorption, coeliac disease self-reported
Gastrointestinal disorders and related conditions   |   Diaphragmatic hernia
Gastrointestinal disorders and related conditions   |   Celiac disease
Respiratory disorders and infectious diseases   |   Basal metabolic rate
Respiratory disorders and infectious diseases   |   Age emphysema diagnosed
Respiratory disorders and infectious diseases   |   Pheno 48 / pheno 49
Respiratory disorders and infectious diseases   |   Peak expiratory flow (PEF)
Respiratory disorders and infectious diseases   |   Ever taken cannabis
Respiratory disorders and infectious diseases   |   Hot drink temperature
Respiratory disorders and infectious diseases   |   Maximum frequency of taking cannabis
Respiratory disorders and infectious diseases   |   COVID-19 positive vs negative
Respiratory disorders and infectious diseases   |   Age when last took cannabis
Respiratory disorders and infectious diseases   |   Spells in hospital
Neurological and musculoskeletal conditions |   Multi-site chronic pain
Neurological and musculoskeletal conditions |   Other arthropathies
Neurological and musculoskeletal conditions |   Rheumatoid factor
Neurological and musculoskeletal conditions |   M17 Gonarthrosis [arthrosis of knee]
Neurological and musculoskeletal conditions |   Unspecified monoarthritis
Neurological and musculoskeletal conditions |   Osteoarthrosis
Neurological and musculoskeletal conditions |   Leg pain on walking : action taken
Neurological and musculoskeletal conditions |   Osteoarthritis; localized
Neurological and musculoskeletal conditions |   Dupuytren's disease
Neurological and musculoskeletal conditions |   Other peripheral nerve disorders
Medical examinations, follow-ups, and family history    |   Z86 Personal history of certain diseases
Medical examinations, follow-ups, and family history    |   Z92 Personal history of medical treatment
Medical examinations, follow-ups, and family history    |   Z96 Other functional implants
Medical examinations, follow-ups, and family history    |   Z88 Personal history of allergy to medication
Medical examinations, follow-ups, and family history    |   Z72 Problems related to lifestyle
Medical examinations, follow-ups, and family history    |   Z95 Cardiac and vascular prosthesis
Medical examinations, follow-ups, and family history    |   Z82 Family history of chronic diseases
Medical examinations, follow-ups, and family history    |   Personal history of digestive system diseases
Medications and allergies   |   SHBG
Medications and allergies   |   Bendroflumethiazide
Medications and allergies   |   Amlodipine
Medications and allergies   |   Omeprazole
Medications and allergies   |   Ramipril
Medications and allergies   |   Atenolol
Medications and allergies   |   Lansoprazole
Medications and allergies   |   Seretide 50 evohaler
Medications and allergies   |   Lisinopril
Medications and allergies   |   Gliclazide
Vision and refractive metrics   |   6mm strong meridian (left)
Vision and refractive metrics   |   6mm strong meridian (right)
Vision and refractive metrics   |   6mm weak meridian (left)
Vision and refractive metrics   |   6mm weak meridian (right)
Vision and refractive metrics   |   3mm strong meridian (right)
Vision and refractive metrics   |   3mm weak meridian (right)
Vision and refractive metrics   |   3mm weak meridian (left)
Vision and refractive metrics   |   3mm strong meridian (left)
Vision and refractive metrics   |   Spherical power (left)
Vision and refractive metrics   |   Spherical power (right)
Ear, nose, throat, and dental health    |   Age of hayfever diagnosis
Ear, nose, throat, and dental health    |   Age hay fever or eczema diagnosed
Ear, nose, throat, and dental health    |   Nasal polyps
Ear, nose, throat, and dental health    |   J33 Nasal polyp
Toxicology and supplemental medications |   Magnesium
Toxicology and supplemental medications |   Paracetamol
Toxicology and supplemental medications |   Englyst dietary fibre
Toxicology and supplemental medications |   Aspirin
Toxicology and supplemental medications |   Ibuprofen
Toxicology and supplemental medications |   Co-codamol
Toxicology and supplemental medications |   Poisoning by antibiotics
Toxicology and supplemental medications |   Quinine
Toxicology and supplemental medications |   Allergy/adverse effect of penicillin
Dietary intake  |   Salt added to food
Dietary intake  |   Oily fish intake
Dietary intake  |   Dried fruit intake
Dietary intake  |   Water intake
Dietary intake  |   Yogurt intake
Dietary intake  |   Intake of sugar added to tea
Dietary intake  |   Fresh fruit intake
Dietary intake  |   Cheese intake
Dietary intake  |   Cereal intake
Dietary intake  |   Tea intake
Respiratory disorders and associated medications    |   Age asthma diagnosed by doctor
Respiratory disorders and associated medications    |   Age asthma diagnosed
Respiratory disorders and associated medications    |   Asthma
Respiratory disorders and associated medications    |   J45 Asthma
Respiratory disorders and associated medications    |   Ventolin 100micrograms inhaler
Respiratory disorders and associated medications    |   Salbutamol
Hematological metrics   |   Platelet count
Hematological metrics   |   Mean platelet (thrombocyte) volume
Hematological metrics   |   Platelet crit
Hematological metrics   |   High light scatter reticulocyte count
Hematological metrics   |   High light scatter reticulocytes percent
Hematological metrics   |   Lymphocyte count
Hematological metrics   |   Red blood cell (erythrocyte) count
Hematological metrics   |   Red blood cell distribution width
Hematological metrics   |   White blood cell (leukocyte) count
Hematological metrics   |   Mean corpuscular haemoglobin
Urinary system disorders    |   Sodium in urine
Urinary system disorders    |   Microalbumin in urine
Urinary system disorders    |   Stress incontinence, female
Urinary system disorders    |   Urinary calculus
Urinary system disorders    |   N20 Calculus of kidney and ureter
Urinary system disorders    |   Hematuria
Urinary system disorders    |   Calculus of kidney
Urinary system disorders    |   N39 Other disorders of urinary system
Alcohol consumption and related behavior    |   Average monthly spirits intake
Alcohol consumption and related behavior    |   Alcohol intake frequency.
Alcohol consumption and related behavior    |   Frequency of drinking alcohol
Alcohol consumption and related behavior    |   Average monthly red wine intake
Alcohol consumption and related behavior    |   Average monthly fortified wine intake
Alcohol consumption and related behavior    |   Frequ of guilt/remorse, alcohol
Alcohol consumption and related behavior    |   Average weekly red wine intake
Alcohol consumption and related behavior    |   Alcohol intake per drinking day
Alcohol consumption and related behavior    |   Freq of consuming 6+ units alcohol
Alcohol consumption and related behavior    |   Average weekly beer plus cider intake
Demographics and life events    |   Age when periods started (menarche)
Demographics and life events    |   Age at first live birth
Demographics and life events    |   Year ended full time education
Demographics and life events    |   Relative age of first facial hair
Demographics and life events    |   Age at menopause (last menstrual period)
Demographics and life events    |   Age completed full time education
Demographics and life events    |   Age at hysterectomy
Demographics and life events    |   Father's age
Demographics and life events    |   Age at last live birth
Demographics and life events    |   Length of menstrual cycle
Ocular health   |   Cataract
Ocular health   |   H26 Other cataract
Ocular health   |   Glaucoma
Ocular health   |   Retinal detachments and defects
Ocular health   |   Retinal detachment with retinal defect
Ocular health   |   Senile cataract
Self-reported health issues |   Hypertension self-reported
Self-reported health issues |   Self-reported non-cancer illnesses count
Self-reported health issues |   Asthma self-reported
Self-reported health issues |   High cholesterol self-reported
Self-reported health issues |   Diabetes self-reported
Self-reported health issues |   Hayfever/allergic rhinitis self-reported
Self-reported health issues |   Angina self-reported
Self-reported health issues |   Heart attack, MI self-reported
Self-reported health issues |   Osteoarthritis self-reported
Self-reported health issues |   Depression self-reported
Skin disorders and infections   |   Childhood sunburn occasions
Skin disorders and infections   |   Skin colour
Skin disorders and infections   |   Ease of skin tanning
Skin disorders and infections   |   Use of sun/uv protection
Skin disorders and infections   |   Other non-epithelial cancer of skin
Skin disorders and infections   |   C44 Other malignant neoplasms of skin
Skin disorders and infections   |   Skin cancer
Skin disorders and infections   |   Diseases of sebaceous glands
Skin disorders and infections   |   Sebaceous cyst
Skin disorders and infections   |   H40 Glaucoma
Tobacco use and exposure    |   Smoking status, ever vs never
Tobacco use and exposure    |   Previous daily cigarettes
Tobacco use and exposure    |   Past tobacco smoking
Tobacco use and exposure    |   Combined daily cigarettes (prev + current)
Tobacco use and exposure    |   Daily cigarettes (current smokers)
Tobacco use and exposure    |   Difficulty not smoking for 1 day
Tobacco use and exposure    |   Age started smoking in former smokers
Tobacco use and exposure    |   Age started smoking in current smokers
Tobacco use and exposure    |   Current tobacco smoking
Tobacco use and exposure    |   Time from waking to first cigarette
Blood pressure metrics  |   SBP (combined_medadj)
Blood pressure metrics  |   Mean arterial pressure (combined_medadj)
Blood pressure metrics  |   SBP (auto_medadj)
Blood pressure metrics  |   Mean arterial pressure (auto_medadj)
Blood pressure metrics  |   Pulse pressure (combined_medadj)
Blood pressure metrics  |   DBP (combined_medadj)
Blood pressure metrics  |   DBP (auto_medadj)
Blood pressure metrics  |   Mean arterial pressure (manual_medadj)
Blood pressure metrics  |   DBP (manual_medadj)
Blood pressure metrics  |   Pulse pressure (auto_medadj)
Carotid intima-media thickness (IMT)    |   Carotid IMT mean at 210°
Carotid intima-media thickness (IMT)    |   Carotid IMT mean at 240°
Carotid intima-media thickness (IMT)    |   Carotid IMT min at 210°
Carotid intima-media thickness (IMT)    |   Carotid IMT max at 240°
Carotid intima-media thickness (IMT)    |   Carotid IMT mean at 120°
Carotid intima-media thickness (IMT)    |   Carotid IMT max at 210°
Carotid intima-media thickness (IMT)    |   Carotid IMT min at 120°
Carotid intima-media thickness (IMT)    |   Carotid IMT max at 120°
Carotid intima-media thickness (IMT)    |   Carotid IMT mean at 150°
Carotid intima-media thickness (IMT)    |   Carotid IMT max at 150°
Neoplasms and cancers   |   Breast cancer [female]
Neoplasms and cancers   |   Malignant neoplasm of female breast
Neoplasms and cancers   |   Breast cancer
Neoplasms and cancers   |   Benign neoplasm of uterus
Neoplasms and cancers   |   C50 Malignant neoplasm of breast
Neoplasms and cancers   |   Uterine leiomyoma
Neoplasms and cancers   |   Ovarian cyst
Metabolic and nutritional disorders |   Estimated GFR, cystain C
Metabolic and nutritional disorders |   Cystatin C
Metabolic and nutritional disorders |   Estimated GFR, serum creatinine + cystain C
Metabolic and nutritional disorders |   Creatinine
Metabolic and nutritional disorders |   Estimated GFR, serum creatinine
Metabolic and nutritional disorders |   Alkaline phosphatase
Metabolic and nutritional disorders |   Non-albumin protein
Metabolic and nutritional disorders |   Albumin/Globulin ratio
Metabolic and nutritional disorders |   Gamma glutamyltransferase
Metabolic and nutritional disorders |   Urate
Thyroid-related conditions  |   Testosterone
Thyroid-related conditions  |   Hypothyroidism/myxoedema self-reported
Thyroid-related conditions  |   Levothyroxine sodium
Thyroid-related conditions  |   Hypothyroidism NOS
Thyroid-related conditions  |   Hypothyroidism
Thyroid-related conditions  |   E03 Other hypothyroidism
Thyroid-related conditions  |   Hyperthyroidism, thyrotoxicosis self-reported
Thyroid-related conditions  |   Oestradiol
Thyroid-related conditions  |   E04 Other non-toxic goitre
Injuries and trauma |   Falls in the last year
Injuries and trauma |   Dislocation
Injuries and trauma |   Fracture of lower limb
Mental health and emotional well-being  |   Age first had sexual intercourse
Mental health and emotional well-being  |   Overall health rating
Mental health and emotional well-being  |   Felt loved as a child
Mental health and emotional well-being  |   Health satisfaction
Mental health and emotional well-being  |   Ever thought that life not worth living
Mental health and emotional well-being  |   General happiness with own health
Mental health and emotional well-being  |   Family relationship satisfaction
Mental health and emotional well-being  |   General happiness
Mental health and emotional well-being  |   Friendships satisfaction
Mental health and emotional well-being  |   Freq of psychotic experiences
Code
# Number of selected traits in every category, and the running total
# of those counts in increasing order of cluster label.
cluster_idx, counts = np.unique(
    data_labels[top_phenotype_idx],
    return_counts = True,
)
trait_counts_dict = {label: count for label, count in zip(cluster_idx, counts)}
trait_counts_cumsum = np.cumsum(counts)
print (trait_counts_cumsum)
[ 10  20  30  40  50  60  70  80  88  98 108 112 121 131 137 147 155 165
 175 181 191 201 211 221 231 238 248 257 260 270]
Code
# Figure: heat map of the loadings of the selected traits on the first 100
# factors, with rows grouped and labelled by category and a horizontal
# colour bar above the heat map.
# NOTE(review): make_axes_locatable is only referenced by the commented-out
# layout below.
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.preprocessing import normalize as sk_normalize
from matplotlib.gridspec import GridSpec

mpl_stylesheet.banskt_presentation(dpi = 300, fontsize = 20, 
    splinecolor = nygc_colors['darkgray'], black = nygc_colors['darkgray'])

# Columns 0..99, each rescaled to unit norm (axis = 0).
plt_data = top_loadings[:,:100]
plt_data_norm = sk_normalize(plt_data, axis = 0)



# 100 x 100 layout grid: the colour bar takes part of the first grid row,
# the heat map takes grid rows 10 and onwards across the full width.
fig = plt.figure(figsize = (12, 24))
gs = GridSpec(100, 100, figure=fig)
colorbar_ax = fig.add_subplot(gs[0, 33:68])
ax1 = fig.add_subplot(gs[10:, :])

# ax1 = fig.add_subplot(111)
# divider = make_axes_locatable(ax1)
# cax = divider.append_axes('top', size='1%', pad=0.05)

im = ax1.imshow(plt_data_norm, cmap='bwr')
fig.colorbar(im, cax = colorbar_ax, orientation='horizontal', label = 'Loadings')
# colorbar_ax.tick_params(bottom=False, top=True)

# Decorate
# Count the selected traits per category; the cumulative counts give the
# row at which each category ends.
cluster_idx, counts = np.unique(data_labels[top_phenotype_idx], return_counts=True)
trait_counts_dict = dict(zip(cluster_idx, counts))
trait_counts_cumsum = np.cumsum(list(trait_counts_dict.values()))
# llm_categories is keyed from 1 while the cluster labels start at 0.
cluster_labels = [llm_categories[i+1] for i in cluster_idx]
# Thin separator after the last row of every category; image rows are
# centred on integers, so the boundary lies at count - 0.5.
for i in trait_counts_cumsum:
    ax1.axhline(y=i-0.5,alpha=0.8,linewidth=0.4,c='black')
# Place one tick label per category at the vertical middle of its rows.
cmsms = [0] + [i for i in trait_counts_cumsum]
midpts = [cmsms[i-1] + (cmsms[i]-cmsms[i-1])/2 for i in range(1,len(cmsms))]
ax1.set_yticks([i-0.5 for i in midpts])
ax1.set_yticklabels(cluster_labels)
for i, ytick in enumerate(ax1.get_yticklabels()):
    plt.setp(ytick, color = hex_colors_40[cluster_idx[i]])

# Zero-based column positions shown with one-based factor numbers.
ax1.set_xticks([0,24,49,74,99])
ax1.set_xticklabels([1,25,50,75,100])
# ax1.set_xlabel("Factors")

# ax1.set_yticks(np.arange(top_phenotype_idx.shape[0]))
# ax1.set_yticklabels(top_phenotype_names, size = 1.5)
# ax1.tick_params(left=False)
# ax1.yaxis.set_tick_params(pad = 0)
# for i, ytick in enumerate(ax1.get_yticklabels()):
#     plt.setp(ytick, color = hex_colors_40[plt_cluster_idx[i]])

# plt.tight_layout()
# plt.savefig('../plots/colormann-manuscript/panukb_nnmsparse_loadings.pdf', bbox_inches='tight')

# NOTE(review): the recorded output shows a "Tight layout not applied"
# warning for this call; the saved file relies on bbox_inches='tight'.
plt.tight_layout()
plt.savefig('../plots/bi-2026/panukb_nnmsparse_loadings.png', transparent = True, bbox_inches='tight')
# Show the plot
plt.show()
/scratch/ipykernel_2045557/764209936.py:54: UserWarning: Tight layout not applied. tight_layout cannot make axes width small enough to accommodate all axes decorations
  plt.tight_layout()