Characterization of PanUKB hidden factors

Author

Saikat Banerjee

Published

March 7, 2024

Abstract

We attempt to characterize the latent structure of the PanUKB summary statistics using our methods.

Code

import os
import re
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
from adjustText import adjust_text
import textalloc

from pymir import mpl_stylesheet
from pymir import mpl_utils
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120, colors = 'kelly')
from matplotlib.gridspec import GridSpec

import sys
sys.path.append("../utils/")
import simulate as mpy_simulate

Code

import scipy.stats as sc_stats
import collections

def q1(x, axis = None):
    return np.percentile(x, 25, axis = axis)

def q3(x, axis = None):
    return np.percentile(x, 75, axis = axis)

def iqr_outlier(x, axis = None, bar = 1.5, side = 'both'):
    assert side in ['gt', 'lt', 'both'], 'Side should be `gt`, `lt` or `both`.'

    d_iqr = sc_stats.iqr(x, axis = axis)
    d_q1 = q1(x, axis = axis)
    d_q3 = q3(x, axis = axis)
    iqr_distance = np.multiply(d_iqr, bar)

    stat_shape = list(x.shape)

    if isinstance(axis, collections.abc.Iterable):
        for single_axis in axis:
            stat_shape[single_axis] = 1
    else:
        stat_shape[axis] = 1

    if side in ['gt', 'both']:
        upper_range = d_q3 + iqr_distance
        upper_outlier = np.greater(x - upper_range.reshape(stat_shape), 0)
    if side in ['lt', 'both']:
        lower_range = d_q1 - iqr_distance
        lower_outlier = np.less(x - lower_range.reshape(stat_shape), 0)

    if side == 'gt':
        return upper_outlier
    if side == 'lt':
        return lower_outlier
    if side == 'both':
        return np.logical_or(upper_outlier, lower_outlier)

Read data and results

Code

data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
result_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/nnsparsh"
h2_cut = 0.1
pval_cut = 5e-8

zscore_df = pd.read_pickle(os.path.join(data_dir, f"modselect/zscore_h2{h2_cut}_pval{pval_cut}.pkl"))
trait_df  = pd.read_pickle(os.path.join(data_dir, f"modselect/traits_h2{h2_cut}.pkl"))

variant_filename = f"{data_dir}/allvar.pruned.closesttss.hugo"
variant_df = pd.read_csv(variant_filename, sep = '\t')

Code

zscore_df

	z1415	z1678	z1679	z1680	z1681	z1682	z1683	z1685	z1686	z1687	...	z2473	z2474	z2475	z2476	z2477	z2479	z2480	z2481	z2482	z2483
0	3.591973	-0.038545	4.356103	-0.030974	-2.625193	-0.082182	-1.513514	2.927977	1.145769	-0.924589	...	-0.964337	1.521092	0.612532	1.405428	0.018029	-0.008761	-2.069432	-4.292948	-4.701711	2.952899
1	2.690253	2.060959	5.074098	2.131896	-2.200532	0.203585	0.053289	2.081926	1.606040	-0.265317	...	-0.158982	-0.296537	-0.734266	-0.093081	0.412077	0.716049	-2.171984	-5.314085	-6.612137	3.817518
2	4.672697	1.966395	6.363325	1.017074	-3.506709	-0.985765	-0.432902	2.834796	3.055340	-2.151471	...	0.716255	2.222433	1.092969	2.328233	1.160767	-1.467249	-0.135785	-2.187241	-3.223529	4.508578
3	3.628734	0.377612	1.851399	-1.249221	-1.881565	-0.479096	-1.015920	0.498607	-0.294269	-1.046150	...	-1.404276	3.356354	1.990588	3.092179	-0.133810	-1.376310	1.317044	0.913491	0.535188	2.245657
4	-0.858911	-0.623239	-1.686601	-3.003435	0.532031	-0.341693	-3.777505	0.254496	0.226082	-0.322407	...	-0.536635	-0.320075	-0.128047	-0.524757	-0.232900	-0.483644	2.026508	4.400092	5.407316	1.125536
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
51394	1.342127	1.797208	-0.979370	-1.867291	0.037906	-1.162637	-3.256483	2.176569	-1.732998	-0.328859	...	0.482023	0.714167	0.354347	0.611158	-0.354725	-0.831737	0.870924	1.432076	2.228501	0.536104
51395	-1.244520	-1.527636	1.849772	2.144946	-0.031816	2.274951	2.456148	-2.560491	-0.212461	0.982478	...	-0.110369	1.107098	1.482684	1.512723	0.322355	1.320194	-0.700092	-1.395039	-2.270186	0.360025
51396	-0.253862	0.812630	-0.711050	1.179577	-2.862653	-2.013295	2.833169	0.153290	2.138158	-2.653582	...	-0.088946	-0.562063	-1.148515	-0.994185	-0.268232	0.013256	-0.777667	-1.544760	-1.406344	2.205817
51397	-0.915597	1.668377	-0.823597	0.163447	-0.061726	0.319698	0.922030	2.559199	0.215736	0.468609	...	0.098326	-1.185702	-0.624073	-0.859522	0.549669	0.268733	0.947441	1.533302	1.658537	2.218653
51398	1.047521	-0.022502	2.343549	-0.933287	1.285805	-1.105163	0.234492	-0.452837	-0.142916	-0.777741	...	0.713318	0.308486	0.612102	0.492977	-0.621278	-0.732402	-0.106474	-0.029006	0.070647	-0.805253

48212 rows × 216 columns

Code

trait_df

	zindex	trait_type	phenocode	pheno_sex	description	description_more	category	BIN_QT	n_cases_EUR	n_controls_EUR	N	Neff	estimates.final.h2_observed
1414	1415	categorical	20002	both_sexes	Non-cancer illness code, self-reported	Code for non-cancer illness. If the participan...	UK Biobank Assessment Centre > Verbal intervie...	BIN	109307	311166.0	420473	161782.6684	0.125
1677	1678	biomarkers	30600	both_sexes	Albumin	NaN	Biological samples > Assay results > Blood ass...	QT	367192	NaN	367192	367192.0000	0.145
1678	1679	biomarkers	30610	both_sexes	Alkaline phosphatase	NaN	Biological samples > Assay results > Blood ass...	QT	400988	NaN	400988	400988.0000	0.205
1679	1680	biomarkers	30620	both_sexes	Alanine aminotransferase	NaN	Biological samples > Assay results > Blood ass...	QT	400822	NaN	400822	400822.0000	0.124
1680	1681	biomarkers	30630	both_sexes	Apolipoprotein A	NaN	Biological samples > Assay results > Blood ass...	QT	364987	NaN	364987	364987.0000	0.182
...	...	...	...	...	...	...	...	...	...	...	...	...	...
2478	2479	continuous	Smoking	both_sexes	Smoking status, ever vs never	Ever (previous + current smoker) vs never base...	NaN	QT	418817	NaN	418817	418817.0000	0.110
2479	2480	continuous	eGFR	both_sexes	Estimated glomerular filtration rate, serum cr...	eGFR based on serum creatinine (30700) using t...	NaN	QT	401867	NaN	401867	401867.0000	0.207
2480	2481	continuous	eGFRcreacys	both_sexes	Estimated glomerular filtration rate, cystain C	eGFR based on cystain C (30720) using the CKD-...	NaN	QT	401570	NaN	401570	401570.0000	0.238
2481	2482	continuous	eGFRcys	both_sexes	Estimated glomerular filtration rate, serum cr...	eGFR based on serum creatinine (30700) and cys...	NaN	QT	402031	NaN	402031	402031.0000	0.224
2482	2483	continuous	whr	both_sexes	pheno 48 / pheno 49	NaN	NaN	QT	420531	NaN	420531	420531.0000	0.174

216 rows × 13 columns

Code

method_names = {
    'rpca' : 'RPCA-IALM',
    'nnm'  : 'NNM-FW',
    'nnm_sparse' : 'NNM-Sparse-FW',
}

Code

lowrank_model = dict()
for m in method_names.keys():
    res_filename = os.path.join(result_dir, f"{m}_model.pkl")
    with (open(res_filename, "rb")) as fh:
        lowrank_model[m] = pickle.load(fh)

Code

lowrank_X = dict()
for m in method_names.keys():
    if m == 'rpca':
        lowrank_X[m] = lowrank_model[m]['L_']
    else:
        lowrank_X[m] = lowrank_model[m]['X_']

Code

def get_principal_components(X):
    X_cent = mpy_simulate.do_standardize(X, scale = False)
    X_cent /= np.sqrt(np.prod(X_cent.shape))
    U, S, Vt = np.linalg.svd(X_cent, full_matrices = False)
    pcomps = U @ np.diag(S)
    loadings = Vt.T @ np.diag(S)
    return loadings, pcomps, S

loadings  = dict()
pcomps    = dict()
eigenvals = dict()

for m in method_names.keys():
    loadings[m], pcomps[m], eigenvals[m] = get_principal_components(lowrank_X[m])

Code

trait_types  = trait_df['trait_type'].unique().tolist()
trait_colors = {trait: color for trait, color in zip(trait_types, mpl_stylesheet.banskt_colors()[:len(trait_types)])}
trait_types.reverse()

Code

fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(111)
method = 'nnm_sparse'
ipc1 = 0
ipc2 = 1
trait_indices = np.array(trait_df.index)

for t in trait_types:
    selectidx = np.array(trait_df[trait_df['trait_type'] == t].index)
    tidx = np.searchsorted(trait_indices, selectidx)
    xvals = pcomps[method][tidx, ipc1]
    yvals = pcomps[method][tidx, ipc2]
    ax1.scatter(xvals, yvals, alpha = 0.7, color = trait_colors[t], label = t, s = 50)
    ax1.legend(loc = 'upper left')
    

# Combine outliers in x-axis and y-axis
xvals = pcomps[method][:, ipc1]
yvals = pcomps[method][:, ipc2]
outlier_idx_x = np.where(iqr_outlier(xvals, axis = 0, bar = 2.0))[0]
outlier_idx_y = np.where(iqr_outlier(yvals, axis = 0, bar = 2.0))[0]
outlier_idx = np.union1d(outlier_idx_x, outlier_idx_y)
x_center = np.mean(ax1.get_xlim())

# # Mark using adjustText package
# # https://github.com/Phlya/adjustText
# annots = []
# for i in outlier_idx:
#     txt = trait_df.loc[trait_indices[i]]['description'].strip()
#     if 'intima-medial thickness' in txt:
#         continue
#     if xvals[i] > x_center:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 6, ha = 'right')]
#     else:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 6)]
# # Adjust the annotations iteratively
# adjust_text(annots, arrowprops=dict(arrowstyle='-', color = 'grey'))

# Mark using textalloc package
txt_list = []
text_idx_list = []
for i in outlier_idx:
    txt = trait_df.loc[trait_indices[i]]['description'].strip()
    #if 'intima-medial thickness' in txt:
    #    continue
    txt_list.append(txt)
    text_idx_list.append(i)
if len(text_idx_list) > 0:
    txt_idx = np.array(text_idx_list)
    textalloc.allocate_text(fig, ax1, xvals[txt_idx], yvals[txt_idx], txt_list,
                            x_scatter = xvals, y_scatter = yvals,
                            textsize = 8, textcolor = 'black', linecolor = 'grey')

ax1.set_xlabel(f"PC{ipc1 + 1}")
ax1.set_ylabel(f"PC{ipc2 + 1}")
    
plt.tight_layout()
plt.show()

Code

fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(111)
method = 'nnm_sparse'
ipc1 = 2
ipc2 = 3
trait_indices = np.array(trait_df.index)

for t in trait_types:
    selectidx = np.array(trait_df[trait_df['trait_type'] == t].index)
    tidx = np.searchsorted(trait_indices, selectidx)
    xvals = pcomps[method][tidx, ipc1]
    yvals = pcomps[method][tidx, ipc2]
    ax1.scatter(xvals, yvals, alpha = 0.7, color = trait_colors[t], label = t, s = 50)
    ax1.legend(loc = 'upper left')
    

# Combine outliers in x-axis and y-axis
xvals = pcomps[method][:, ipc1]
yvals = pcomps[method][:, ipc2]
outlier_idx_x = np.where(iqr_outlier(xvals, axis = 0, bar = 4.0))[0]
outlier_idx_y = np.where(iqr_outlier(yvals, axis = 0, bar = 4.0))[0]
outlier_idx = np.union1d(outlier_idx_x, outlier_idx_y)
x_center = np.mean(ax1.get_xlim())

# Mark using adjustText package
# https://github.com/Phlya/adjustText
# annots = []
# for i in outlier_idx:
#     txt = trait_df.loc[trait_indices[i]]['description']
#     if 'intima-medial thickness' in txt:
#         continue
#     if xvals[i] > x_center:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 10, ha = 'right')]
#     else:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 10)]
# # Adjust the annotations iteratively
# adjust_text(annots, arrowprops=dict(arrowstyle='-', color = 'grey'))

# Mark using textalloc package
txt_list = []
text_idx_list = []
for i in outlier_idx:
    txt = trait_df.loc[trait_indices[i]]['description'].strip()
    if 'pressure' in txt:
        txt = re.sub(
            pattern=r'(.*pressure),.*', 
            repl='\\1', 
            string = txt)
    #    continue
    if 'bone mineral density' in txt:
        txt = re.sub(
            pattern = r'bone mineral density \(BMD\)',
            repl = 'BMD',
            string = txt)
    txt_list.append(txt)
    text_idx_list.append(i)
if len(text_idx_list) > 0:
    txt_idx = np.array(text_idx_list)
    textalloc.allocate_text(fig, ax1, xvals[txt_idx], yvals[txt_idx], txt_list,
                            x_scatter = xvals, y_scatter = yvals,
                            textsize = 8, textcolor = 'black', linecolor = 'grey')

ax1.set_xlabel(f"PC{ipc1 + 1}")
ax1.set_ylabel(f"PC{ipc2 + 1}")
    
plt.tight_layout()
plt.show()

Code

fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(111)
method = 'nnm_sparse'
ipc1 = 4
ipc2 = 5
trait_indices = np.array(trait_df.index)

for t in trait_types:
    selectidx = np.array(trait_df[trait_df['trait_type'] == t].index)
    tidx = np.searchsorted(trait_indices, selectidx)
    xvals = pcomps[method][tidx, ipc1]
    yvals = pcomps[method][tidx, ipc2]
    ax1.scatter(xvals, yvals, alpha = 0.7, color = trait_colors[t], label = t, s = 50)
    ax1.legend(loc = 'upper left')
    

# Combine outliers in x-axis and y-axis
xvals = pcomps[method][:, ipc1]
yvals = pcomps[method][:, ipc2]
outlier_idx_x = np.where(iqr_outlier(xvals, axis = 0, bar = 4.0))[0]
outlier_idx_y = np.where(iqr_outlier(yvals, axis = 0, bar = 4.0))[0]
outlier_idx = np.union1d(outlier_idx_x, outlier_idx_y)
x_center = np.mean(ax1.get_xlim())

# Mark using adjustText package
# https://github.com/Phlya/adjustText
# annots = []
# for i in outlier_idx:
#     txt = trait_df.loc[trait_indices[i]]['description']
#     if 'intima-medial thickness' in txt:
#         continue
#     if xvals[i] > x_center:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 10, ha = 'right')]
#     else:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 10)]
# # Adjust the annotations iteratively
# adjust_text(annots, arrowprops=dict(arrowstyle='-', color = 'grey'))

# Mark using textalloc package
txt_list = []
text_idx_list = []
for i in outlier_idx:
    txt = trait_df.loc[trait_indices[i]]['description'].strip()
    #if 'intima-medial thickness' in txt:
    #    continue
    txt_list.append(txt)
    text_idx_list.append(i)
if len(text_idx_list) > 0:
    txt_idx = np.array(text_idx_list)
    textalloc.allocate_text(fig, ax1, xvals[txt_idx], yvals[txt_idx], txt_list,
                            x_scatter = xvals, y_scatter = yvals,
                            textsize=8, textcolor = 'black', linecolor = 'grey')

ax1.set_xlabel(f"PC{ipc1 + 1}")
ax1.set_ylabel(f"PC{ipc2 + 1}")
    
plt.tight_layout()
plt.show()

Code

fig = plt.figure(figsize = (12, 12))
ax1 = fig.add_subplot(111)
method = 'nnm_sparse'
ipc1 = 8
ipc2 = 9
trait_indices = np.array(trait_df.index)

for t in trait_types:
    selectidx = np.array(trait_df[trait_df['trait_type'] == t].index)
    tidx = np.searchsorted(trait_indices, selectidx)
    xvals = pcomps[method][tidx, ipc1]
    yvals = pcomps[method][tidx, ipc2]
    ax1.scatter(xvals, yvals, alpha = 0.7, color = trait_colors[t], label = t, s = 50)
    ax1.legend(loc = 'upper left')
    

# Combine outliers in x-axis and y-axis
xvals = pcomps[method][:, ipc1]
yvals = pcomps[method][:, ipc2]
outlier_idx_x = np.where(iqr_outlier(xvals, axis = 0, bar = 3.0))[0]
outlier_idx_y = np.where(iqr_outlier(yvals, axis = 0, bar = 3.0))[0]
outlier_idx = np.union1d(outlier_idx_x, outlier_idx_y)
x_center = np.mean(ax1.get_xlim())

# Mark using adjustText package
# https://github.com/Phlya/adjustText
# annots = []
# for i in outlier_idx:
#     txt = trait_df.loc[trait_indices[i]]['description']
#     if 'intima-medial thickness' in txt:
#         continue
#     if xvals[i] > x_center:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 10, ha = 'right')]
#     else:
#         annots += [ax1.annotate(txt, (xvals[i], yvals[i]), fontsize = 10)]
# # Adjust the annotations iteratively
# adjust_text(annots, arrowprops=dict(arrowstyle='-', color = 'grey'))

# Mark using textalloc package
txt_list = []
text_idx_list = []
for i in outlier_idx:
    txt = trait_df.loc[trait_indices[i]]['description'].strip()
    #if 'intima-medial thickness' in txt:
    #    continue
    txt_list.append(txt)
    text_idx_list.append(i)
if len(text_idx_list) > 0:
    txt_idx = np.array(text_idx_list)
    textalloc.allocate_text(fig, ax1, xvals[txt_idx], yvals[txt_idx], txt_list,
                            x_scatter = xvals, y_scatter = yvals,
                            textsize=8, textcolor = 'black', linecolor = 'grey')

ax1.set_xlabel(f"PC{ipc1 + 1}")
ax1.set_ylabel(f"PC{ipc2 + 1}")
    
plt.tight_layout()
plt.show()