LD filtering of UKBB data

Author

Saikat Banerjee

Published

November 27, 2023

Abstract
A simple position-based filtering of SNPs in the UKBB data, using genomic position as a proxy for true LD.
Code
import numpy as np
import pandas as pd
import scipy.stats as sc_stats
import collections
import pickle

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils

mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120)
Code
phenotype_metafile = "/gpfs/commons/home/sbanerjee/work/npd/UKBB/npd_phenotypes.tsv"
variants_metafile = "/gpfs/commons/home/sbanerjee/work/npd/UKBB/metadata/significant_variants.tsv"
data_dir = "/gpfs/commons/home/sbanerjee/npddata/ukbb.imputed_v3.neale/3_all_assoc"
Code
phenotype_df = pd.read_csv(phenotype_metafile, sep = '\t')
phenotype_df
phenotype description variable_type source n_non_missing n_missing n_controls n_cases PHESANT_transformation notes
0 1160 Sleep duration ordinal phesant 359020 2174 NaN NaN 1160_0|| INTEGER || reassignments: -1=NA|-3=NA... ACE touchscreen question "About how many hours...
1 1200 Sleeplessness / insomnia ordinal phesant 360738 456 NaN NaN 1200_0|| CAT-SINGLE || Inc(>=10): 3(102157) ||... ACE touchscreen question "Do you have trouble ...
2 1220 Daytime dozing / sleeping (narcolepsy) ordinal phesant 359752 1442 NaN NaN 1220_0|| CAT-SINGLE || reassignments: 3=2 || I... ACE touchscreen question "How likely are you t...
3 1920 Mood swings binary phesant 352604 8590 193622.0 158982.0 1920_0|| CAT-SINGLE || Inc(>=10): 0(193622) ||... ACE touchscreen question "Does your mood often...
4 1970 Nervous feelings binary phesant 351829 9365 268709.0 83120.0 1970_0|| CAT-SINGLE || Inc(>=10): 0(268709) ||... ACE touchscreen question "Would you call yours...
... ... ... ... ... ... ... ... ... ... ...
312 T79 Diagnoses - main ICD10: T79 Certain early comp... binary icd10 361194 0 360989.0 205.0 NaN NaN
313 TRAUMBRAIN_NONCONCUS severe traumatic brain injury, does not includ... binary finngen 361194 0 360631.0 563.0 NaN NaN
314 VI_NERVOUS Diseases of the nervous system binary finngen 361194 0 339871.0 21323.0 NaN NaN
315 V_MENTAL_BEHAV Mental and behavioural disorders binary finngen 361194 0 356892.0 4302.0 NaN NaN
316 Z43 Diagnoses - main ICD10: Z43 Attention to artif... binary icd10 361194 0 359838.0 1356.0 NaN NaN

317 rows × 10 columns

Code
phenotype_ids = phenotype_df['phenotype'].to_list()
# Map phenotype id -> description; used later to annotate outliers in the SE calibration plot
phenotype_dict = dict(zip(phenotype_df['phenotype'], phenotype_df['description']))
assoc_file = {}
for s in phenotype_ids:
    assoc_file[s] = f"{data_dir}/{s}.tsv"
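The combined association dataframe used below (`../data/ukbb_assoc.pkl`) was assembled from these per-phenotype files in a separate preprocessing step. A minimal sketch of how such a pickle could be built, assuming each TSV already carries the columns shown in the `info()` output below:
Code
# Sketch (assumption): concatenate the per-phenotype association files into one dataframe,
# tagging each row with its phenotype id, and pickle the result. The actual preprocessing
# (e.g. restricting to significant variants) is not reproduced here.
frames = []
for s, fpath in assoc_file.items():
    df = pd.read_csv(fpath, sep = '\t')
    df['phenotype_id'] = s
    frames.append(df)
pd.concat(frames).to_pickle("../data/ukbb_assoc.pkl")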
Code
assoc_df = pd.read_pickle("../data/ukbb_assoc.pkl")
assoc_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 13311781 entries, 0 to 41992
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   variant         object 
 1   low_confidence  bool   
 2   beta            float64
 3   se              float64
 4   tstat           float64
 5   pval            float64
 6   phenotype_id    object 
dtypes: bool(1), float64(4), object(2)
memory usage: 723.6+ MB
Code
print (f"Number of unique SNPs: {len(assoc_df['variant'].unique())}")
print (f"Number of unique studies: {len(assoc_df['phenotype_id'].unique())}")
Number of unique SNPs: 41993
Number of unique studies: 317
Code
variant_colnames = ['variant', 'chr', 'pos', 'ref', 'alt', 'rsid', 'varid', 
                    'consequence', 'consequence_category', 'info', 'call_rate',
                    'AC', 'AF', 'minor_allele', 'minor_AF', 'p_hwe', 
                    'n_called', 'n_not_called', 'n_hom_ref', 'n_het', 'n_hom_var', 'n_non_ref',
                    'r_heterozygosity', 'r_het_hom_var', 'r_expected_het_frequency']
variant_df = pd.read_csv(variants_metafile, sep = '\t', names = variant_colnames)

def count_nucleotides(row):
    # total length of the ref and alt alleles; equals 2 only for single-nucleotide variants
    return len(row['ref']) + len(row['alt'])

# Keep biallelic SNVs with minor allele frequency >= 0.05 and imputation INFO >= 0.8
variant_df['ntcount'] = variant_df.apply(count_nucleotides, axis = 1)
variant_df_filtered = variant_df[(variant_df['ntcount'] == 2) 
                               & (variant_df['minor_AF'] >= 0.05)
                               & (variant_df['info'] >= 0.8)].drop(columns = ['ntcount'])
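The row-wise `apply` used above for counting allele lengths is easy to read but slow on large tables; a vectorised sketch selecting the same rows (the name `variant_df_filtered_v` is hypothetical and not used elsewhere) would be:
Code
# Vectorised equivalent of the biallelic-SNV filter above (sketch)
is_snv = variant_df['ref'].str.len().eq(1) & variant_df['alt'].str.len().eq(1)
variant_df_filtered_v = variant_df[is_snv
                                 & (variant_df['minor_AF'] >= 0.05)
                                 & (variant_df['info'] >= 0.8)]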
Code
variant_df_filtered.info()
<class 'pandas.core.frame.DataFrame'>
Index: 37395 entries, 9 to 41991
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   variant                   37395 non-null  object 
 1   chr                       37395 non-null  int64  
 2   pos                       37395 non-null  int64  
 3   ref                       37395 non-null  object 
 4   alt                       37395 non-null  object 
 5   rsid                      37395 non-null  object 
 6   varid                     37395 non-null  object 
 7   consequence               37395 non-null  object 
 8   consequence_category      37395 non-null  object 
 9   info                      37395 non-null  float64
 10  call_rate                 37395 non-null  float64
 11  AC                        37395 non-null  int64  
 12  AF                        37395 non-null  float64
 13  minor_allele              37395 non-null  object 
 14  minor_AF                  37395 non-null  float64
 15  p_hwe                     37395 non-null  float64
 16  n_called                  37395 non-null  int64  
 17  n_not_called              37395 non-null  int64  
 18  n_hom_ref                 37395 non-null  int64  
 19  n_het                     37395 non-null  int64  
 20  n_hom_var                 37395 non-null  int64  
 21  n_non_ref                 37395 non-null  int64  
 22  r_heterozygosity          37395 non-null  float64
 23  r_het_hom_var             37395 non-null  float64
 24  r_expected_het_frequency  37395 non-null  float64
dtypes: float64(8), int64(9), object(8)
memory usage: 7.4+ MB
Code
def ldclump_snppos(xarr, pos_cutoff=100000):
    """
    Greedy position-based clumping: sort the positions and keep a position only if
    it lies more than `pos_cutoff` base pairs beyond the last kept position.
    Physical distance is used as a proxy for LD.
    """
    xclump = list()
    xsorted = np.sort(xarr)
    xlast = xsorted[0]
    xclump.append(xlast)
    for x in xsorted[1:]:
        if (x - xlast > pos_cutoff):
            xclump.append(x)
            xlast = x
    return np.array(xclump)
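A toy example illustrates the greedy behaviour: within each window only the first (leftmost) position survives.
Code
# Toy example (not part of the analysis): 150 and 180 fall within 100 bp of the kept
# position 100 and are dropped; 500 and 10000 are far enough apart to be retained.
ldclump_snppos(np.array([100, 150, 180, 500, 10000]), pos_cutoff = 100)
array([  100,   500, 10000])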
Code
# Apply position-based clumping separately on each autosome with a 10 kb window
ldfilter_dict = []
for chrm in range(1,23):
    chr_snppos = variant_df_filtered[variant_df_filtered['chr'] == chrm]['pos'].values
    chr_snppos_filtered = ldclump_snppos(chr_snppos, pos_cutoff = 10000)
    ldfilter_dict += [{'chr':chrm, 'pos':x} for x in chr_snppos_filtered]
Code
len(ldfilter_dict)
3386
Code
df_ldfilter = pd.DataFrame.from_dict(ldfilter_dict)
variant_df_ldfiltered = df_ldfilter.merge(variant_df_filtered, on=['chr', 'pos'], how='inner')
Code
variant_df_ldfiltered.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   chr                       3387 non-null   int64  
 1   pos                       3387 non-null   int64  
 2   variant                   3387 non-null   object 
 3   ref                       3387 non-null   object 
 4   alt                       3387 non-null   object 
 5   rsid                      3387 non-null   object 
 6   varid                     3387 non-null   object 
 7   consequence               3387 non-null   object 
 8   consequence_category      3387 non-null   object 
 9   info                      3387 non-null   float64
 10  call_rate                 3387 non-null   float64
 11  AC                        3387 non-null   int64  
 12  AF                        3387 non-null   float64
 13  minor_allele              3387 non-null   object 
 14  minor_AF                  3387 non-null   float64
 15  p_hwe                     3387 non-null   float64
 16  n_called                  3387 non-null   int64  
 17  n_not_called              3387 non-null   int64  
 18  n_hom_ref                 3387 non-null   int64  
 19  n_het                     3387 non-null   int64  
 20  n_hom_var                 3387 non-null   int64  
 21  n_non_ref                 3387 non-null   int64  
 22  r_heterozygosity          3387 non-null   float64
 23  r_het_hom_var             3387 non-null   float64
 24  r_expected_het_frequency  3387 non-null   float64
dtypes: float64(8), int64(9), object(8)
memory usage: 661.7+ KB
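Note that the inner merge returns 3387 rows for 3386 clumped positions, presumably because one retained chr/pos pair matches two biallelic variant records. A quick check of this assumption (not part of the original analysis):
Code
# Positions matched by more than one variant after the merge
variant_df_ldfiltered.groupby(['chr', 'pos']).size().loc[lambda s: s > 1]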
Code
assoc_df_fvar = variant_df_ldfiltered[['variant', 'rsid']].merge(assoc_df, on = ['variant'], how = 'inner')
Code
print (f"Number of unique SNPs: {len(assoc_df_fvar['variant'].unique())}")
print (f"Number of unique studies: {len(assoc_df_fvar['phenotype_id'].unique())}")
Number of unique SNPs: 3387
Number of unique studies: 317
Code
zscore_df = assoc_df_fvar[['variant', 'phenotype_id', 'tstat']].pivot(index = 'variant', columns = 'phenotype_id', values = 'tstat').rename_axis(None, axis = 0).rename_axis(None, axis = 1)
beta_df   = assoc_df_fvar[['variant', 'phenotype_id', 'beta']].pivot(index = 'variant', columns = 'phenotype_id', values = 'beta').rename_axis(None, axis = 0).rename_axis(None, axis = 1)
se_df     = assoc_df_fvar[['variant', 'phenotype_id', 'se']].pivot(index = 'variant', columns = 'phenotype_id', values = 'se').rename_axis(None, axis = 0).rename_axis(None, axis = 1)
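As a quick consistency check (assuming the reported `tstat` is the Wald statistic beta / se), the three pivoted matrices should satisfy zscore ≈ beta / se element-wise:
Code
# Consistency check (assumption: tstat = beta / se); the matrices share index and
# columns, so label alignment in the division is exact.
ratio = beta_df / se_df
print(np.allclose(zscore_df.values, ratio.values, rtol = 1e-3, equal_nan = True))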
Code
zscore_df
1160 1200 1220 1920 1970 1980 20002_1123 20002_1240 20002_1243 20002_1246 ... R53 R56 R90 SFN SLEEP T79 TRAUMBRAIN_NONCONCUS VI_NERVOUS V_MENTAL_BEHAV Z43
10:100010186:A:G -0.501456 0.468240 -1.401350 -2.319730 -4.101760 -2.469050 1.373340 0.807527 -1.483770 -0.983242 ... -1.047450 -0.541606 -1.559310 0.746273 0.644985 -0.028021 -0.159701 3.711700 -1.380550 -0.903030
10:100020572:T:G 0.687418 -0.769290 0.041449 -2.215650 -1.051370 -0.254845 0.022909 -0.961978 -0.344766 -1.693950 ... -0.750566 0.989090 0.516234 -0.152721 -0.266356 1.091550 -0.054509 3.134090 -1.631720 -0.641998
10:100602545:C:G 0.197858 0.137792 0.064242 -1.027020 -0.967476 -0.528904 1.598920 -0.491628 -0.103273 -0.099966 ... 1.198700 0.925530 -0.146606 1.605810 0.764948 2.064470 1.832110 1.723660 -0.391214 0.789003
10:100612750:G:C 0.243301 0.144469 0.043636 -0.979825 -0.982524 -0.524519 1.652070 -0.498177 0.032667 -0.131171 ... 1.208040 1.012120 -0.168484 1.617130 0.776149 2.232480 1.825990 1.762370 -0.351120 0.801818
10:100627302:G:C 0.303313 0.288102 -0.243662 -1.241620 -0.917032 -0.624787 1.616470 -0.359382 0.038058 -0.138088 ... 0.959963 1.038140 -0.187544 1.637230 0.858652 1.866400 1.898110 1.759880 -0.374304 0.609954
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9:98225056:G:T -0.609864 0.191835 2.357520 3.899610 3.187290 5.256400 0.402311 1.592520 -0.382107 1.365880 ... 1.017950 1.900860 2.127600 0.056486 0.692330 -0.307143 -0.591845 0.056605 -0.247403 -1.640570
9:98235310:T:C -0.731896 -0.239943 1.869780 3.750520 3.260280 5.254040 -0.255022 0.183105 0.344827 1.237990 ... 0.913732 1.789060 1.465640 0.407725 0.614795 -0.168933 -0.381017 0.452154 0.714727 -1.272540
9:98247204:C:T -0.627788 0.057694 2.106230 4.309430 3.180430 4.965740 0.358445 0.787971 -0.052250 1.509800 ... 0.796777 1.726200 1.235950 -0.052887 1.313870 -0.505464 -0.119510 -0.124091 1.065030 -1.160720
9:98262178:G:A -0.619834 -0.240170 2.556200 4.230840 3.632350 4.855730 0.002387 0.601861 0.154543 1.272620 ... 0.684594 1.329930 1.257680 0.055839 0.996216 -0.217356 -0.178359 -0.336342 1.148090 -1.059110
9:98273305:T:G -0.846092 0.070976 2.650630 4.323630 3.354090 5.113290 -0.051482 0.623136 0.203378 1.743840 ... 0.865338 1.432780 1.166460 0.018348 0.479160 -0.233870 -0.179549 -0.483875 0.839336 -0.996326

3387 rows × 317 columns

Code
# Summarise each study: median standard error and standard deviation of beta across SNPs
mean_se  = se_df.median(axis = 0, skipna = True)
mean_se  = pd.DataFrame(mean_se).set_axis(["mean_se"], axis = 1)
beta_std = beta_df.std(axis = 0, skipna = True)
beta_std = pd.DataFrame(beta_std).set_axis(["beta_std"], axis = 1)
error_df = pd.concat([mean_se, beta_std], axis = 1)

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(np.log10(error_df['beta_std']), np.log10(error_df['mean_se']), alpha = 0.5, s = 100)
mpl_utils.set_xticks(ax1, scale = 'log10', spacing = 'log10')
mpl_utils.set_yticks(ax1, scale = 'log10', spacing = 'log10')
mpl_utils.plot_diag(ax1)

# Annotate studies whose median SE exceeds 0.2
keep_columns = error_df.query("mean_se <= 0.2").index
for pid in error_df.index.to_list():
    if pid not in keep_columns:
        pid_text = f"{pid} / {phenotype_dict[pid]}"
        xval = np.log10(error_df.loc[pid]['beta_std'])
        yval = np.log10(error_df.loc[pid]['mean_se'])
        ax1.annotate(pid_text, (xval, yval))

ax1.set_xlabel(r"Standard deviation of $\beta$")
ax1.set_ylabel(r"Median of SE")
plt.show()
Figure 1: Calibration of SE against std of beta
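The `keep_columns` index computed for the plot is only used to annotate outliers. If the flagged high-SE studies were to be dropped before saving, a sketch (not applied below; the `_kept` names are hypothetical) would be:
Code
# Optional sketch (not applied in this notebook): restrict to studies with median SE <= 0.2
zscore_df_kept = zscore_df[keep_columns]
beta_df_kept   = beta_df[keep_columns]
se_df_kept     = se_df[keep_columns]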
Code
zscore_df.to_pickle("../data/ukbb_zscore_df2.pkl")
beta_df.to_pickle("../data/ukbb_beta_df2.pkl")
se_df.to_pickle("../data/ukbb_se_df2.pkl")