Preprocessing UKBB data

Author

Saikat Banerjee

Published

October 30, 2023

Abstract
Preprocessing steps for UKBB data from Neale Lab
Code
import numpy as np
import pandas as pd
import scipy.stats as sc_stats
import collections
import pickle

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils

mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120)
Code
phenotype_metafile = "/gpfs/commons/home/sbanerjee/work/npd/UKBB/npd_phenotypes.tsv"
variants_metafile = "/gpfs/commons/home/sbanerjee/work/npd/UKBB/metadata/significant_variants.tsv"
data_dir = "/gpfs/commons/home/sbanerjee/npddata/ukbb.imputed_v3.neale/3_all_assoc"
Code
phenotype_df = pd.read_csv(phenotype_metafile, sep = '\t')
phenotype_ids = phenotype_df['phenotype'].to_list()
assoc_file = {}
for s in phenotype_ids:
    assoc_file[s] = f"{data_dir}/{s}.tsv"
Code
phenotype_df
phenotype description variable_type source n_non_missing n_missing n_controls n_cases PHESANT_transformation notes
0 1160 Sleep duration ordinal phesant 359020 2174 NaN NaN 1160_0|| INTEGER || reassignments: -1=NA|-3=NA... ACE touchscreen question "About how many hours...
1 1200 Sleeplessness / insomnia ordinal phesant 360738 456 NaN NaN 1200_0|| CAT-SINGLE || Inc(>=10): 3(102157) ||... ACE touchscreen question "Do you have trouble ...
2 1220 Daytime dozing / sleeping (narcolepsy) ordinal phesant 359752 1442 NaN NaN 1220_0|| CAT-SINGLE || reassignments: 3=2 || I... ACE touchscreen question "How likely are you t...
3 1920 Mood swings binary phesant 352604 8590 193622.0 158982.0 1920_0|| CAT-SINGLE || Inc(>=10): 0(193622) ||... ACE touchscreen question "Does your mood often...
4 1970 Nervous feelings binary phesant 351829 9365 268709.0 83120.0 1970_0|| CAT-SINGLE || Inc(>=10): 0(268709) ||... ACE touchscreen question "Would you call yours...
... ... ... ... ... ... ... ... ... ... ...
312 T79 Diagnoses - main ICD10: T79 Certain early comp... binary icd10 361194 0 360989.0 205.0 NaN NaN
313 TRAUMBRAIN_NONCONCUS severe traumatic brain injury, does not includ... binary finngen 361194 0 360631.0 563.0 NaN NaN
314 VI_NERVOUS Diseases of the nervous system binary finngen 361194 0 339871.0 21323.0 NaN NaN
315 V_MENTAL_BEHAV Mental and behavioural disorders binary finngen 361194 0 356892.0 4302.0 NaN NaN
316 Z43 Diagnoses - main ICD10: Z43 Attention to artif... binary icd10 361194 0 359838.0 1356.0 NaN NaN

317 rows × 10 columns

Code
assoc_file['1160']
'/gpfs/commons/home/sbanerjee/npddata/ukbb.imputed_v3.neale/3_all_assoc/1160.tsv'
Code
pd.read_csv(assoc_file['1160'], sep='\t', header = None, names = ['variant', 'low_confidence', 'beta', 'se', 'tstat', 'pval'])
variant low_confidence beta se tstat pval
0 1:2073742:A:T False -0.002622 0.007306 -0.358873 0.719690
1 1:2094006:A:G False -0.001466 0.007288 -0.201181 0.840557
2 1:2094007:C:G False -0.002688 0.007319 -0.367227 0.713450
3 1:2108792:C:T False -0.002112 0.007679 -0.274978 0.783333
4 1:2111080:C:T False -0.000891 0.007655 -0.116456 0.907291
... ... ... ... ... ... ...
41988 22:41812084:G:A False 0.006255 0.002175 2.875620 0.004033
41989 22:41812439:A:G False 0.006235 0.002175 2.866260 0.004154
41990 22:41812755:C:T False 0.006234 0.002175 2.865640 0.004162
41991 22:41812819:T:C False 0.006261 0.002175 2.878240 0.003999
41992 22:46994296:T:A True 0.014798 0.021522 0.687576 0.491720

41993 rows × 6 columns

Code
assoc = {}

colnames = ['variant', 'low_confidence', 'beta', 'se', 'tstat', 'pval']

for pid in phenotype_ids:
    print (f"Read summary statistics for {pid}")
    assoc[pid] = pd.read_csv(assoc_file[pid], sep = '\t', names = colnames)
    assoc[pid]['phenotype_id'] = pid
Read summary statistics for 1160
Read summary statistics for 1200
Read summary statistics for 1220
Read summary statistics for 1920
Read summary statistics for 1970
Read summary statistics for 1980
Read summary statistics for 20002_1123
Read summary statistics for 20002_1240
Read summary statistics for 20002_1243
Read summary statistics for 20002_1246
Read summary statistics for 20002_1249
Read summary statistics for 20002_1250
Read summary statistics for 20002_1251
Read summary statistics for 20002_1254
Read summary statistics for 20002_1255
Read summary statistics for 20002_1256
Read summary statistics for 20002_1257
Read summary statistics for 20002_1258
Read summary statistics for 20002_1261
Read summary statistics for 20002_1262
Read summary statistics for 20002_1264
Read summary statistics for 20002_1265
Read summary statistics for 20002_1267
Read summary statistics for 20002_1279
Read summary statistics for 20002_1286
Read summary statistics for 20002_1287
Read summary statistics for 20002_1288
Read summary statistics for 20002_1289
Read summary statistics for 20002_1291
Read summary statistics for 20002_1384
Read summary statistics for 20002_1394
Read summary statistics for 20002_1425
Read summary statistics for 20002_1433
Read summary statistics for 20002_1434
Read summary statistics for 20002_1435
Read summary statistics for 20002_1436
Read summary statistics for 20002_1437
Read summary statistics for 20002_1468
Read summary statistics for 20002_1469
Read summary statistics for 20002_1470
Read summary statistics for 20002_1478
Read summary statistics for 20002_1482
Read summary statistics for 20002_1523
Read summary statistics for 20002_1525
Read summary statistics for 20002_1526
Read summary statistics for 20002_1536
Read summary statistics for 20002_1541
Read summary statistics for 20002_1550
Read summary statistics for 20002_1614
Read summary statistics for 20002_1616
Read summary statistics for 20002_1683
Read summary statistics for 20019_irnt
Read summary statistics for 20021_irnt
Read summary statistics for 20107_10
Read summary statistics for 20107_11
Read summary statistics for 20107_12
Read summary statistics for 2010
Read summary statistics for 20110_10
Read summary statistics for 20110_11
Read summary statistics for 20110_12
Read summary statistics for 20111_10
Read summary statistics for 20111_11
Read summary statistics for 20111_12
Read summary statistics for 20112_10
Read summary statistics for 20113_10
Read summary statistics for 20113_12
Read summary statistics for 20114_12
Read summary statistics for 20122
Read summary statistics for 20126_0
Read summary statistics for 20126_1
Read summary statistics for 20126_2
Read summary statistics for 20126_3
Read summary statistics for 20126_4
Read summary statistics for 20126_5
Read summary statistics for 20127_irnt
Read summary statistics for 20417
Read summary statistics for 20418
Read summary statistics for 20419
Read summary statistics for 20421
Read summary statistics for 20422
Read summary statistics for 20423
Read summary statistics for 20426
Read summary statistics for 20427
Read summary statistics for 20428
Read summary statistics for 20429
Read summary statistics for 20432
Read summary statistics for 20435
Read summary statistics for 20436
Read summary statistics for 20437
Read summary statistics for 20438
Read summary statistics for 20439
Read summary statistics for 20440
Read summary statistics for 20445
Read summary statistics for 20446
Read summary statistics for 20447
Read summary statistics for 20448
Read summary statistics for 20449
Read summary statistics for 20450
Read summary statistics for 20462
Read summary statistics for 20466
Read summary statistics for 20467
Read summary statistics for 20477
Read summary statistics for 20488
Read summary statistics for 20493
Read summary statistics for 20495
Read summary statistics for 20497
Read summary statistics for 20498
Read summary statistics for 20499
Read summary statistics for 20500
Read summary statistics for 20501
Read summary statistics for 20506
Read summary statistics for 2050
Read summary statistics for 20510
Read summary statistics for 20517
Read summary statistics for 20532
Read summary statistics for 20533
Read summary statistics for 20534
Read summary statistics for 20536_0
Read summary statistics for 20536_1
Read summary statistics for 20536_2
Read summary statistics for 20536_3
Read summary statistics for 20537
Read summary statistics for 20538
Read summary statistics for 20539
Read summary statistics for 20540
Read summary statistics for 20541
Read summary statistics for 20542
Read summary statistics for 20543
Read summary statistics for 20544_10
Read summary statistics for 20544_11
Read summary statistics for 20544_12
Read summary statistics for 20544_13
Read summary statistics for 20544_14
Read summary statistics for 20544_15
Read summary statistics for 20544_16
Read summary statistics for 20544_17
Read summary statistics for 20544_1
Read summary statistics for 20544_2
Read summary statistics for 20544_3
Read summary statistics for 20544_4
Read summary statistics for 20544_5
Read summary statistics for 20544_6
Read summary statistics for 20544_7
Read summary statistics for 20546_1
Read summary statistics for 20546_3
Read summary statistics for 20546_4
Read summary statistics for 20547_1
Read summary statistics for 20547_3
Read summary statistics for 20548_1
Read summary statistics for 20548_2
Read summary statistics for 20548_3
Read summary statistics for 20548_5
Read summary statistics for 20548_6
Read summary statistics for 20548_7
Read summary statistics for 20548_8
Read summary statistics for 20548_9
Read summary statistics for 20549_1
Read summary statistics for 20549_3
Read summary statistics for 20549_4
Read summary statistics for 20550_1
Read summary statistics for 20550_3
Read summary statistics for 20551_1
Read summary statistics for 20552_1
Read summary statistics for 20552_2
Read summary statistics for 20554_1
Read summary statistics for 2090
Read summary statistics for 2100
Read summary statistics for 3799
Read summary statistics for 40001_C719
Read summary statistics for 40001_G122
Read summary statistics for 41215_1
Read summary statistics for 41218_1
Read summary statistics for 41218_2
Read summary statistics for 41248_3001
Read summary statistics for 41248_5003
Read summary statistics for 4620
Read summary statistics for 4642
Read summary statistics for 4968
Read summary statistics for 4990
Read summary statistics for 5012
Read summary statistics for 5663
Read summary statistics for 5674
Read summary statistics for 5699
Read summary statistics for 6145_1
Read summary statistics for 6145_2
Read summary statistics for 6145_3
Read summary statistics for 6145_4
Read summary statistics for 6145_5
Read summary statistics for 6145_6
Read summary statistics for 6156_11
Read summary statistics for 6156_12
Read summary statistics for 6156_13
Read summary statistics for 6156_14
Read summary statistics for 6156_15
Read summary statistics for 6159_1
Read summary statistics for AB1_VIRAL_CNS
Read summary statistics for AD
Read summary statistics for AMN2
Read summary statistics for C3_BRAIN
Read summary statistics for C3_EYE_BRAIN_NEURO
Read summary statistics for C71
Read summary statistics for C_BRAIN
Read summary statistics for C_EYE_BRAIN_NEURO
Read summary statistics for D33
Read summary statistics for D43
Read summary statistics for F05
Read summary statistics for F10
Read summary statistics for F20
Read summary statistics for F31
Read summary statistics for F32
Read summary statistics for F33
Read summary statistics for F41
Read summary statistics for F43
Read summary statistics for F45
Read summary statistics for F52
Read summary statistics for F5_ALCOHOLAC
Read summary statistics for F5_ALLANXIOUS
Read summary statistics for F5_ANXIETY
Read summary statistics for F5_BEHAVE
Read summary statistics for F5_DEMENTIA
Read summary statistics for F5_DEPRESSIO
Read summary statistics for F5_MOOD
Read summary statistics for F5_PANIC
Read summary statistics for F5_PERSONALITY
Read summary statistics for F5_SCHIZO
Read summary statistics for F99
Read summary statistics for G12
Read summary statistics for G20
Read summary statistics for G24
Read summary statistics for G35
Read summary statistics for G37
Read summary statistics for G40
Read summary statistics for G43
Read summary statistics for G44
Read summary statistics for G45
Read summary statistics for G47
Read summary statistics for G50
Read summary statistics for G51
Read summary statistics for G54
Read summary statistics for G56
Read summary statistics for G57
Read summary statistics for G58
Read summary statistics for G61
Read summary statistics for G62
Read summary statistics for G6_ALS
Read summary statistics for G6_BELLPA
Read summary statistics for G6_CARPTU
Read summary statistics for G6_CONCUS
Read summary statistics for G6_CPETAL
Read summary statistics for G6_DEGENOTH
Read summary statistics for G6_DEMYEL
Read summary statistics for G6_DIFBRAININJ
Read summary statistics for G6_DISBROTHUNS
Read summary statistics for G6_EPIPAROX
Read summary statistics for G6_GUILBAR
Read summary statistics for G6_HYDROCEPH
Read summary statistics for G6_ICTRAUOTHUNS
Read summary statistics for G6_INTRACRATRAUMA
Read summary statistics for G6_MONOLOWOTHUNS
Read summary statistics for G6_MONOOTHUNS
Read summary statistics for G6_MYONEU
Read summary statistics for G6_NERPLEX
Read summary statistics for G6_NEUATR
Read summary statistics for G6_NEUINFL
Read summary statistics for G6_NEURODEG
Read summary statistics for G6_PLANTAR
Read summary statistics for G6_POLYNEU
Read summary statistics for G6_POLYOTHUNS
Read summary statistics for G6_ROOTPLEXOTHUNS
Read summary statistics for G6_SLEEPAPNO
Read summary statistics for G6_TRINEU
Read summary statistics for G6_ULLNLE
Read summary statistics for G6_XTRAPYR
Read summary statistics for G81
Read summary statistics for G93
Read summary statistics for G95
Read summary statistics for H49
Read summary statistics for H7_PARASTRAB
Read summary statistics for H8_BPV
Read summary statistics for I45
Read summary statistics for I9_ANEURYSM
Read summary statistics for I9_CONDUCTIO
Read summary statistics for I9_INTRACRA
Read summary statistics for KRA_PSY_ALCOH
Read summary statistics for KRA_PSY_ANXIETY
Read summary statistics for KRA_PSY_ANYMENTAL
Read summary statistics for KRA_PSY_ANYMENTAL_SUICID
Read summary statistics for KRA_PSY_DEMENTIA
Read summary statistics for KRA_PSY_MOOD
Read summary statistics for KRA_PSY_PERSON
Read summary statistics for KRA_PSY_SCHIZODEL
Read summary statistics for KRA_PSY_SUBSTANCE
Read summary statistics for M13_ALGONEURO
Read summary statistics for M13_CERVICALGIA
Read summary statistics for M13_GANGLION
Read summary statistics for M13_NEURALGIA
Read summary statistics for M13_SPINSTENOSIS
Read summary statistics for M13_THORACISPINEPAIN
Read summary statistics for M50
Read summary statistics for NEURODEGOTH
Read summary statistics for O68
Read summary statistics for PULM_ANXIETY
Read summary statistics for R11
Read summary statistics for R29
Read summary statistics for R41
Read summary statistics for R47
Read summary statistics for R51
Read summary statistics for R53
Read summary statistics for R56
Read summary statistics for R90
Read summary statistics for SFN
Read summary statistics for SLEEP
Read summary statistics for T79
Read summary statistics for TRAUMBRAIN_NONCONCUS
Read summary statistics for VI_NERVOUS
Read summary statistics for V_MENTAL_BEHAV
Read summary statistics for Z43
Code
assoc_df = pd.concat([v for k,v in assoc.items()])
assoc_df.to_pickle("../data/ukbb_assoc.pkl")
assoc_df
variant low_confidence beta se tstat pval phenotype_id
0 1:2073742:A:T False -0.002622 0.007306 -0.358873 0.719690 1160
1 1:2094006:A:G False -0.001466 0.007288 -0.201181 0.840557 1160
2 1:2094007:C:G False -0.002688 0.007319 -0.367227 0.713450 1160
3 1:2108792:C:T False -0.002112 0.007679 -0.274978 0.783333 1160
4 1:2111080:C:T False -0.000891 0.007655 -0.116456 0.907291 1160
... ... ... ... ... ... ... ...
41988 22:41812084:G:A False -0.000076 0.000177 -0.431360 0.666207 Z43
41989 22:41812439:A:G False -0.000076 0.000177 -0.431209 0.666317 Z43
41990 22:41812755:C:T False -0.000076 0.000177 -0.428950 0.667960 Z43
41991 22:41812819:T:C False -0.000076 0.000177 -0.431143 0.666365 Z43
41992 22:46994296:T:A True 0.000667 0.001755 0.380176 0.703815 Z43

13311781 rows × 7 columns

Code
assoc_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 13311781 entries, 0 to 41992
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   variant         object 
 1   low_confidence  bool   
 2   beta            float64
 3   se              float64
 4   tstat           float64
 5   pval            float64
 6   phenotype_id    object 
dtypes: bool(1), float64(4), object(2)
memory usage: 723.6+ MB
Code
print (f"Number of unique SNPs: {len(assoc_df['variant'].unique())}")
print (f"Number of unique studies: {len(assoc_df['phenotype_id'].unique())}")
Number of unique SNPs: 41993
Number of unique studies: 317

Filter SNPs (indel, rare, low info)

Code
variant_colnames = ['variant', 'chr', 'pos', 'ref', 'alt', 'rsid', 'varid', 
                    'consequence', 'consequence_category', 'info', 'call_rate',
                    'AC', 'AF', 'minor_allele', 'minor_AF', 'p_hwe', 
                    'n_called', 'n_not_called', 'n_hom_ref', 'n_het', 'n_hom_var', 'n_non_ref',
                    'r_heterozygosity', 'r_het_hom_var', 'r_expected_het_frequency']
variant_df = pd.read_csv(variants_metafile, sep = '\t', names = variant_colnames)
Code
variant_df
variant chr pos ref alt rsid varid consequence consequence_category info ... p_hwe n_called n_not_called n_hom_ref n_het n_hom_var n_non_ref r_heterozygosity r_het_hom_var r_expected_het_frequency
0 1:2073742:A:T 1 2073742 A T rs551766141 1:2073742_A_T non_coding_transcript_exon_variant non_coding 0.992179 ... 0.604866 361194 0 350078 11025 91 11116 0.030524 121.154000 0.030546
1 1:2094006:A:G 1 2094006 A G rs190347047 rs190347047 intron_variant non_coding 1.000000 ... 0.263827 361194 0 350089 11008 97 11105 0.030477 113.485000 0.030533
2 1:2094007:C:G 1 2094007 C G rs182067703 1:2094007_C_G intron_variant non_coding 0.998611 ... 0.763051 361194 0 350145 10961 88 11049 0.030347 124.557000 0.030359
3 1:2108792:C:T 1 2108792 C T rs116253512 1:2108792_C_T intron_variant non_coding 0.974810 ... 0.660102 361194 0 350872 10244 78 10322 0.028362 131.333000 0.028379
4 1:2111080:C:T 1 2111080 C T rs116005884 1:2111080_C_T intron_variant non_coding 0.975114 ... 0.703785 361194 0 350845 10271 78 10349 0.028436 131.679000 0.028452
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41988 22:41812084:G:A 22 41812084 G A rs132922 22:41812084_G_A intergenic_variant non_coding 0.999802 ... 0.179555 361194 0 17461 124540 219193 343733 0.344801 0.568175 0.344031
41989 22:41812439:A:G 22 41812439 A G rs202661 22:41812439_A_G intergenic_variant non_coding 0.999845 ... 0.174893 361194 0 17459 124542 219193 343735 0.344806 0.568184 0.344028
41990 22:41812755:C:T 22 41812755 C T rs202662 22:41812755_C_T intergenic_variant non_coding 0.999862 ... 0.176463 361194 0 17463 124548 219183 343731 0.344823 0.568238 0.344050
41991 22:41812819:T:C 22 41812819 T C rs202663 22:41812819_T_C intergenic_variant non_coding 0.999905 ... 0.177995 361194 0 17461 124542 219191 343733 0.344806 0.568189 0.344034
41992 22:46994296:T:A 22 46994296 T A rs536520098 22:46994296_T_A intron_variant non_coding 0.963021 ... 0.486537 361194 0 359880 1314 0 1314 0.003638 NaN 0.003631

41993 rows × 25 columns

Code
def count_nucleotides(row):
    return len(row['ref']) + len(row['alt'])

variant_df['ntcount'] = variant_df.apply(count_nucleotides, axis = 1)
variant_df_filtered = variant_df[(variant_df['ntcount'] == 2) 
                               & (variant_df['minor_AF'] >= 0.05)
                               & (variant_df['info'] >= 0.8)].drop(columns = ['ntcount'])
Code
variant_df_filtered
variant chr pos ref alt rsid varid consequence consequence_category info ... p_hwe n_called n_not_called n_hom_ref n_het n_hom_var n_non_ref r_heterozygosity r_het_hom_var r_expected_het_frequency
9 1:2978043:G:C 1 2978043 G C rs2075969 1:2978043_G_C non_coding_transcript_exon_variant non_coding 0.992771 ... 0.103637 361194 0 114375 177327 69492 246819 0.490947 2.551760 0.492280
10 1:2984087:C:A 1 2984087 C A rs2297829 1:2984087_C_A non_coding_transcript_exon_variant non_coding 0.980053 ... 0.146547 361194 0 148683 165801 46710 212511 0.459036 3.549580 0.460148
11 1:3065568:C:T 1 3065568 C T rs61759161 1:3065568_C_T intron_variant non_coding 0.897850 ... 0.009582 361183 11 215504 127329 18350 145679 0.352533 6.938910 0.351021
12 1:3066761:A:T 1 3066761 A T rs10909886 1:3066761_A_T intron_variant non_coding 0.904227 ... 0.021327 361187 7 214854 127752 18581 146333 0.353700 6.875410 0.352353
13 1:3070634:C:G 1 3070634 C G rs207195 1:3070634_C_G intron_variant non_coding 0.958558 ... 0.454596 361193 1 116581 177438 67174 244612 0.491255 2.641470 0.490645
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41987 22:41812044:G:A 22 41812044 G A rs132921 22:41812044_G_A intergenic_variant non_coding 0.999884 ... 0.178013 361194 0 17463 124547 219184 343731 0.344820 0.568230 0.344048
41988 22:41812084:G:A 22 41812084 G A rs132922 22:41812084_G_A intergenic_variant non_coding 0.999802 ... 0.179555 361194 0 17461 124540 219193 343733 0.344801 0.568175 0.344031
41989 22:41812439:A:G 22 41812439 A G rs202661 22:41812439_A_G intergenic_variant non_coding 0.999845 ... 0.174893 361194 0 17459 124542 219193 343735 0.344806 0.568184 0.344028
41990 22:41812755:C:T 22 41812755 C T rs202662 22:41812755_C_T intergenic_variant non_coding 0.999862 ... 0.176463 361194 0 17463 124548 219183 343731 0.344823 0.568238 0.344050
41991 22:41812819:T:C 22 41812819 T C rs202663 22:41812819_T_C intergenic_variant non_coding 0.999905 ... 0.177995 361194 0 17461 124542 219191 343733 0.344806 0.568189 0.344034

37395 rows × 25 columns

Code
assoc_df_fvar = variant_df_filtered[['variant', 'rsid']].merge(assoc_df, on = ['variant'], how = 'inner')
Code
print (f"Number of unique SNPs: {len(assoc_df_fvar['variant'].unique())}")
print (f"Number of unique studies: {len(assoc_df_fvar['phenotype_id'].unique())}")
Number of unique SNPs: 37395
Number of unique studies: 317
Code
zscore_df = assoc_df_fvar[['variant', 'phenotype_id', 'tstat']].pivot(index = 'variant', columns = 'phenotype_id', values = 'tstat').rename_axis(None, axis = 0).rename_axis(None, axis = 1)
beta_df   = assoc_df_fvar[['variant', 'phenotype_id', 'beta']].pivot(index = 'variant', columns = 'phenotype_id', values = 'beta').rename_axis(None, axis = 0).rename_axis(None, axis = 1)
se_df     = assoc_df_fvar[['variant', 'phenotype_id', 'se']].pivot(index = 'variant', columns = 'phenotype_id', values = 'se').rename_axis(None, axis = 0).rename_axis(None, axis = 1)
Code
zscore_df
1160 1200 1220 1920 1970 1980 20002_1123 20002_1240 20002_1243 20002_1246 ... R53 R56 R90 SFN SLEEP T79 TRAUMBRAIN_NONCONCUS VI_NERVOUS V_MENTAL_BEHAV Z43
10:100000625:A:G 0.501981 -0.525399 1.469430 2.32670 4.11523 2.423000 -1.383980 -0.798416 1.501170 0.99798 ... 1.031720 0.563788 1.543120 -0.732158 -0.639260 0.041272 0.122796 -3.657790 1.372840 0.936180
10:100003785:T:C 0.727118 -0.838564 -0.011967 -2.23214 -1.18325 -0.279210 -0.208375 -0.994745 -0.626474 -1.53071 ... -0.738045 1.121560 0.602487 -0.086688 -0.193665 0.912439 0.122715 3.345200 -1.647860 -0.641863
10:100004441:G:C -0.728567 0.914938 0.070534 2.30550 1.20813 0.318050 0.150605 1.014940 0.667526 1.56572 ... 0.764725 -1.095360 -0.706028 0.112536 0.206682 -0.879437 -0.133474 -3.253040 1.707040 0.687995
10:100004906:C:A 0.504855 -0.523555 1.469210 2.32101 4.11411 2.419160 -1.382470 -0.798095 1.502150 0.99878 ... 1.032040 0.564664 1.543560 -0.732081 -0.637166 0.041860 0.123797 -3.650540 1.375100 0.937855
10:100004996:G:A 0.733441 -0.843107 -0.008906 -2.22630 -1.18083 -0.287227 -0.205852 -0.994113 -0.626903 -1.52962 ... -0.746129 1.123710 0.603249 -0.085872 -0.189963 0.913379 0.124349 3.348560 -1.648650 -0.641694
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9:98265901:A:G -0.785029 0.118622 2.643240 4.30311 3.35008 5.088650 -0.048185 0.606501 0.197470 1.65606 ... 0.830156 1.567530 1.371840 -0.001408 0.609074 -0.196145 -0.200227 -0.354429 0.785742 -1.053890
9:98266855:T:A -0.615773 -0.188766 2.571380 4.24034 3.62091 4.711810 0.039514 0.608059 0.151668 1.40410 ... 0.704177 1.213300 1.207820 0.050593 1.022050 -0.416852 -0.208696 -0.352999 1.217760 -1.058050
9:98273305:T:G -0.846092 0.070976 2.650630 4.32363 3.35409 5.113290 -0.051482 0.623136 0.203378 1.74384 ... 0.865338 1.432780 1.166460 0.018348 0.479160 -0.233870 -0.179549 -0.483875 0.839336 -0.996326
9:98275789:C:T -0.687039 -0.247633 2.569970 4.27250 3.63573 4.751370 0.036028 0.624772 0.184598 1.47482 ... 0.741583 1.084520 1.003140 0.071251 0.892456 -0.443611 -0.191688 -0.461093 1.284170 -0.991163
9:98278413:C:T -0.819318 -0.267375 2.656800 4.05938 3.62366 4.654410 0.006371 1.033790 0.106624 1.35686 ... 0.849630 0.951746 0.913949 -0.017823 0.924901 -0.552934 -0.289266 -0.395223 1.057020 -0.924260

37395 rows × 317 columns

Code
mean_se  = se_df.median(axis = 0, skipna = True)
mean_se  = pd.DataFrame(mean_se).set_axis(["mean_se"], axis = 1)
beta_std = beta_df.std(axis = 0, skipna = True)
beta_std = pd.DataFrame(beta_std).set_axis(["beta_std"], axis = 1)
error_df = pd.concat([mean_se, beta_std], axis = 1)

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(np.log10(error_df['beta_std']), np.log10(error_df['mean_se']), alpha = 0.5, s = 100)
mpl_utils.set_xticks(ax1, scale = 'log10', spacing = 'log10')
mpl_utils.set_yticks(ax1, scale = 'log10', spacing = 'log10')
mpl_utils.plot_diag(ax1)

keep_columns = error_df.query("mean_se <= 0.2").index
for pid in error_df.index.to_list():
    if pid not in keep_columns:
        pid_text = f"{pid} / {phenotype_dict[pid]}"
        xval = np.log10(error_df.loc[pid]['beta_std'])
        yval = np.log10(error_df.loc[pid]['mean_se'])
        ax1.annotate(pid_text, (xval, yval))

ax1.set_xlabel(r"Standard Deviation of mean $\beta$")
ax1.set_ylabel(r"Median of SE")
plt.show()
Figure 1: Calibration of SE against std of beta
Code
zscore_df.to_pickle("../data/ukbb_zscore_df.pkl")