Estimating fixation index in trans-eQTLs
Background
Trans-eQTLs can be confounded by population substructure. To get an idea, I am looking at the fixation index of the trans-eQTLs. I calculated $F_{ST}$ from the 1000 Genome Project using 668 EUR (including CEU, FIN, GBR, IBS, TSI) subjects and 806 AFR (including ESN, GWD, LWK, MSL, YRI) subjects. I used the mapping between GTEx variant IDs and dbGap 151 hg38 mappings for rsids. I used the hg38 mapped version of 1KG.
Path to all data files
#collapse-hide
teqtl_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/trans_eqtls/trans_eqtls_EUR_AFR_fst.txt'
gwas_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/gwas/gwas_SNPs_EUR_AFR_fst.txt'
random_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/trans_eqtls/random_snps_EUR_AFR_fst.txt'
chr1_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/fst/EUR-AFR_chr1.weir.fst'
teqtl_results = '/usr/users/sbanerj/trans-eQTL/gtex_v8_tejaas_trans_eQTLs_20200309/protein_coding_lncRNA_fixed_gamma_alt-haa-spl-pan-wb_combined_cut5e-8'
chr1_rr_file = '/scratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_202003/as/tejaas/raw_std/permnull_sb0.1_knn30/chr1/rr.txt'
tissue_file = 'gtex/tissue_table.txt'
json_file = 'gtex/gtex_v8_metadata.json'
nsample_file = "gtex/tissue_nsamples.txt"
popsample_file = 'gtex/tissue_nsamples_with_pop.txt'
Python libraries
#collapse-hide
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
from utils import utils
Read metadata for our trans-eQTLs
#collapse-hide
tshorts, tfulls, tstrings = utils.read_tissues(tissue_file)
with open(json_file) as instream:
gtex_meta = json.load(instream)
tissue_colors = dict()
tissue_names = dict()
tissue_nsamples = dict()
for tshort, tfull, tstring in zip(tshorts, tfulls, tstrings):
if tshort in tshorts:
tissue_names[tshort] = tstring
tissue_colors[tshort] = "#" + gtex_meta[tfull]["colorHex"]
tissue_nsamples = dict()
tissue_noneur_samples = dict()
tissue_afr_samples = dict()
tissue_frac_noneur_to_eur = dict()
with open(popsample_file, 'r') as instream:
header = instream.readline().split()
totidx = header.index('TOT')
euridx = header.index('EUR')
afridx = header.index('AFR')
for line in instream:
linesplit = line.strip().split()
tshort = linesplit[0].strip()
ntot = int(linesplit[totidx])
neur = int(linesplit[euridx])
tissue_nsamples[tshort] = ntot
tissue_noneur_samples[tshort] = ntot - neur
tissue_afr_samples[tshort] = int(linesplit[afridx])
tissue_frac_noneur_to_eur[tshort] = (ntot - neur) / neur
brain_tissues = ['bam', 'ban', 'bca', 'bceh', 'bce', 'bco', 'bfr', 'bhi', 'bhy', 'bnu', 'bpu', 'bsp', 'bsu']
altsb_tissues = ['haa', 'pan', 'spl', 'wb']
Define some functions to read results
#collapse-hide
def get_fstmap(fst_file, headerline = 0):
fstmap = dict()
for c in range(1,23):
fstmap[f'chr{c}'] = {}
with open(fst_file, 'r') as instream:
for line in instream:
linesplit = line.strip().split()
chrm = int(linesplit[0])
bppos = int(linesplit[1])
fst = float(linesplit[2])
if np.isnan(fst): fst = 0
fstmap[f'chr{chrm}'][bppos] = fst
return fstmap
def read_transeqtls(filename):
idcol = 0
chrcol = 1
bpcol = 2
pvcol = 7
rsidlist = list()
chrmlist = list()
bplist = list()
pvallist = list()
with open(filename, 'r') as instream:
next(instream)
for line in instream:
linesplit = line.strip().split()
rsid = linesplit[idcol]
chrm = int(linesplit[chrcol])
bppos = int(linesplit[bpcol])
pval = float(linesplit[pvcol])
rsidlist.append(rsid)
chrmlist.append(chrm)
bplist.append(bppos)
pvallist.append(pval)
return rsidlist, chrmlist, pvallist, bplist
Read the results for trans-eQTLs
#collapse-hide
teqtl_fstmap = get_fstmap(teqtl_fst_file)
random_fstmap = get_fstmap(random_fst_file)
gwas_fstmap = get_fstmap(gwas_fst_file)
teqtl_fst = [x for c in range(1,23) for k, x in teqtl_fstmap[f'chr{c}'].items()]
random_fst = [x for c in range(1,23) for k, x in random_fstmap[f'chr{c}'].items()]
gwas_fst = [x for c in range(1,23) for k, x in gwas_fstmap[f'chr{c}'].items()]
mean_fst_per_tissue = dict()
tissue_nteqtls_with_fst = dict()
for tshort in tshorts:
_fst = list()
resfile = os.path.join(teqtl_results, tshort, 'trans_eqtls_ldpruned.txt')
gwvarids, gwchrms, gwpvals, gwbppos = read_transeqtls(resfile)
for i, varid in enumerate(gwvarids):
chrm = gwchrms[i]
bppos = gwbppos[i]
if bppos in teqtl_fstmap[f'chr{chrm}']:
_fst.append(teqtl_fstmap[f'chr{chrm}'][bppos])
if _fst:
tissue_nteqtls_with_fst[tshort] = len(_fst)
mean_fst_per_tissue[tshort] = np.mean(np.array(_fst))
Read the results for p-values
#collapse-hide
chr1_fstmap = get_fstmap(chr1_fst_file)
pvalue_lowfst = list()
pvalue_highfst = list()
gwvarids, gwchrms, gwpvals, gwbppos = read_transeqtls(chr1_rr_file)
for i, varid in enumerate(gwvarids):
chrm = gwchrms[i]
bppos = gwbppos[i]
if bppos in chr1_fstmap[f'chr{chrm}']:
this_fst = chr1_fstmap[f'chr{chrm}'][bppos]
if this_fst <= 0.5:
pvalue_lowfst.append(gwpvals[i])
else:
pvalue_highfst.append(gwpvals[i])
#collapse-hide
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
ax1.hist(gwas_fst, alpha = 0.7, bins = 50, lw = 0, density = True, label = 'GWAS SNPs')
ax1.hist(random_fst, alpha = 0.5, bins = 50, lw = 0, density = True, label = 'Random SNPs')
ax1.hist(teqtl_fst, alpha = 0.7, bins = 50, lw = 0, density = True, label = 'Trans-eQTLs')
ax1.legend()
ax1.set_xlabel('$F_{ST}$')
ax1.set_ylabel('Density')
whichtissues = [x for x in tshorts if x in mean_fst_per_tissue]
yvals = [mean_fst_per_tissue[x] for x in whichtissues]
xvals = [tissue_frac_noneur_to_eur[x] for x in whichtissues]
# xvals = [np.log2(tissue_nteqtls_with_fst[x]) for x in whichtissues]
sizes = [tissue_nsamples[x] / 2 for x in whichtissues]
# sizes = [tissue_sample_frac_noneur_to_eur[x] * 1000 for x in whichtissues]
mcolors = [tissue_colors[x] for x in whichtissues]
ax2.scatter(xvals, yvals, color = mcolors, s = sizes, edgecolors = 'black', zorder = 10)
ax2.set_xlabel('non-European / European')
ax2.set_ylabel('Mean $F_{ST}$')
# xticks = [2, 4, 6, 8, 10, 12]
# ax2.set_xticks(xticks)
# ax2.set_xticklabels([int(2**x) for x in xticks])
plt.tight_layout()
plt.show()
- Compare Tejaas P-values of SNPs with high $F_{ST}$ and low $F_{ST}$
#collapse-hide
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(111)
ax1.hist(pvalue_highfst, bins = 100, lw = 0, density = True, alpha = 0.7, label = "High $F_{ST}$")
ax1.hist(pvalue_lowfst, bins = 100, lw = 0, density = True, alpha = 0.7, label = "Low $F_{ST}$")
ax1.set_xlabel("P-value")
ax1.set_ylabel("Density")
ax1.legend()
plt.tight_layout()
plt.show()