Background

Trans-eQTLs can be confounded by population substructure. To get an idea, I am looking at the fixation index of the trans-eQTLs. I calculated $F_{ST}$ from the 1000 Genome Project using 668 EUR (including CEU, FIN, GBR, IBS, TSI) subjects and 806 AFR (including ESN, GWD, LWK, MSL, YRI) subjects. I used the mapping between GTEx variant IDs and dbGap 151 hg38 mappings for rsids. I used the hg38 mapped version of 1KG.

Path to all data files

#collapse-hide
teqtl_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/trans_eqtls/trans_eqtls_EUR_AFR_fst.txt'
gwas_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/gwas/gwas_SNPs_EUR_AFR_fst.txt'
random_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/trans_eqtls/random_snps_EUR_AFR_fst.txt'
chr1_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/fst/EUR-AFR_chr1.weir.fst'
teqtl_results = '/usr/users/sbanerj/trans-eQTL/gtex_v8_tejaas_trans_eQTLs_20200309/protein_coding_lncRNA_fixed_gamma_alt-haa-spl-pan-wb_combined_cut5e-8'
chr1_rr_file = '/scratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_202003/as/tejaas/raw_std/permnull_sb0.1_knn30/chr1/rr.txt'
tissue_file = 'gtex/tissue_table.txt'
json_file = 'gtex/gtex_v8_metadata.json'
nsample_file = "gtex/tissue_nsamples.txt"
popsample_file = 'gtex/tissue_nsamples_with_pop.txt'

Python libraries

#collapse-hide
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
from utils import utils

Read metadata for our trans-eQTLs

#collapse-hide
tshorts, tfulls, tstrings = utils.read_tissues(tissue_file)
with open(json_file) as instream:
    gtex_meta = json.load(instream)
tissue_colors = dict()
tissue_names = dict()
tissue_nsamples = dict()

for tshort, tfull, tstring in zip(tshorts, tfulls, tstrings):
    if tshort in tshorts:
        tissue_names[tshort] = tstring
        tissue_colors[tshort] = "#" + gtex_meta[tfull]["colorHex"]
        
tissue_nsamples = dict()       
tissue_noneur_samples = dict()
tissue_afr_samples = dict()
tissue_frac_noneur_to_eur = dict()
with open(popsample_file, 'r') as instream:
    header = instream.readline().split()
    totidx = header.index('TOT')
    euridx = header.index('EUR')
    afridx = header.index('AFR')
    for line in instream:
        linesplit = line.strip().split()
        tshort = linesplit[0].strip()
        ntot = int(linesplit[totidx])
        neur = int(linesplit[euridx])
        tissue_nsamples[tshort] = ntot
        tissue_noneur_samples[tshort] = ntot - neur
        tissue_afr_samples[tshort] = int(linesplit[afridx])
        tissue_frac_noneur_to_eur[tshort] = (ntot - neur) / neur
        
brain_tissues = ['bam', 'ban', 'bca', 'bceh', 'bce', 'bco', 'bfr', 'bhi', 'bhy', 'bnu', 'bpu', 'bsp', 'bsu']
altsb_tissues = ['haa', 'pan', 'spl', 'wb']

Define some functions to read results

#collapse-hide
def get_fstmap(fst_file, headerline = 0):
    fstmap = dict()
    for c in range(1,23):
        fstmap[f'chr{c}'] = {}

    with open(fst_file, 'r') as instream:
        for line in instream:
            linesplit = line.strip().split()
            chrm = int(linesplit[0])
            bppos = int(linesplit[1])
            fst = float(linesplit[2])
            if np.isnan(fst): fst = 0
            fstmap[f'chr{chrm}'][bppos] = fst
    return fstmap

def read_transeqtls(filename):
    idcol = 0
    chrcol = 1
    bpcol = 2
    pvcol = 7
    rsidlist = list()
    chrmlist = list()
    bplist   = list()
    pvallist = list()
    with open(filename, 'r') as instream:
        next(instream)
        for line in instream:
            linesplit = line.strip().split()
            rsid = linesplit[idcol]
            chrm = int(linesplit[chrcol])
            bppos = int(linesplit[bpcol])
            pval = float(linesplit[pvcol])
            rsidlist.append(rsid)
            chrmlist.append(chrm)
            bplist.append(bppos)
            pvallist.append(pval)
    return rsidlist, chrmlist, pvallist, bplist

Read the results for trans-eQTLs

#collapse-hide
teqtl_fstmap = get_fstmap(teqtl_fst_file)
random_fstmap = get_fstmap(random_fst_file)
gwas_fstmap = get_fstmap(gwas_fst_file)
teqtl_fst = [x for c in range(1,23) for k, x in teqtl_fstmap[f'chr{c}'].items()]
random_fst = [x for c in range(1,23) for k, x in random_fstmap[f'chr{c}'].items()]
gwas_fst = [x for c in range(1,23) for k, x in gwas_fstmap[f'chr{c}'].items()]

mean_fst_per_tissue = dict()
tissue_nteqtls_with_fst = dict()
for tshort in tshorts:
    _fst = list()
    resfile = os.path.join(teqtl_results, tshort, 'trans_eqtls_ldpruned.txt')
    gwvarids, gwchrms, gwpvals, gwbppos = read_transeqtls(resfile)
    for i, varid in enumerate(gwvarids):
        chrm = gwchrms[i]
        bppos = gwbppos[i]
        if bppos in teqtl_fstmap[f'chr{chrm}']:
            _fst.append(teqtl_fstmap[f'chr{chrm}'][bppos])
    if _fst:
        tissue_nteqtls_with_fst[tshort] = len(_fst)
        mean_fst_per_tissue[tshort] = np.mean(np.array(_fst))

Read the results for p-values

#collapse-hide
chr1_fstmap = get_fstmap(chr1_fst_file)

pvalue_lowfst = list()
pvalue_highfst = list()

gwvarids, gwchrms, gwpvals, gwbppos = read_transeqtls(chr1_rr_file)
for i, varid in enumerate(gwvarids):
    chrm = gwchrms[i]
    bppos = gwbppos[i]
    if bppos in chr1_fstmap[f'chr{chrm}']:
        this_fst = chr1_fstmap[f'chr{chrm}'][bppos]
        if this_fst <= 0.5:
            pvalue_lowfst.append(gwpvals[i])
        else:
            pvalue_highfst.append(gwpvals[i])

Plot

Compare $F_{ST}$ distributions for all our trans-eQTLs with random SNPs.
Check if the mean of $F_{ST}$ depends on the number of trans-eQTLs discovered.

Right panel: color according to GTEx tissues, size according to number of samples

#collapse-hide
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

ax1.hist(gwas_fst, alpha = 0.7, bins = 50, lw = 0, density = True, label = 'GWAS SNPs')
ax1.hist(random_fst, alpha = 0.5, bins = 50, lw = 0, density = True, label = 'Random SNPs')
ax1.hist(teqtl_fst, alpha = 0.7, bins = 50, lw = 0, density = True, label = 'Trans-eQTLs')

ax1.legend()
ax1.set_xlabel('$F_{ST}$')
ax1.set_ylabel('Density')

whichtissues = [x for x in tshorts if x in mean_fst_per_tissue]
yvals = [mean_fst_per_tissue[x] for x in whichtissues]
xvals = [tissue_frac_noneur_to_eur[x] for x in whichtissues]
# xvals = [np.log2(tissue_nteqtls_with_fst[x]) for x in whichtissues]
sizes = [tissue_nsamples[x] / 2 for x in whichtissues]
# sizes = [tissue_sample_frac_noneur_to_eur[x] * 1000 for x in whichtissues]
mcolors = [tissue_colors[x] for x in whichtissues]
ax2.scatter(xvals, yvals, color = mcolors, s = sizes, edgecolors = 'black', zorder = 10)
ax2.set_xlabel('non-European / European')
ax2.set_ylabel('Mean $F_{ST}$')
# xticks = [2, 4, 6, 8, 10, 12]
# ax2.set_xticks(xticks)
# ax2.set_xticklabels([int(2**x) for x in xticks])

plt.tight_layout()
plt.show()

Compare Tejaas P-values of SNPs with high $F_{ST}$ and low $F_{ST}$

#collapse-hide
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(111)

ax1.hist(pvalue_highfst, bins = 100, lw = 0, density = True, alpha = 0.7, label = "High $F_{ST}$")
ax1.hist(pvalue_lowfst, bins = 100, lw = 0, density = True, alpha = 0.7, label = "Low $F_{ST}$")
ax1.set_xlabel("P-value")
ax1.set_ylabel("Density")
ax1.legend()

plt.tight_layout()
plt.show()