Background

Tejaas is picking SNPs with high $F_{ST}$ as trans-eQTLs. To dissect the problem, I am comparing the P-values for all SNPs in chromosome 1 obtained from all samples against the p-values obtained from only European samples.

Path to all data files

#collapse-hide
teqtl_all_dir = '/scratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_202003/'
teqtl_eur_dir = '/cbscratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_EUR_202006'
teqtl_amx_dir = '/cbscratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_AMX_202006'
chr1_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/fst/EUR-AFR_chr1.weir.fst'

tissue_file = 'gtex/tissue_table.txt'
json_file = 'gtex/gtex_v8_metadata.json'
nsample_file = "gtex/tissue_nsamples.txt"

Python libraries

#collapse-hide
import os
import numpy as np
import json
import collections
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
from utils import utils

Read metadata for our trans-eQTLs

#collapse-hide
tshorts, tfulls, tstrings = utils.read_tissues(tissue_file)
with open(json_file) as instream:
    gtex_meta = json.load(instream)
tissue_colors = dict()
tissue_names = dict()
tissue_nsamples = dict()

for tshort, tfull, tstring in zip(tshorts, tfulls, tstrings):
    if tshort in tshorts:
        tissue_names[tshort] = tstring
        tissue_colors[tshort] = "#" + gtex_meta[tfull]["colorHex"]
        
tissue_nsamples = dict()
with open(nsample_file, 'r') as instream:
    for line in instream:
        tshort = line.strip().split()[0].strip()
        tissue_nsamples[tshort] = int(line.strip().split()[1].strip())
        
brain_tissues = ['bam', 'ban', 'bca', 'bceh', 'bce', 'bco', 'bfr', 'bhi', 'bhy', 'bnu', 'bpu', 'bsp', 'bsu']
altsb_tissues = ['haa', 'pan', 'spl', 'wb']

Define some functions to read results

#collapse-hide
def read_transeqtls(filename):
    idcol = 0
    chrcol = 1
    bpcol = 2
    pvcol = 7
    rsidlist = list()
    chrmlist = list()
    bplist   = list()
    pvallist = list()
    with open(filename, 'r') as instream:
        next(instream)
        for line in instream:
            linesplit = line.strip().split()
            rsid = linesplit[idcol]
            chrm = int(linesplit[chrcol])
            bppos = int(linesplit[bpcol])
            pval = float(linesplit[pvcol])
            rsidlist.append(rsid)
            chrmlist.append(chrm)
            bplist.append(bppos)
            pvallist.append(pval)
    return rsidlist, chrmlist, pvallist, bplist

def get_fstmap(fst_file, headerline = 0):
    fstmap = dict()
    for c in range(1,23):
        fstmap[f'chr{c}'] = {}

    with open(fst_file, 'r') as instream:
        for line in instream:
            linesplit = line.strip().split()
            chrm = int(linesplit[0])
            bppos = int(linesplit[1])
            fst = float(linesplit[2])
            if np.isnan(fst): fst = 0
            fstmap[f'chr{chrm}'][bppos] = fst
    return fstmap

Read the results for trans-eQTLs

#collapse-hide
tshort = 'as'

RES_FIELDS = ['pall', 'peur', 'fst']
class CompareRes(collections.namedtuple('_CompareRes', RES_FIELDS)):
    __slots__ = ()
    
chr1_fstmap = get_fstmap(chr1_fst_file)['chr1']

eur_res = list()
rr_file_all = os.path.join(teqtl_all_dir, tshort, 'tejaas/raw_std/permnull_sb0.1_knn30/chr1/rr.txt')
rr_file_eur = os.path.join(teqtl_eur_dir, tshort, 'tejaas/raw_std/permnull_sb0.1_knn30/chr1/rr.txt')
allvarids, allchrms, allpvals, allbppos = read_transeqtls(rr_file_all)
eurvarids, eurchrms, eurpvals, eurbppos = read_transeqtls(rr_file_eur)

Prepare plotting result

for i, varid in enumerate(allvarids):
    if varid in eurvarids and allbppos[i] in chr1_fstmap:
        j = eurvarids.index(varid)
        eur_res.append(CompareRes(pall=allpvals[i], peur=eurpvals[j], fst = chr1_fstmap[allbppos[i]]))
        if len(eur_res) > 200000: break

Plots

Compare $-\log_{10} (p)$ with and without non-EUR samples at $F_{ST} \le 0.1$ and $\ge 0.3$

#collapse-hide
fig = plt.figure(figsize = (12, 6))
ax = list([0, 0])
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

lowfst = [x for x in eur_res if x.fst <= 0.1]
highfst = [x for x in eur_res if x.fst >= 0.3]

for res, ax in zip([lowfst, highfst], [ax1, ax2]):
    k = min(20000, len(res))
    nchoose = np.random.choice(len(res), k, replace = False)
    xvals = [-np.log10(res[i].pall) for i in nchoose]
    yvals = [-np.log10(res[i].peur) for i in nchoose]

    ax.scatter(xvals, yvals, alpha = 0.05, facecolor = 'tomato', edgecolor = 'maroon', zorder = 1)
    ax.set_xlabel('All samples')
    ax.set_ylabel('Only EUR samples')
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_xlim([0, 5])
    ax.set_ylim([0, 5])
    ax.plot([0, 5], [0, 5], zorder = 0, ls = 'dashed', color = 'gray')
    
ax1.text(0.07, 0.93, '$F_{ST} < 0.1$', transform=ax1.transAxes, va='top', ha = 'left')
ax2.text(0.07, 0.93, '$F_{ST} > 0.3$', transform=ax2.transAxes, va='top', ha = 'left')


plt.tight_layout()
plt.show()

Find correlation between $P$-values at different $F_{ST}$

#collapse-hide

from scipy.stats import pearsonr

nbin = 10
bins = np.linspace(0, 1, nbin + 1)
xvals = [(bins[i+1] + bins[i]) / 2 for i in range(nbin)]
yvals = [0 for i in range(nbin)]
size = [0 for i in range(nbin)]
#rmse = [0 for i in range(nbin)]
for i in range(nbin):
    res = [x for x in eur_res if x.fst < bins[i+1] and x.fst >= bins[i]]
    if len(res) > 100:
        k = min(30000, len(res))
        nchoose = np.random.choice(len(res), k, replace = False)
        data1 = np.array([-np.log10(res[i].pall) for i in nchoose])
        data2 = np.array([-np.log10(res[i].peur) for i in nchoose])
        #data1 = np.array([-np.log10(res[i].pall) for i in nchoose]).reshape(-1, 1)
        #data2 = np.array([-np.log10(res[i].peur) for i in nchoose]).reshape(-1, 1)
        #linreg = LinearRegression(fit_intercept=False, normalize=False, copy_X=True)
        #linreg.fit(data1, data2)
        #y_pred = linreg.predict(data1)
        #mse = np.sum((y_pred - data2)**2)
        #yvals[i] = linreg.coef_[0][0]
        yvals[i] = np.sum((data1 - data2)**2) / k
        size[i] = k
        #rmse[i] = 1#mse / k
    else:
        yvals[i] = np.nan

RMSD between Tejaas' $P$-values from all samples and Tejaas' $P$-values from only EUR samples at different local ancestry

Size is proportional to number of SNPs in the $F_{ST}$ window

#collapse-hide
msize = [min(12800, k) / 16 for k in size]
fig = plt.figure(figsize = (6, 6))
ax1 = fig.add_subplot(111)
ax1.scatter(xvals, yvals, s = msize)
ax1.set_xlabel('$F_{ST}$')
ax1.set_ylabel('RMSD of P-values (all vs EUR)')
ax1.set_xlim([-0.05, 0.9])
#ax1.set_ylim([0, 1])
#ax1.colorbar()
plt.show()