Tejaas P-value and population admixture
Path to all data files
#collapse-hide
teqtl_all_dir = '/scratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_202003/'
teqtl_eur_dir = '/cbscratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_EUR_202006'
teqtl_amx_dir = '/cbscratch/sbanerj/trans-eqtl/dev-pipeline/gtex_v8_AMX_202006'
chr1_fst_file = '/scratch/sbanerj/trans-eqtl/fst_from_1kg/fst/EUR-AFR_chr1.weir.fst'
tissue_file = 'gtex/tissue_table.txt'
json_file = 'gtex/gtex_v8_metadata.json'
nsample_file = "gtex/tissue_nsamples.txt"
Python libraries
#collapse-hide
import os
import numpy as np
import json
import collections
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 72)
from utils import utils
Read metadata for our trans-eQTLs
#collapse-hide
tshorts, tfulls, tstrings = utils.read_tissues(tissue_file)
with open(json_file) as instream:
gtex_meta = json.load(instream)
tissue_colors = dict()
tissue_names = dict()
tissue_nsamples = dict()
for tshort, tfull, tstring in zip(tshorts, tfulls, tstrings):
if tshort in tshorts:
tissue_names[tshort] = tstring
tissue_colors[tshort] = "#" + gtex_meta[tfull]["colorHex"]
tissue_nsamples = dict()
with open(nsample_file, 'r') as instream:
for line in instream:
tshort = line.strip().split()[0].strip()
tissue_nsamples[tshort] = int(line.strip().split()[1].strip())
brain_tissues = ['bam', 'ban', 'bca', 'bceh', 'bce', 'bco', 'bfr', 'bhi', 'bhy', 'bnu', 'bpu', 'bsp', 'bsu']
altsb_tissues = ['haa', 'pan', 'spl', 'wb']
Define some functions to read results
#collapse-hide
def read_transeqtls(filename):
idcol = 0
chrcol = 1
bpcol = 2
pvcol = 7
rsidlist = list()
chrmlist = list()
bplist = list()
pvallist = list()
with open(filename, 'r') as instream:
next(instream)
for line in instream:
linesplit = line.strip().split()
rsid = linesplit[idcol]
chrm = int(linesplit[chrcol])
bppos = int(linesplit[bpcol])
pval = float(linesplit[pvcol])
rsidlist.append(rsid)
chrmlist.append(chrm)
bplist.append(bppos)
pvallist.append(pval)
return rsidlist, chrmlist, pvallist, bplist
def get_fstmap(fst_file, headerline = 0):
fstmap = dict()
for c in range(1,23):
fstmap[f'chr{c}'] = {}
with open(fst_file, 'r') as instream:
for line in instream:
linesplit = line.strip().split()
chrm = int(linesplit[0])
bppos = int(linesplit[1])
fst = float(linesplit[2])
if np.isnan(fst): fst = 0
fstmap[f'chr{chrm}'][bppos] = fst
return fstmap
Read the results for trans-eQTLs
#collapse-hide
tshort = 'as'
RES_FIELDS = ['pall', 'peur', 'fst']
class CompareRes(collections.namedtuple('_CompareRes', RES_FIELDS)):
__slots__ = ()
chr1_fstmap = get_fstmap(chr1_fst_file)['chr1']
eur_res = list()
rr_file_all = os.path.join(teqtl_all_dir, tshort, 'tejaas/raw_std/permnull_sb0.1_knn30/chr1/rr.txt')
rr_file_eur = os.path.join(teqtl_eur_dir, tshort, 'tejaas/raw_std/permnull_sb0.1_knn30/chr1/rr.txt')
allvarids, allchrms, allpvals, allbppos = read_transeqtls(rr_file_all)
eurvarids, eurchrms, eurpvals, eurbppos = read_transeqtls(rr_file_eur)
Prepare plotting result
for i, varid in enumerate(allvarids):
if varid in eurvarids and allbppos[i] in chr1_fstmap:
j = eurvarids.index(varid)
eur_res.append(CompareRes(pall=allpvals[i], peur=eurpvals[j], fst = chr1_fstmap[allbppos[i]]))
if len(eur_res) > 200000: break
#collapse-hide
fig = plt.figure(figsize = (12, 6))
ax = list([0, 0])
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
lowfst = [x for x in eur_res if x.fst <= 0.1]
highfst = [x for x in eur_res if x.fst >= 0.3]
for res, ax in zip([lowfst, highfst], [ax1, ax2]):
k = min(20000, len(res))
nchoose = np.random.choice(len(res), k, replace = False)
xvals = [-np.log10(res[i].pall) for i in nchoose]
yvals = [-np.log10(res[i].peur) for i in nchoose]
ax.scatter(xvals, yvals, alpha = 0.05, facecolor = 'tomato', edgecolor = 'maroon', zorder = 1)
ax.set_xlabel('All samples')
ax.set_ylabel('Only EUR samples')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_xlim([0, 5])
ax.set_ylim([0, 5])
ax.plot([0, 5], [0, 5], zorder = 0, ls = 'dashed', color = 'gray')
ax1.text(0.07, 0.93, '$F_{ST} < 0.1$', transform=ax1.transAxes, va='top', ha = 'left')
ax2.text(0.07, 0.93, '$F_{ST} > 0.3$', transform=ax2.transAxes, va='top', ha = 'left')
plt.tight_layout()
plt.show()
Find correlation between $P$-values at different $F_{ST}$
#collapse-hide
from scipy.stats import pearsonr
nbin = 10
bins = np.linspace(0, 1, nbin + 1)
xvals = [(bins[i+1] + bins[i]) / 2 for i in range(nbin)]
yvals = [0 for i in range(nbin)]
size = [0 for i in range(nbin)]
#rmse = [0 for i in range(nbin)]
for i in range(nbin):
res = [x for x in eur_res if x.fst < bins[i+1] and x.fst >= bins[i]]
if len(res) > 100:
k = min(30000, len(res))
nchoose = np.random.choice(len(res), k, replace = False)
data1 = np.array([-np.log10(res[i].pall) for i in nchoose])
data2 = np.array([-np.log10(res[i].peur) for i in nchoose])
#data1 = np.array([-np.log10(res[i].pall) for i in nchoose]).reshape(-1, 1)
#data2 = np.array([-np.log10(res[i].peur) for i in nchoose]).reshape(-1, 1)
#linreg = LinearRegression(fit_intercept=False, normalize=False, copy_X=True)
#linreg.fit(data1, data2)
#y_pred = linreg.predict(data1)
#mse = np.sum((y_pred - data2)**2)
#yvals[i] = linreg.coef_[0][0]
yvals[i] = np.sum((data1 - data2)**2) / k
size[i] = k
#rmse[i] = 1#mse / k
else:
yvals[i] = np.nan
- RMSD between Tejaas' $P$-values from all samples and Tejaas' $P$-values from only EUR samples at different local ancestry
Size is proportional to number of SNPs in the $F_{ST}$ window
#collapse-hide
msize = [min(12800, k) / 16 for k in size]
fig = plt.figure(figsize = (6, 6))
ax1 = fig.add_subplot(111)
ax1.scatter(xvals, yvals, s = msize)
ax1.set_xlabel('$F_{ST}$')
ax1.set_ylabel('RMSD of P-values (all vs EUR)')
ax1.set_xlim([-0.05, 0.9])
#ax1.set_ylim([0, 1])
#ax1.colorbar()
plt.show()