Removing background by PCA
Required for removing covariates in the distance matrix
Some references
Background removal with robust PCA
A nice explanation of PCA
#collapse-hide
expression_file = '/scratch/sbanerj/trans-eqtl/input/gtex_v8/expression/gtex_ms_raw_std_protein_coding_lncRNA.txt'
#collapse-hide
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy import stats
import os
from scipy.cluster import hierarchy as hc
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.axes_grid1 import make_axes_locatable
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 300)
#collapse-hide
def read_gtex(filename): # returns N x G gene expression
expr_list = list()
donor_list = list()
gene_list = list()
with open(filename) as mfile:
donor_list = mfile.readline().strip().split("\t")[1:]
for line in mfile:
linesplit = line.strip().split("\t")
gene = linesplit[0].strip()
gene_list.append(gene)
expr = np.array([float(x) for x in linesplit[1:]])
expr_list.append(expr)
expr = np.transpose(np.array(expr_list))
return expr, donor_list, gene_list
def get_pca(x, K):
pca = PCA(n_components=K)
pca.fit(x) # requires N x P (n_samples, n_features)
x_pca = pca.transform(x)
return x_pca
def get_distance(a, b):
return np.linalg.norm(a - b)
def distance_matrix(x_pca):
nsample = x_pca.shape[0]
distance_matrix = np.zeros((nsample, nsample))
for i in range(nsample):
for j in range(i+1, nsample):
dist = get_distance(x_pca[i,:], x_pca[j,:])
distance_matrix[i, j] = dist
distance_matrix[j, i] = dist
return distance_matrix
def pheatmap(ax, Xorig, title, norm = None):
'''
provide norm, if required
where,
norm = matplotlib.colors.DivergingNorm(vmin=0., vcenter=90., vmax=300.)
'''
# the zero distance between the same samples
# is bad for the color scale.
X = Xorig.copy()
X[np.diag_indices(X.shape[0])] = np.nan
cmap = plt.get_cmap("YlGnBu")
cmap.set_bad('w')
if norm is not None:
im1 = ax.imshow(X, cmap = cmap, norm = norm, interpolation='nearest')
else:
im1 = ax.imshow(X, cmap = cmap, interpolation='nearest')
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im1, cax=cax, fraction = 0.1)
ax.set_title(title, pad = 20)
return
#collapse-hide
gx, gxsamples, _ = read_gtex(expression_file)
#collapse-hide
dm_gx = distance_matrix(get_pca(gx, 30))
o1 = hc.leaves_list(hc.linkage(dm_gx, method = 'centroid'))
mgx = dm_gx[o1, :][:, o1]
np.linalg.eig
returns
- w : (M,) array. The eigenvalues, each repeated according to its multiplicity. The eigenvalues are not necessarily ordered.
- v : (M, M) array. The normalized (unit “length”) eigenvectors, such that the column v[:,i] is the eigenvector corresponding to the eigenvalue w[i].
np.linalg.svd (a, full_matrices = False)
returns
- u : (M, K) array. Unitary array(s).
- s : (K, ) array. Vector(s) with the singular values, within each vector sorted in descending order.
- vt : (K, N) array. Unitary array(s).
#collapse-hide
def remove_nfirst_pcs_svd(X, n=1):
'''
Using the svd module of numpy.linalg
'''
#mu = np.mean(X, axis = 0)
#Xnorm = X - mu
#U, S, Vt = np.linalg.svd(Xnorm, full_matrices=False)
U, S, Vt = np.linalg.svd(X, full_matrices=False)
Xhat = U[:, n:] @ np.diag(S[n:]) @ Vt[n:, :]
#Xhat += mu
return Xhat
def remove_nfirst_pcs_eig(X, n=1):
'''
Using the eig module of numpy.linalg
'''
#mu = np.mean(X, axis = 0)
#Xnorm = X - mu
#w, v = np.linalg.eig(Xnorm)
w, v = np.linalg.eig(X)
iord = np.argsort(w)[::-1]
w = w[iord]
v = v[:, iord]
Xhat = v[:, n:] @ np.diag(w[n:]) @ v[:, n:].T
#Xhat += mu
return Xhat
def remove_nfirst_pcs_skl(X, n = 1):
'''
Using the PCA module of sklearn
'''
nsamples = X.shape[0]
nfeatures = X.shape[1]
ncomp = min(nsamples, nfeatures)
pca = PCA(n_components = ncomp)
pca.fit(X)
Xpcs = pca.transform(X)
Xeig = pca.components_
mu = np.mean(X, axis=0)
Xhat = np.dot(Xpcs[:, n:], Xeig[n:, :])
Xhat += mu
return Xhat
#collapse-show
mgx_n0_eig = remove_nfirst_pcs_eig(mgx, n=0)
mgx_n1_eig = remove_nfirst_pcs_eig(mgx, n=1)
mgx_n0_svd = remove_nfirst_pcs_svd(mgx, n=0)
mgx_n1_svd = remove_nfirst_pcs_svd(mgx, n=1)
mgx_n0_skl = remove_nfirst_pcs_skl(mgx, n=0)
mgx_n1_skl = remove_nfirst_pcs_skl(mgx, n=1)
norm = matplotlib.colors.DivergingNorm(vmin=0., vcenter=75., vmax=300.)
#collapse-show
fig = plt.figure(figsize = (18, 6))
ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)
pheatmap(ax1, mgx, "Original", norm = norm)
pheatmap(ax2, mgx_n0_eig, "Reconstruction", norm = norm)
pheatmap(ax3, mgx_n1_eig, "Removed first PC", norm = norm)
fig.suptitle("Using np.linalg.eig")
plt.tight_layout()
plt.show()
#collapse-show
fig = plt.figure(figsize = (18, 6))
ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)
pheatmap(ax1, mgx, "Original", norm = norm)
pheatmap(ax2, mgx_n0_svd, "Reconstruction", norm = norm)
pheatmap(ax3, mgx_n1_svd, "Removed first PC", norm = norm)
plt.tight_layout()
plt.show()
#collapse-show
fig = plt.figure(figsize = (18, 6))
ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)
pheatmap(ax1, mgx, "Original", norm = norm)
pheatmap(ax2, mgx_n0_skl, "Reconstruction", norm = norm)
pheatmap(ax3, mgx_n1_skl, "Removed first PC", norm = norm)
plt.tight_layout()
plt.show()
np.allclose(mgx_n1_svd, mgx_n1_eig)