Options for finding neighbors
What's the best way to find neighbors from the distance matrix?
- Setup
- 1. Find neighbors after removing first PC from distance matrix
- 2. Find neighbors after removing first PC from log(distance matrix)
- 3. Calculate neigbors afer centering / scaling distance matrix
- 4. Apply KNN on quantile normalized gene expression
- 5. Calculate neighbors from quantile normalized gene expression and apply on raw data
Yesterday, we found that some samples are still far from others after KNN correction. Why do we see this pattern? In this notebook, I have explored different options of calculating neighbors.
Important: I modified the KNN function. Earlier, the distance matrix was always positive and sorting the samples by distance had the self index at the beginning (distance = 0).
Data location
#collapse-hide
dosagefile = '/cbscratch/sbanerj/gtex_pca/gtex_v8_filtered.dosage.raw'
dosage_numpy_file = '/cbscratch/sbanerj/gtex_pca/gtex_dosage.npy'
expression_file = '/scratch/sbanerj/trans-eqtl/input/gtex_v8/expression/gtex_ms_raw_std_protein_coding_lncRNA.txt'
Python libraries
#collapse-hide
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy import stats
import os
from scipy.cluster import hierarchy as hc
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.axes_grid1 import make_axes_locatable
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 300)
Read input data
#collapse-hide
def read_gtex(filename): # returns N x G gene expression
expr_list = list()
donor_list = list()
gene_list = list()
with open(filename) as mfile:
donor_list = mfile.readline().strip().split("\t")[1:]
for line in mfile:
linesplit = line.strip().split("\t")
gene = linesplit[0].strip()
gene_list.append(gene)
expr = np.array([float(x) for x in linesplit[1:]])
expr_list.append(expr)
expr = np.transpose(np.array(expr_list))
return expr, donor_list, gene_list
def center_expression(Y):
'''
Y is N x G
here we center the columns, the mean of the columns (genes) are subtracted
'''
Ycent = Y - np.mean(Y, axis = 0)
return Ycent
def center_genotype(X):
'''
X is N x I
here we center the columns, the mean of the columns (SNPs) are subtracted
'''
return X - np.mean(X, axis = 0).reshape(1, -1)
if not os.path.isfile(dosage_numpy_file):
dosage = np.loadtxt(dosagefile, delimiter=' ', skiprows=1, usecols=range(6, 97612))
np.save(dosage_numpy_file, dosage)
else:
dosage = np.load(dosage_numpy_file)
gtsamples = list()
with open (dosagefile, 'r') as infile:
next(infile)
for line in infile:
gtsamples.append(line.strip().split()[1])
gx, gxsamples, _ = read_gtex(expression_file)
gx = center_expression(gx)
sampleidx = [gtsamples.index(x) for x in gxsamples] # assumes all expression samples have genotype
dreduce = dosage[sampleidx, :]
gt = center_genotype(dreduce) #dreduce - np.mean(dreduce, axis = 0).reshape(1, -1)
print(f'{len(sampleidx)} samples, {gx.shape[1]} genes, {gt.shape[1]} SNPs.')
print(f'Centered and normalized genotype and expression. Samples in same order as `gxsamples`')
#collapse-hide
def get_pca(x, K):
pca = PCA(n_components=K)
pca.fit(x) # requires N x P (n_samples, n_features)
x_pca = pca.transform(x)
return x_pca
def get_distance(a, b):
return np.linalg.norm(a - b)
def distance_matrix(x_pca):
nsample = x_pca.shape[0]
distance_matrix = np.zeros((nsample, nsample))
for i in range(nsample):
for j in range(i+1, nsample):
dist = get_distance(x_pca[i,:], x_pca[j,:])
distance_matrix[i, j] = dist
distance_matrix[j, i] = dist
return distance_matrix
def map_distance_matrix(dm, samples, target_samples):
N = len(target_samples)
newdm = np.zeros((N, N))
newdm[:] = np.nan
for i in range(N):
if target_samples[i] in samples:
newdm[i, i] = 0 # diagonal is always zero
iold = samples.index(target_samples[i])
for j in range(i+1, N):
if target_samples[j] in samples:
jold = samples.index(target_samples[j])
newdm[i, j] = dm[iold, jold]
newdm[j, i] = dm[jold, iold]
return newdm
def knn(gx, gt, dm, K, center = True):
assert (gx.shape[0] == gt.shape[0])
N = gx.shape[0]
gx_knn = np.zeros_like(gx)
gt_knn = np.zeros_like(gt)
for i in range(N):
#neighbors = np.argsort(distance_matrix[i, :kneighbor + 1])
ordered_nbrs = np.argsort(dm[i, :])
self_idx = np.where(ordered_nbrs == i)[0][0]
neighbors = np.delete(ordered_nbrs, self_idx)[:K]
gx_knn[i, :] = gx[i, :] - np.mean(gx[neighbors, :], axis = 0)
gt_knn[:, i] = gt[:, i] - np.mean(gt[:, neighbors], axis = 1)
if center:
gx_knn -= np.mean(gx_knn, axis = 0)
gt_knn -= np.mean(gt_knn, axis = 0)
return gx_knn, gt_knn
def remove_nfirst_pcs(X, n=1):
Xnorm = X
U, S, Vt = np.linalg.svd(X, full_matrices=False)
Xhat = U[:, n:] @ np.diag(S[n:]) @ Vt[n:, :]
return Xhat
def center_scale_off_diagonal(X):
ioff = np.where(~np.eye(X.shape[0], dtype=bool))
X[ioff] = (X[ioff] - np.mean(X[ioff])) / np.std(X[ioff])
return X
def plot_distance_matrices(dmA, dmB, norms = None):
'''
provide norms, if required, as norms = (norm1, norm2)
where,
norm1 = matplotlib.colors.DivergingNorm(vmin=10., vcenter=90., vmax=170.)
norm2 = matplotlib.colors.DivergingNorm(vmin=0., vcenter=90., vmax=300.)
'''
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
# the zero distance between the same samples
# is bad for the color scale.
dmA[np.diag_indices(dmA.shape[0])] = np.nan
dmB[np.diag_indices(dmB.shape[0])] = np.nan
cmap1 = plt.get_cmap("YlOrRd")
cmap1.set_bad('w')
cmap2 = plt.get_cmap("YlGnBu")
cmap2.set_bad('w')
if norms is not None:
norm1 = norms[0]
norm2 = norms[1]
im1 = ax1.imshow(dmA, cmap = cmap1, norm = norm1, interpolation='nearest')
im2 = ax2.imshow(dmB, cmap = cmap2, norm = norm2, interpolation='nearest')
else:
im1 = ax1.imshow(dmA, cmap = cmap1, interpolation='nearest')
im2 = ax2.imshow(dmB, cmap = cmap2, interpolation='nearest')
divider = make_axes_locatable(ax1)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im1, cax=cax, fraction = 0.1)
divider = make_axes_locatable(ax2)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im2, cax=cax, fraction = 0.1)
ax1.set_title("Genotype space", pad = 20)
ax2.set_title("Expression space", pad = 20)
plt.tight_layout()
return fig
#collapse-show
# Before KNN
DT = 20 # reduced dimension of genotype
DX = 30 # reduced dimension of expression
dm_gt = distance_matrix(get_pca(gt, DT))
dm_gx = distance_matrix(get_pca(gx, DX))
dm_gx_corr = remove_nfirst_pcs(dm_gx, n=1)
# Single-KNN
K = 30
gx_knn, gt_knn = knn(gx, gt, dm_gx_corr, K)
dm_gt_knn = distance_matrix(get_pca(gt_knn, DT))
dm_gx_knn = distance_matrix(get_pca(gx_knn, DX))
I ordered the samples by their distance in the expression space
#collapse-hide
o2 = hc.leaves_list(hc.linkage(dm_gx_corr, method = 'centroid'))
Plot the distance matrices
#collapse-hide
mgt = dm_gt[o2, :][:, o2]
mgx = dm_gx_corr[o2, :][:, o2]
norm1 = matplotlib.colors.DivergingNorm(vmin=10., vcenter=90., vmax=170.)
norm2 = matplotlib.colors.DivergingNorm(vmin=-50, vcenter=40, vmax=100.)
norms = (norm1, norm2)
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Modified distance matrix before KNN")
plt.show()