Code
import os
import json
from pathlib import Path
import numpy as np
import pandas as pddatadir = "/Users/sbanerjee/Documents/work/clorinn_v2/input_data"
outsuffix = "v1_0"
outdir = Path(datadir) / "pgc_v1.0"
# Input files
zscore_file = Path(datadir) / "corrected_sign_zscore_df.csv"
noise_cov_file = Path(datadir) / "corrected_non_smooth_sigma.csv"
category_file = Path(datadir) / "category_dic.txt"
# Output files
def ensure_parent(path):
Path(path).parent.mkdir(parents=True, exist_ok=True)
zscore_outfile = Path(outdir) / f"zscore_{outsuffix}.csv"
noise_cov_outfile = Path(outdir) / f"sampling_covariance_{outsuffix}.csv"
category_outfile = Path(outdir) / f"trait_to_group_{outsuffix}.json"
ensure_parent(zscore_outfile)
ensure_parent(noise_cov_outfile)
ensure_parent(category_outfile)
# Read files
Z_df = pd.read_csv(zscore_file, header = 0, index_col=0).T
A_df = pd.read_csv(noise_cov_file, header = 0, index_col=0)
with open(category_file, "r") as f:
trait_to_group = json.load(f)Some trait names have changed between assembling Z-scores and sampling covariance. Here, we pull everything to a common scheme.
def normalize_trait_name(name):
rename_map = {
"Saxena": "Daytime_sleepiness",
}
if not isinstance(name, str):
return name
name = name.replace("-", "_")
name = rename_map.get(name, name)
return str(name)
Z_df.index = Z_df.index.map(normalize_trait_name)
A_df.index = A_df.index.map(normalize_trait_name)
A_df.columns = A_df.columns.map(normalize_trait_name)
trait_to_group = {
normalize_trait_name(k): v
for k, v in trait_to_group.items()
}traits = list(Z_df.index)
missing_rows = set(traits) - set(A_df.index)
missing_cols = set(traits) - set(A_df.columns)
if missing_rows or missing_cols:
raise ValueError(
"Noise covariance is missing traits from Z. "
f"Missing rows: {sorted(missing_rows)}; "
f"missing columns: {sorted(missing_cols)}."
)--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[4], line 5 3 missing_cols = set(traits) - set(A_df.columns) 4 if missing_rows or missing_cols: ----> 5 raise ValueError( 6 "Noise covariance is missing traits from Z. " 7 f"Missing rows: {sorted(missing_rows)}; " 8 f"missing columns: {sorted(missing_cols)}." 9 ) ValueError: Noise covariance is missing traits from Z. Missing rows: ['Lipoprotein_A_UKBB_EUR', 'daner_uniman', 'mdd_symptoms_2023_Comm_MDD5b_psychomotorSlow']; missing columns: ['Lipoprotein_A_UKBB_EUR', 'daner_uniman', 'mdd_symptoms_2023_Comm_MDD5b_psychomotorSlow'].
<class 'str'> 46504
Name: count, dtype: int64
Some traits produced NaN intercepts in LDSC and hence not included in the covariance. Drop them here, so we don’t have to align Z and covariance everytime they are called.
common = A_df.index.intersection(A_df.index).intersection(Z_df.index)
# reorder consistently
A_aligned = A_df.loc[common, common]
Z_aligned = Z_df.loc[common, :]
# convert to numpy
A = A_aligned.to_numpy()
Z = Z_aligned.to_numpy()
print (f"Number of common traits: {len(common)}")
print (f"Number of SNPs: {Z.shape[1]}")Number of common traits: 107
Number of SNPs: 46504
tol = 1e-8
is_symmetric = np.allclose(A, A.T, atol=tol, rtol=0)
if is_symmetric:
print("This matrix is symmetric.")
else:
print("This matrix is not symmetric.")
max_asymmetry = np.abs(A - A.T).max()
print("max_asymmetry:", max_asymmetry)
eigvals = np.linalg.eigvalsh((A + A.T) / 2)
is_pd = np.all(eigvals > 0)
if is_pd:
print ("This matrix is PD.")
else:
print ("This matrix is not PD.")
print (f"Minimum eigenvalue: {eigvals.min():g}")
print ("Zero / Negative eigenvalues:")
print(eigvals[eigvals <= 0])This matrix is symmetric.
This matrix is PD.