Map trait names

Code
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
Code
datadir = "/Users/sbanerjee/Documents/work/clorinn_v2/input_data"
outsuffix = "v1_0"
outdir = Path(datadir) / "pgc_v1.0"

# Input files
zscore_file    = Path(datadir) / "corrected_sign_zscore_df.csv"
noise_cov_file = Path(datadir) / "corrected_non_smooth_sigma.csv"
category_file  = Path(datadir) / "category_dic.txt"

# Output files
def ensure_parent(path):
    Path(path).parent.mkdir(parents=True, exist_ok=True)

zscore_outfile    = Path(outdir) / f"zscore_{outsuffix}.csv"
noise_cov_outfile = Path(outdir) / f"sampling_covariance_{outsuffix}.csv"
category_outfile  = Path(outdir) / f"trait_to_group_{outsuffix}.json"
ensure_parent(zscore_outfile)
ensure_parent(noise_cov_outfile)
ensure_parent(category_outfile)

# Read files
Z_df = pd.read_csv(zscore_file, header = 0, index_col=0).T
A_df = pd.read_csv(noise_cov_file, header = 0, index_col=0)
with open(category_file, "r") as f:
    trait_to_group = json.load(f)

Some trait names have changed between assembling Z-scores and sampling covariance. Here, we pull everything to a common scheme.

Code
def normalize_trait_name(name):
    rename_map = {
        "Saxena": "Daytime_sleepiness",
    }
    if not isinstance(name, str):
        return name
    name = name.replace("-", "_")
    name = rename_map.get(name, name)
    return str(name)

Z_df.index = Z_df.index.map(normalize_trait_name)
A_df.index = A_df.index.map(normalize_trait_name)
A_df.columns = A_df.columns.map(normalize_trait_name)
trait_to_group = {
    normalize_trait_name(k): v
    for k, v in trait_to_group.items()
}
Code
traits = list(Z_df.index)
missing_rows = set(traits) - set(A_df.index)
missing_cols = set(traits) - set(A_df.columns)
if missing_rows or missing_cols:
    raise ValueError(
        "Noise covariance is missing traits from Z. "
        f"Missing rows: {sorted(missing_rows)}; "
        f"missing columns: {sorted(missing_cols)}."
    )
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[4], line 5
      3 missing_cols = set(traits) - set(A_df.columns)
      4 if missing_rows or missing_cols:
----> 5     raise ValueError(
      6         "Noise covariance is missing traits from Z. "
      7         f"Missing rows: {sorted(missing_rows)}; "
      8         f"missing columns: {sorted(missing_cols)}."
      9     )

ValueError: Noise covariance is missing traits from Z. Missing rows: ['Lipoprotein_A_UKBB_EUR', 'daner_uniman', 'mdd_symptoms_2023_Comm_MDD5b_psychomotorSlow']; missing columns: ['Lipoprotein_A_UKBB_EUR', 'daner_uniman', 'mdd_symptoms_2023_Comm_MDD5b_psychomotorSlow'].
Code
traits = list(A_df.index)
missing_rows = set(traits) - set(Z_df.index)
if missing_rows:
    raise ValueError(
        "Z is missing traits from noise covariance. "
        f"Missing rows: {sorted(missing_rows)}. "
    )
Code
# Column-name label types
print(Z_df.columns.map(type).value_counts())
<class 'str'>    46504
Name: count, dtype: int64

Drop traits

Some traits produced NaN intercepts in LDSC and hence not included in the covariance. Drop them here, so we don’t have to align Z and covariance everytime they are called.

Code
traits_to_drop = [
    # NaN intercepts from LDSC
    # and hence not included in the covariance
    "mdd_symptoms_2023_Comm_MDD5b_psychomotorSlow", 
    "Lipoprotein_A_UKBB_EUR", 
    "daner_uniman",
]

Z_df = Z_df.drop(index = traits_to_drop)
Code
common = A_df.index.intersection(A_df.index).intersection(Z_df.index)

# reorder consistently
A_aligned = A_df.loc[common, common]
Z_aligned = Z_df.loc[common, :]

# convert to numpy
A = A_aligned.to_numpy()
Z = Z_aligned.to_numpy()

print (f"Number of common traits: {len(common)}")
print (f"Number of SNPs: {Z.shape[1]}")
Number of common traits: 107
Number of SNPs: 46504
Code
tol = 1e-8
is_symmetric = np.allclose(A, A.T, atol=tol, rtol=0)
if is_symmetric:
    print("This matrix is symmetric.")
else:
    print("This matrix is not symmetric.")
    max_asymmetry = np.abs(A - A.T).max()
    print("max_asymmetry:", max_asymmetry)

eigvals = np.linalg.eigvalsh((A + A.T) / 2)
is_pd = np.all(eigvals > 0)

if is_pd:
    print ("This matrix is PD.")
else:
    print ("This matrix is not PD.")
    print (f"Minimum eigenvalue: {eigvals.min():g}")
    print ("Zero / Negative eigenvalues:")
    print(eigvals[eigvals <= 0])
This matrix is symmetric.
This matrix is PD.
Code
Z_aligned.to_csv(zscore_outfile)
A_aligned.to_csv(noise_cov_outfile)

with open(category_outfile, "w") as f:
    json.dump(trait_to_group, f, indent=4, sort_keys=True)