# Column names for the variants metadata table; read_csv assigns them
# positionally to the tab-separated fields.
variant_colnames = [
    'variant', 'chr', 'pos', 'ref', 'alt', 'rsid', 'varid',
    'consequence', 'consequence_category', 'info', 'call_rate',
    'AC', 'AF', 'minor_allele', 'minor_AF', 'p_hwe',
    'n_called', 'n_not_called', 'n_hom_ref', 'n_het', 'n_hom_var', 'n_non_ref',
    'r_heterozygosity', 'r_het_hom_var', 'r_expected_het_frequency',
]

# NOTE(review): with names= and no header=, pandas treats the first line of
# the file as a data row -- confirm that variants_metafile has no header line.
variant_df = pd.read_csv(
    variants_metafile,
    sep='\t',
    names=variant_colnames,
)
def count_nucleotides(row):
    """Return the combined length of the 'ref' and 'alt' alleles of a row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose 'ref' and
            'alt' entries are sized values such as allele strings.

    Returns:
        len(row['ref']) + len(row['alt']).

    Raises:
        TypeError: If either entry has no length (e.g. a missing value
            that pandas parsed as a float NaN).
    """
    return len(row['ref']) + len(row['alt'])
# Combined ref + alt allele length per variant, computed with the vectorized
# string accessor rather than a Python-level call per row. A missing allele
# yields NaN here (which fails the == 2 test below) instead of raising.
variant_df['ntcount'] = variant_df['ref'].str.len() + variant_df['alt'].str.len()

# Keep variants whose ref and alt alleles total exactly two characters,
# with minor_AF >= 0.05 and info >= 0.8; the helper column is dropped from
# the filtered result (it remains on variant_df, as before).
variant_df_filtered = variant_df[
    (variant_df['ntcount'] == 2)
    & (variant_df['minor_AF'] >= 0.05)
    & (variant_df['info'] >= 0.8)
].drop(columns=['ntcount'])