NNM loadings of all phenotypes except prescriptions.
tSVD loadings of all phenotypes except prescriptions.
Code
def get_umap_embedding(loadings, metric='cosine', n_neighbors=15, min_dist=0.4):
    """Fit UMAP on `loadings` and return the embedded coordinates.

    Parameters
    ----------
    loadings : input passed unchanged to ``umap.UMAP.fit_transform``.
    metric, n_neighbors, min_dist : forwarded to the ``umap.UMAP`` constructor.

    Returns
    -------
    Whatever ``fit_transform`` returns for `loadings`.
    """
    umap_settings = dict(n_neighbors=n_neighbors, metric=metric, min_dist=min_dist)
    return umap.UMAP(**umap_settings).fit_transform(loadings)


# Distance metrics for which every loading matrix is embedded.
umap_metrics = ['euclidean', 'cosine']


def _embed_for_each_metric(loadings):
    """Return ``{metric: UMAP embedding of loadings}`` for all `umap_metrics`."""
    per_metric = {}
    for name in umap_metrics:
        per_metric[name] = get_umap_embedding(loadings, metric=name)
    return per_metric


# One embedding per metric for each loading matrix. The order of these calls
# matches the original cell (NNM, tSVD, then the no-prescription variants).
nnm_embedding = _embed_for_each_metric(nnm_loadings)
tsvd_embedding = _embed_for_each_metric(tsvd_loadings)
nnm_embedding_noRx = _embed_for_each_metric(nnm_loadings_noRx)
tsvd_embedding_noRx = _embed_for_each_metric(tsvd_loadings_noRx)
Interactive Plots
Each point is a PanUKB phenotype, colored according to the clusters identified by LLM models from the trait descriptions. The opacity of each point is proportional to the estimated heritability of the trait, up to a fixed maximum.
Code
def get_bokeh_plot(embedding, trait_labels, trait_df, plot_title, color_palette=hex_colors_40, alpha_factor=10):
    """Build an interactive Bokeh scatter plot of a 2-column embedding.

    Each row of `embedding` becomes one point. Points are colored by their
    label in `trait_labels`, and their fill/line opacity scales with the
    observed heritability stored in `trait_df`.

    Parameters
    ----------
    embedding : 2-D array; column 0 is used as x and column 1 as y.
    trait_labels : sequence of cluster labels, one per point (converted to
        strings for the categorical color mapping).
        NOTE(review): presumably ordered like the rows of `embedding` and
        `trait_df` -- confirm against the caller.
    trait_df : DataFrame providing the columns
        'estimates.final.h2_observed', 'short_description' and 'Neff';
        its index is shown in the hover text.
    plot_title : title of the figure.
    color_palette : palette handed to ``CategoricalColorMapper``.
    alpha_factor : multiplier converting heritability into opacity.

    Returns
    -------
    The configured Bokeh figure.
    """
    h2_column = 'estimates.final.h2_observed'

    # Extract the heritability once; missing values get a tiny positive
    # placeholder so that the corresponding points are almost invisible.
    h2_filled = trait_df[h2_column].fillna(1e-6).tolist()

    def _scaled_alpha(h2, scale, cap):
        # Clamp to [0, cap]: a negative estimate would otherwise produce a
        # negative (invalid) alpha value.
        return min(cap, max(0.0, scale * h2))

    # Hover text "<index> | <description> | <h2> | <Neff>". Iterating the
    # columns directly avoids one `.loc` lookup per cell and does not break
    # when the index contains duplicates.
    hover_text = [
        f"{idx} | {desc} | {h2:.3f} | {neff:.2f}"
        for idx, desc, h2, neff in zip(
            trait_df.index,
            trait_df['short_description'],
            trait_df[h2_column],
            trait_df['Neff'],
        )
    ]

    plot_dict = dict(
        x=embedding[:, 0],
        y=embedding[:, 1],
        trait_type_code=[f"{x}" for x in trait_labels],
        h2_fill_alpha=[_scaled_alpha(h2, alpha_factor, 0.6) for h2 in h2_filled],
        h2_line_alpha=[_scaled_alpha(h2, 1.3 * alpha_factor, 0.8) for h2 in h2_filled],
        fulldesc=hover_text,
    )

    color_mapping = CategoricalColorMapper(
        factors=[f"{x}" for x in np.unique(trait_labels)],
        palette=color_palette,
    )

    plot_tooltips = [
        ("Desc", "@fulldesc"),
    ]

    ax = bokeh_figure(
        width=800,
        height=800,
        tooltips=plot_tooltips,
        title=plot_title,
    )

    # `scatter` draws circle markers by default; `circle(size=...)` is
    # deprecated in recent Bokeh releases.
    ax.scatter(
        'x', 'y',
        size=10,
        source=ColumnDataSource(plot_dict),
        color=dict(field='trait_type_code', transform=color_mapping),
        line_alpha=dict(field='h2_line_alpha'),
        fill_alpha=dict(field='h2_fill_alpha'),
    )

    ax.title.text_font_size = '16pt'
    ax.title.text_font_style = 'normal'
    ax.title.text_font = 'tahoma'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax
Code
# Work on a copy of the shared palette so that the original stays untouched,
# and override its first color.
bokeh_colors = hex_colors_40.copy()
bokeh_colors[0] = "#3c3c3c0d"

# LLM embedding model and clustering type whose labels color the points.
llm_method = "ls-da3m0ns/bge_large_medical"
llm_ctype = "community"

# Opacity multiplier passed to get_bokeh_plot; depends on the clustering type.
if llm_ctype == "kmeans":
    alpha_factor = 10
else:
    alpha_factor = 100

labels = get_llm_cluster_labels(np.array(trait_df.index), llm_method, llm_ctype)

# One NNM plot per UMAP metric.
axlist = [
    get_bokeh_plot(
        nnm_embedding[umap_metric],
        labels,
        trait_df,
        f"NNM, {umap_metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for umap_metric in umap_metrics
]

# Stack the plots vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
Code
# One tSVD plot per UMAP metric, using the same labels, palette and opacity
# multiplier as defined earlier in the file.
axlist = [
    get_bokeh_plot(
        tsvd_embedding[umap_metric],
        labels,
        trait_df,
        f"tSVD, {umap_metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for umap_metric in umap_metrics
]

# Stack the plots vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
(a) UMAP embeddings of the tSVD loadings.
(b)
Figure 2
Code
labels = get_llm_cluster_labels(np.array(trait_df.index), llm_method, llm_ctype)

# Drop the prescription traits and pick the matching cluster labels.
# NOTE(review): this indexes `labels` with the index values of trait_df --
# assumes those values are valid positions/keys in `labels`; confirm.
trait_df_noRx = trait_df.query('trait_type != "prescriptions"')
labels_noRx = [labels[trait_id] for trait_id in trait_df_noRx.index]

# One NNM plot (prescriptions excluded) per UMAP metric.
axlist = [
    get_bokeh_plot(
        nnm_embedding_noRx[umap_metric],
        labels_noRx,
        trait_df_noRx,
        f"NNM, {umap_metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for umap_metric in umap_metrics
]

# Stack the plots vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
(a) UMAP embeddings of the NNM loadings except prescriptions.
(b)
Figure 3
Code
# One tSVD plot (prescriptions excluded) per UMAP metric.
axlist = [
    get_bokeh_plot(
        tsvd_embedding_noRx[umap_metric],
        labels_noRx,
        trait_df_noRx,
        f"tSVD, {umap_metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for umap_metric in umap_metrics
]

# Stack the plots vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
(a) UMAP embeddings of the tSVD loadings except prescriptions.