NNM loadings of all phenotypes except prescriptions.
tSVD loadings of all phenotypes except prescriptions.
def get_umap_embedding(loadings, metric='cosine', n_neighbors=15, min_dist=0.4):
    """Fit a UMAP reducer on `loadings` and return the embedded coordinates.

    Parameters
    ----------
    loadings
        Data handed unchanged to ``umap.UMAP.fit_transform``.
    metric, n_neighbors, min_dist
        Forwarded as-is to the ``umap.UMAP`` constructor.

    Returns
    -------
    Whatever ``fit_transform`` returns for `loadings`.
    """
    umap_options = dict(n_neighbors=n_neighbors, metric=metric, min_dist=min_dist)
    return umap.UMAP(**umap_options).fit_transform(loadings)


umap_metrics = ['euclidean', 'cosine']


def _embed_for_each_metric(loadings):
    """Return ``{metric: embedding}`` of `loadings` for every entry of `umap_metrics`."""
    per_metric = {}
    for metric_name in umap_metrics:
        per_metric[metric_name] = get_umap_embedding(loadings, metric=metric_name)
    return per_metric


# Same order of UMAP fits as before: NNM, tSVD, then the two
# no-prescription ("noRx") loading matrices.
nnm_embedding = _embed_for_each_metric(nnm_loadings)
tsvd_embedding = _embed_for_each_metric(tsvd_loadings)
nnm_embedding_noRx = _embed_for_each_metric(nnm_loadings_noRx)
tsvd_embedding_noRx = _embed_for_each_metric(tsvd_loadings_noRx)
Interactive Plots
Each point is a PanUKB phenotype, colored according to the clusters identified by large language models (LLMs) from the trait descriptions. The opacity of each point is proportional to the estimated heritability of the trait, up to a fixed maximum.
def get_bokeh_plot(embedding, trait_labels, trait_df, plot_title,
                   color_palette=hex_colors_40, alpha_factor=10):
    """Build an interactive Bokeh scatter plot of a 2-D embedding.

    Each row of `embedding` becomes one circle, colored by its entry in
    `trait_labels` and made more or less opaque according to the
    ``estimates.final.h2_observed`` column of `trait_df`.

    Parameters
    ----------
    embedding
        Array indexed as ``embedding[:, 0]`` (x) and ``embedding[:, 1]`` (y).
    trait_labels
        One label per point; labels are converted to strings and used as
        categorical color factors.
    trait_df
        DataFrame providing the columns ``estimates.final.h2_observed``,
        ``short_description`` and ``Neff``.
        NOTE(review): rows are assumed to be in the same order as
        `embedding` and `trait_labels` -- confirm at the call site.
    plot_title
        Title of the figure.
    color_palette
        Palette passed to ``CategoricalColorMapper``.
    alpha_factor
        Multiplier applied to the heritability before it is used as alpha.

    Returns
    -------
    The figure created by ``bokeh_figure``.
    """
    h2_observed = trait_df['estimates.final.h2_observed']
    # Missing heritabilities get a tiny positive value so the point is
    # drawn almost transparent; computed once and reused for both alphas.
    h2_filled = h2_observed.fillna(1e-6).tolist()

    def _scaled_alpha(scale, upper):
        # Clamp to [0, upper]: a negative heritability estimate would
        # otherwise yield a negative (invalid) alpha value.
        return [max(0.0, min(upper, scale * h2)) for h2 in h2_filled]

    # Hover text: "<index> | <description> | <h2> | <Neff>". Iterating the
    # columns in lockstep avoids one .loc lookup per cell and does not
    # depend on the index labels being unique.
    hover_text = [
        f"{idx} | {desc} | {h2:.3f} | {neff:.2f}"
        for idx, desc, h2, neff in zip(
            trait_df.index,
            trait_df['short_description'],
            h2_observed,
            trait_df['Neff'],
        )
    ]

    plot_dict = dict(
        x=embedding[:, 0],
        y=embedding[:, 1],
        trait_type_code=[f"{x}" for x in trait_labels],
        h2_fill_alpha=_scaled_alpha(alpha_factor, 0.6),
        h2_line_alpha=_scaled_alpha(1.3 * alpha_factor, 0.8),
        fulldesc=hover_text,
    )

    color_mapping = CategoricalColorMapper(
        factors=[f"{x}" for x in np.unique(trait_labels)],
        palette=color_palette,
    )

    plot_tooltips = [
        ("Desc", "@fulldesc"),
    ]

    ax = bokeh_figure(
        width=800,
        height=800,
        tooltips=plot_tooltips,
        title=plot_title,
    )

    # scatter() draws circle markers by default; circle(size=...) is
    # deprecated in recent Bokeh releases.
    ax.scatter(
        'x', 'y',
        size=10,
        source=ColumnDataSource(plot_dict),
        color=dict(field='trait_type_code', transform=color_mapping),
        line_alpha=dict(field='h2_line_alpha'),
        fill_alpha=dict(field='h2_fill_alpha'),
    )

    ax.title.text_font_size = '16pt'
    ax.title.text_font_style = 'normal'
    ax.title.text_font = 'tahoma'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax
# Work on a copy of the palette and replace its first entry with
# "#3c3c3c0d" (an RGBA hex color with a low alpha channel).
bokeh_colors = hex_colors_40.copy()
bokeh_colors[0] = "#3c3c3c0d"

llm_method = "ls-da3m0ns/bge_large_medical"
llm_ctype = "community"

# The heritability-to-opacity multiplier depends on the clustering type.
if llm_ctype == "kmeans":
    alpha_factor = 10
else:
    alpha_factor = 100

labels = get_llm_cluster_labels(np.array(trait_df.index), llm_method, llm_ctype)

# One NNM figure per UMAP metric.
axlist = [
    get_bokeh_plot(
        nnm_embedding[metric],
        labels,
        trait_df,
        f"NNM, {metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for metric in umap_metrics
]

# Stack the figures vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
# One tSVD figure per UMAP metric, using the same labels, palette and
# alpha scaling as the other plots.
axlist = [
    get_bokeh_plot(
        tsvd_embedding[metric],
        labels,
        trait_df,
        f"tSVD, {metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for metric in umap_metrics
]

# Stack the figures vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
(a) UMAP embeddings of the tSVD loadings.
Figure 2
labels = get_llm_cluster_labels(np.array(trait_df.index), llm_method, llm_ctype)

# Drop prescription traits. A single positional boolean mask is applied to
# both the DataFrame and the labels, so the two stay aligned even when the
# DataFrame index is not 0..n-1 (index labels are not used as positions).
# NOTE(review): assumes `labels` has one entry per row of trait_df, in row
# order -- confirm against get_llm_cluster_labels.
is_not_rx = (trait_df['trait_type'] != "prescriptions").to_numpy()
trait_df_noRx = trait_df[is_not_rx]
labels_noRx = list(np.asarray(labels)[is_not_rx])

# One NNM figure per UMAP metric, built from the no-prescription embeddings.
axlist = [
    get_bokeh_plot(
        nnm_embedding_noRx[metric],
        labels_noRx,
        trait_df_noRx,
        f"NNM, {metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for metric in umap_metrics
]

# Stack the figures vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
(a) UMAP embeddings of the NNM loadings except prescriptions.
Figure 3
# One tSVD figure per UMAP metric, built from the no-prescription
# embeddings, labels and trait table.
axlist = [
    get_bokeh_plot(
        tsvd_embedding_noRx[metric],
        labels_noRx,
        trait_df_noRx,
        f"tSVD, {metric}, {llm_method}, {llm_ctype} clustering",
        alpha_factor=alpha_factor,
        color_palette=bokeh_colors,
    )
    for metric in umap_metrics
]

# Stack the figures vertically and display them.
p = bokeh_column(*axlist)
bokeh_show(p)
(a) UMAP embeddings of the tSVD loadings except prescriptions.