Each point is a PanUKB phenotype, colored according to the clusters identified by LLMs from the trait descriptions.
Code
# Small qualitative palette, used below to give each trait type its own color.
hex_colors = [
    '#2D69C4',  # blue
    '#CC2529',  # red
    '#93AA00',  # Vivid Yellowish Green
    '#535154',  # gray
    '#6B4C9A',  # purple
    '#FFB300',  # Vivid Yellow
    '#922428',  # dark brown
    '#948B3D',  # olive
]

# Large palette used as the default for cluster coloring.
# NOTE(review): the list holds 41 entries and the first one carries an alpha
# channel -- presumably it is meant for the "no cluster" label (-1); confirm.
hex_colors_40 = [
    "#3c3c3c0d",
    "#084609", "#ff4ff4", "#01d94a", "#b700ce", "#91c900",
    "#5f42ed", "#5fa200", "#8d6dff", "#c9f06b", "#0132a7",
    "#ffbb1f", "#0080ed", "#f56600", "#3afaf5", "#c10001",
    "#01e698", "#a20096", "#00e2c1", "#ff5ac8", "#008143",
    "#cd0057", "#4aeeff", "#8c001a", "#b5f2a2", "#5d177d",
    "#a99900", "#e299ff", "#5b6b00", "#96aeff", "#a46f00",
    "#007acb", "#ff9757", "#00a8e0", "#ff708e", "#baefc7",
    "#622b25", "#c8c797", "#885162", "#ffb7a5", "#ffa3c3",
]

# Map every distinct trait type to a color. zip() stops at the shorter input,
# so trait types beyond the length of hex_colors get no entry.
trait_type_unique = trait_df['trait_type'].unique().tolist()
trait_type_dict = dict(zip(trait_type_unique, hex_colors))

# Models whose clustering results are loaded, and the clustering types
# available for each of them.
llm_methods = [
    "ls-da3m0ns/bge_large_medical",
    "medicalai/ClinicalBERT",
    "emilyalsentzer/Bio_ClinicalBERT",
]
llm_ctypes = ["community", "kmeans"]

# llm_clusters[method][ctype] holds the unpickled clustering result.
llm_clusters = {method: dict.fromkeys(llm_ctypes) for method in llm_methods}
llm_outdir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/results/llm"

for method in llm_methods:
    for ctype in llm_ctypes:
        m_filename = os.path.join(llm_outdir, f"{method}/{ctype}_clusters.pkl")
        # NOTE: unpickling can execute arbitrary code; only load result files
        # that come from a trusted source.
        with open(m_filename, "rb") as fh:
            llm_clusters[method][ctype] = pickle.load(fh)


def get_llm_cluster_index(selectidx, method, ctype):
    """Return one cluster label per selected point for a given clustering.

    Parameters
    ----------
    selectidx : array
        Only its length (``selectidx.shape[0]``) is used; it fixes the size
        of the returned label array.
    method : str
        Key into ``llm_clusters`` (one of ``llm_methods``).
    ctype : str
        Key into ``llm_clusters[method]`` (one of ``llm_ctypes``).

    Returns
    -------
    numpy.ndarray
        Integer array where entry ``j`` is the position of the cluster that
        lists ``j`` among its members, or -1 if no cluster lists it. If a
        member appears in several clusters, the last one wins.
    """
    n_points = selectidx.shape[0]
    labels = np.full((n_points,), -1)
    # Each stored cluster is iterated as a collection of positions into the
    # label array.
    for cluster_id, members in enumerate(llm_clusters[method][ctype]):
        for member in members:
            labels[member] = cluster_id
    return labels
Code
def get_bokeh_plot(embedding, selectidx, clusteridx, trait_df, umap_string,
                   color_palette = hex_colors_40, alpha_factor = 10):
    """Build an interactive Bokeh scatter plot of a 2-D embedding.

    Points are colored by cluster label and their opacity grows with the
    value in the ``'estimates.final.h2_observed'`` column of ``trait_df``.

    Parameters
    ----------
    embedding : array
        Indexed as ``embedding[selectidx, 0]`` and ``embedding[selectidx, 1]``
        to obtain the x and y coordinates.
    selectidx : array
        Rows to plot. The same values are used as row positions into
        ``embedding`` and as index labels into ``trait_df`` (via ``.loc``),
        so the two must agree.
    clusteridx : array
        One cluster label per plotted point, in the same order as
        ``selectidx``.
    trait_df : pandas.DataFrame
        Must provide the columns ``'estimates.final.h2_observed'``,
        ``'short_description'`` and ``'Neff'``.
    umap_string : str
        Figure title.
    color_palette : sequence of str
        Colors assigned to the cluster labels. The first entry is reserved
        for the label ``-1``; the remaining entries are assigned to the
        other labels in sorted order.
    alpha_factor : float
        Multiplier that converts the h2 value into an opacity.

    Returns
    -------
    The configured Bokeh figure.
    """
    # Take the h2 values for the plotted rows only, and only once, so that the
    # alpha columns always have the same length and order as x / y / fulldesc.
    h2_values = (
        trait_df.loc[selectidx, 'estimates.final.h2_observed']
        .fillna(1e-6)
        .tolist()
    )

    plot_dict = dict(
        x = embedding[selectidx, 0],
        y = embedding[selectidx, 1],
        trait_type_code = [f"{x}" for x in clusteridx],
        # Opacity is capped from above (0.6 fill, 0.8 outline) and floored at
        # 0 so that a negative estimate cannot yield a negative alpha.
        h2_fill_alpha = [min(0.6, max(0.0, alpha_factor * x)) for x in h2_values],
        h2_line_alpha = [min(0.8, max(0.0, 1.3 * alpha_factor * x)) for x in h2_values],
        fulldesc = [
            f"{i} | {trait_df.loc[i, 'short_description']} | {trait_df.loc[i, 'estimates.final.h2_observed']:.3f} | {trait_df.loc[i, 'Neff']:.2f}"
            for i in selectidx
        ],
    )

    # Keep "-1" as the first factor even when no point carries that label.
    # Otherwise the first real cluster would take the first palette entry and
    # the label -> color assignment would differ between plots.
    # NOTE(review): assumes the first palette entry is intended for -1 -- confirm.
    factors = [f"{x}" for x in np.unique(clusteridx)]
    if "-1" not in factors:
        factors = ["-1"] + factors
    color_mapping = CategoricalColorMapper(factors = factors, palette = color_palette)

    plot_tooltips = [
        ("Desc", "@fulldesc"),
    ]

    ax = bokeh_figure(
        width = 800,
        height = 800,
        tooltips = plot_tooltips,
        title = umap_string,
    )

    ax.circle(
        'x', 'y',
        size = 10,
        source = ColumnDataSource(plot_dict),
        color = dict(field='trait_type_code', transform = color_mapping),
        line_alpha = dict(field='h2_line_alpha'),
        fill_alpha = dict(field='h2_fill_alpha'),
    )

    # Cosmetics: large fonts, thicker axes, no grid.
    ax.title.text_font_size = '20pt'
    ax.axis.major_label_text_font_size = '20pt'
    ax.axis.axis_line_width = 2
    ax.axis.major_tick_line_width = 2
    ax.grid.visible = False
    return ax
Code
# Plot every row of trait_df.
selectidx = np.array(trait_df.index)

# One figure per (model, clustering type) pair, in llm_methods x llm_ctypes order.
axlist = []
for llm_method in llm_methods:
    for llm_ctype in llm_ctypes:
        clusteridx = get_llm_cluster_index(selectidx, llm_method, llm_ctype)

        # "kmeans" plots use a smaller h2-to-opacity multiplier than the rest.
        if llm_ctype == "kmeans":
            alpha_factor = 10
        else:
            alpha_factor = 100

        plot_title = f"{llm_method} + {llm_ctype} clustering"
        axlist.append(
            get_bokeh_plot(
                X_embedding_cosine_30_01,
                selectidx,
                clusteridx,
                trait_df,
                plot_title,
                alpha_factor = alpha_factor,
            )
        )

# Stack all figures vertically in a single column layout ...
p = bokeh_column(*axlist)

# ... and render the result.
bokeh_show(p)