"""PhenoGenius Streamlit web application.

Given a list of HPO (Human Phenotype Ontology) terms, the app displays:

* a projection of the clinical description onto groups of interacting
  symptoms (built from a pre-trained NMF model loaded from disk),
* gene rankings by direct phenotype match, by symptom similarity and by
  groups of symptoms,
* optionally, where a user-selected gene falls in each ranking.

All resources are read from ``data/resources`` relative to the working
directory.
"""

# NOTE(review): `Str` is unused in this file and `ast.Str` is deprecated in
# recent Python versions; kept only because imports are not removed here.
from ast import Str
from collections import Counter
import math
import pickle as pk

import numpy as np
import pandas as pd
from PIL import Image
from plotnine import *
import sklearn
import streamlit as st
import ujson as json

# -- Constants
ROOT_HPO = "HP:0000001"
HPO_ID_LENGTH = 10  # "HP:" followed by 7 digits
SIMILARITY_THRESHOLD = 0.8
DEFAULT_HPO_NAMES = ["Renal cyst", "Hepatic cysts"]
DEFAULT_HPO_IDS = "HP:0000107,HP:0001407"
# Rank cut-offs (inclusive) used for the phenotype specificity tiers.
TOP_SPECIFIC_RANK = 40
TOP_CONSISTENT_RANK = 250

# -- Set page config
apptitle = "PhenoGenius"

st.set_page_config(
    page_title=apptitle,
    page_icon=":genie:",
    layout="wide",
    initial_sidebar_state="auto",
)

# -- Set Sidebar
image_pg = Image.open("data/img/phenogenius.png")
st.sidebar.image(image_pg, caption=None, width=100)
st.sidebar.title("PhenoGenius")

st.sidebar.header(
    "Learning phenotypic patterns in genetic diseases by symptom interaction modeling"
)

st.sidebar.markdown(
    """
 This webapp presents symptom interaction models in genetic diseases to provide:
- Standardized clinical descriptions
- Interpretable matches between symptoms and genes

Code source is available in GitHub:
[https://github.com/kyauy/PhenoGenius](https://github.com/kyauy/PhenoGenius)

PhenoGenius is a collaborative project from:
"""
)

image_uga = Image.open("data/img/logo-uga.png")
st.sidebar.image(image_uga, caption=None, width=95)

image_seqone = Image.open("data/img/logo-seqone.png")
st.sidebar.image(image_seqone, caption=None, width=95)

image_miai = Image.open("data/img/logoMIAI-rvb.png")
st.sidebar.image(image_miai, caption=None, width=95)

image_chuga = Image.open("data/img/logo-chuga.png")
st.sidebar.image(image_chuga, caption=None, width=60)


# -- Cached resource loaders
@st.cache_data(max_entries=50)
def convert_df(df):
    """Serialize *df* as UTF-8 encoded, tab-separated bytes for download."""
    return df.to_csv(sep="\t").encode("utf-8")


@st.cache_data(max_entries=50)
def load_data():
    """Load the gzip-compressed gene x HPO matrix (first column is the index)."""
    matrix = pd.read_csv(
        "data/resources/ohe_all_thesaurus_weighted.tsv.gz",
        sep="\t",
        compression="gzip",
        index_col=0,
    )
    return matrix


@st.cache_data(max_entries=50)
def load_umap_cohort():
    """Load the cohort UMAP coordinates table (first column is the index)."""
    matrix = pd.read_csv(
        "data/resources/umap_loc_cohort.tsv",
        sep="\t",
        index_col=0,
    )
    return matrix


@st.cache_data(max_entries=50)
def load_cohort():
    """Load the cohort diagnosis table."""
    matrix = pd.read_csv(
        "data/resources/cohort_diag.tsv",
        sep="\t",
    )
    return matrix


@st.cache_data(hash_funcs={"Pickle": lambda _: None}, max_entries=50)
def load_nmf_model():
    """Load the pickled NMF model and its pickled reduced matrix.

    Returns:
        Tuple ``(pheno_NMF, reduced)`` as stored in the two pickle files.
    """
    # NOTE(security): pickle executes arbitrary code on load; these files must
    # only ever come from the trusted, bundled resources directory.
    with open("data/resources/pheno_NMF_390_model_42.pkl", "rb") as pickle_file:
        pheno_NMF = pk.load(pickle_file)
    with open("data/resources/pheno_NMF_390_matrix_42.pkl", "rb") as pickle_file:
        reduced = pk.load(pickle_file)
    return pheno_NMF, reduced


@st.cache_data(max_entries=50)
def symbol_to_id_to_dict():
    """Build gene symbol <-> NCBI GeneID lookups for human genes (tax id 9606).

    Returns:
        Tuple ``(symbol_to_gene_id, gene_id_to_symbol)`` of dictionaries.
    """
    # from NCBI
    ncbi_df = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
    ncbi_df = ncbi_df[ncbi_df["#tax_id"] == 9606]
    ncbi_to_dict_ncbi = ncbi_df.set_index("Symbol")["GeneID"].to_dict()
    ncbi_to_dict = ncbi_df.set_index("GeneID")["Symbol"].to_dict()
    return ncbi_to_dict_ncbi, ncbi_to_dict


@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_hp_ontology():
    """Load the HPO ontology JSON, keyed by HPO id."""
    with open("data/resources/hpo_obo.json") as json_data:
        data_dict = json.load(json_data)
    return data_dict


@st.cache_data(max_entries=50)
def hpo_description_to_id():
    """Map each HPO term name to its id, using the module-level ``hp_onto``."""
    return {value["name"]: key for key, value in hp_onto.items()}


@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_cluster_data():
    """Load the cluster information JSON."""
    with open("data/resources/cluster_info.json") as json_data:
        data_dict = json.load(json_data)
    return data_dict


@st.cache_data(max_entries=50)
def load_topic_data():
    """Load the table describing the groups of symptoms (NMF topics)."""
    topic = pd.read_csv(
        "data/resources/main_topics_hpo_390_42_filtered_norm_004.tsv",
        sep="\t",
        index_col=0,
    )
    return topic


@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_similarity_dict():
    """Load the HPO term similarity JSON (term -> {similar term: similarity})."""
    with open("data/resources/similarity_dict_threshold_80.json") as json_data:
        data_dict = json.load(json_data)
    return data_dict


# NOTE: a disabled (commented-out) cohort/UMAP projection section and its
# `load_projection` loader used to live in this file. They were dead code and
# have been dropped; `load_umap_cohort`, `load_cohort` and `load_cluster_data`
# are kept above so the section can be restored from version control.


# -- Lookup helpers
def get_symbol(gene):
    """Return the gene symbol for NCBI id *gene*, or ``None`` if unknown."""
    return symbol.get(gene)


def get_hpo_name(hpo):
    """Return ``{hpo: name}`` if *hpo* is in the ontology, else an empty dict."""
    if hpo in hp_onto:
        return {hpo: hp_onto[hpo]["name"]}
    return {}


def get_hpo_name_only(hpo):
    """Return the name of *hpo*, or ``None`` if it is not in the ontology."""
    entry = hp_onto.get(hpo)
    return None if entry is None else entry["name"]


def get_hpo_name_list(hpo_list, hp_onto):
    """Return ``{hpo: name}`` for every term of *hpo_list* found in *hp_onto*."""
    return {hpo: hp_onto[hpo]["name"] for hpo in hpo_list if hpo in hp_onto}


def get_similar_terms(hpo_list, similarity_terms_dict):
    """Extend *hpo_list* with similar terms and weight every term.

    Input terms get weight 1. A term similar to an input term (similarity
    strictly above the threshold) gets ``similarity / number of neighbours of
    the input term``; when reached several times, the highest weight is kept.

    Returns:
        Tuple ``(weights, terms)`` where *weights* maps term -> weight and
        *terms* lists the keys of *weights*.
    """
    hpo_list_w_simi = {}
    for term in hpo_list:
        hpo_list_w_simi[term] = 1
        neighbours = similarity_terms_dict.get(term, {})
        for key, value in neighbours.items():
            if value > SIMILARITY_THRESHOLD:
                weight = value / len(neighbours)
                hpo_list_w_simi[key] = max(hpo_list_w_simi.get(key, weight), weight)
    return hpo_list_w_simi, list(hpo_list_w_simi)


# -- Scoring
def score(hpo_list, matrix):
    """Rank genes by the sum of their *matrix* values over *hpo_list*.

    Returns:
        A new DataFrame with the selected HPO columns plus ``sum`` and
        ``gene_symbol``, sorted by ``sum`` in descending order.
    """
    # Copy so that adding columns never writes into (a view of) the cached matrix.
    matrix_filter = matrix[hpo_list].copy()
    matrix_filter["sum"] = matrix_filter.sum(axis=1)
    matrix_filter["gene_symbol"] = matrix_filter.index.to_series().apply(get_symbol)
    return matrix_filter.sort_values("sum", ascending=False)


def score_sim_add(hpo_list_add, matrix, sim_dict):
    """Rank genes like :func:`score`, scaling each column by its *sim_dict* weight.

    Args:
        hpo_list_add: HPO columns to select from *matrix*.
        matrix: gene x HPO matrix.
        sim_dict: column -> multiplicative weight; every key must be selected.
    """
    matrix_filter = matrix[hpo_list_add].copy()
    for key, value in sim_dict.items():
        matrix_filter[key] = matrix_filter[key] * value
    matrix_filter["sum"] = matrix_filter.sum(axis=1)
    matrix_filter["gene_symbol"] = matrix_filter.index.to_series().apply(get_symbol)
    return matrix_filter.sort_values("sum", ascending=False)


def get_phenotype_specificity(gene_diag, data_patient):
    """Return the specificity tier (A-D) of *gene_diag* from its rank.

    Args:
        gene_diag: gene symbol, looked up in the module-level ``ncbi`` mapping.
        data_patient: DataFrame indexed by NCBI gene id with a ``rank`` column.
    """
    rank = data_patient.loc[int(ncbi[gene_diag]), "rank"]
    max_rank = data_patient["rank"].max()
    # Being tied for the worst rank takes precedence over the tier cut-offs.
    if rank == max_rank:
        return "D - the reported phenotype is NOT consistent with what is expected for the gene/genomic region or not consistent in general."
    if rank <= TOP_SPECIFIC_RANK:
        return "A - the reported phenotype is highly specific and relatively unique to the gene (top 40, 50 perc of diagnosis in PhenoGenius cohort)."
    # Inclusive, so that "top 250" matches the message (previously `< 250`).
    if rank <= TOP_CONSISTENT_RANK:
        return "B - the reported phenotype is consistent with the gene, is highly specific, but not necessarily unique to the gene (top 250, 75 perc of diagnosis in PhenoGenius cohort)."
    return "C - the phenotype is reported with limited association with the gene, not highly specific and/or with high genetic heterogeneity."


def get_relatives_list(hpo_list, hp_onto):
    """Return the de-duplicated terms of *hpo_list* with their parents and children."""
    relatives = set()
    for hpo in hpo_list:
        relatives.add(hpo)
        if hpo in hp_onto:
            relatives.update(hp_onto[hpo]["parents"])
            relatives.update(hp_onto[hpo]["childrens"])
    return list(relatives)


def get_hpo_id(hpo_list):
    """Convert HPO term names to a comma-separated string of HPO ids."""
    return ",".join(hp_desc_id[description] for description in hpo_list)


# -- Input validation helpers
def _has_gene_association(hpo, data):
    """Return True if *hpo* is a column of *data* with at least one non-zero value."""
    return hpo in data.columns and data[hpo].astype(bool).sum(axis=0) != 0


def _first_associated_ancestor(hpo, data, hp_onto):
    """Walk up first direct parents of *hpo* to a term with gene associations.

    Returns the ancestor id, or ``None`` when the root (or a term without a
    known parent) is reached first.
    """
    current = hpo
    while True:
        parents = hp_onto.get(current, {}).get("direct_parent") or []
        if not parents:
            return None
        current = parents[0]
        if current == ROOT_HPO:
            return None
        if _has_gene_association(current, data):
            return current


def _resolve_hpo_terms(hpo_terms, data, hp_onto):
    """Validate user HPO ids and return the usable ones, in input order.

    Terms without any gene association are replaced by their closest ancestor
    that has one. Every rejected or replaced term is reported in the page.
    """
    resolved = []
    for term in hpo_terms:
        if term == ROOT_HPO:
            continue
        if len(term) != HPO_ID_LENGTH:
            st.write(
                "Incorrect HPO format: "
                + term
                + ". Please check (7-digits terms with prefix HP:, and separed by commas)."
            )
        elif term not in data.columns:
            st.write(term + " not available in current database. Please modify.")
        elif _has_gene_association(term, data):
            resolved.append(term)
        else:
            replacement = _first_associated_ancestor(term, data, hp_onto)
            if replacement is None:
                st.write(
                    "No gene-HPO associations was found for " + term + " and parents."
                )
            else:
                resolved.append(replacement)
                # Report the ancestor actually used, not merely the direct parent.
                st.write(
                    "We replaced: ",
                    term,
                    " by ",
                    replacement,
                    "-",
                    get_hpo_name(replacement),
                )
    # De-duplicate while keeping a deterministic (input) order.
    return list(dict.fromkeys(resolved))


def _show_gene_diagnosis(gene_diag, ranked, score_column, show_row=False):
    """Display where *gene_diag* stands in the *ranked* gene table.

    Args:
        gene_diag: gene symbol selected by the user.
        ranked: DataFrame indexed by NCBI gene id with ``rank``, ``sum`` and
            *score_column* columns.
        score_column: column plotted as the score distribution.
        show_row: also display the full row of the gene.
    """
    gene_id = int(ncbi[gene_diag])
    if gene_id not in ranked.index:
        st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
        return

    plot = (
        ggplot(ranked, aes(score_column))
        + geom_density()
        + geom_vline(
            xintercept=ranked.loc[gene_id, score_column],
            linetype="dashed",
            color="red",
            size=1.5,
        )
        + ggtitle("Matching score distribution")
        + xlab("Gene matching score")
        + ylab("% of genes")
        + theme_bw()
        + theme(
            text=element_text(size=12),
            figure_size=(5, 5),
            axis_ticks=element_line(colour="black", size=4),
            axis_line=element_line(colour="black", size=2),
            axis_text_x=element_text(angle=45, hjust=1),
            axis_text_y=element_text(angle=60, hjust=1),
            subplots_adjust={"wspace": 0.1},
            legend_position=(0.7, 0.35),
        )
    )
    # Only the first of three columns is used, to keep the figure small.
    col1, _, _ = st.columns(3)
    with col1:
        st.pyplot(ggplot.draw(plot))

    st.write(
        "Gene ID rank:",
        ranked.loc[gene_id, "rank"],
        " | ",
        "Gene ID count:",
        round(ranked.loc[gene_id, "sum"], 4),
    )
    if show_row:
        st.write(ranked.loc[[gene_id]])
    st.write(
        "Gene ID phenotype specificity:",
        get_phenotype_specificity(gene_diag, ranked),
    )


# -- Global lookups
hp_onto = load_hp_ontology()
hp_desc_id = hpo_description_to_id()
ncbi, symbol = symbol_to_id_to_dict()

# -- Input form
with st.form("my_form"):
    c1, c2 = st.columns(2)
    with c1:
        hpo_raw = st.multiselect(
            "Select interactively your HPOs or...",
            list(hp_desc_id.keys()),
            DEFAULT_HPO_NAMES,
        )
    with c2:
        hpo = st.text_input(
            "copy/paste your HPOs, separated with comma",
            DEFAULT_HPO_IDS,
        )

    gene_diag_input = st.multiselect(
        "Optional: provide HGNC gene symbol to be tested",
        options=list(ncbi.keys()),
        default=["PKD1"],
        max_selections=1,
    )

    submit_button = st.form_submit_button(
        label="Submit",
    )

# -- Results
if submit_button:
    # The interactive selection overrides the text box only when it was changed
    # from its default value and is not empty.
    if hpo_raw and hpo_raw != DEFAULT_HPO_NAMES:
        hpo = get_hpo_id(hpo_raw)

    data = load_data()
    pheno_NMF, reduced = load_nmf_model()
    topic = load_topic_data()
    similarity_terms_dict = load_similarity_dict()

    # Tolerate spaces around the commas and ignore empty entries.
    hpo_list_ini = [term.strip() for term in hpo.split(",") if term.strip()]

    gene_diag = None
    if gene_diag_input:
        selected_gene = gene_diag_input[0]
        if selected_gene in ncbi:
            gene_diag = selected_gene
        else:
            st.write(
                selected_gene
                + " gene are not in our database. Please check gene name (need to be in CAPITAL format)."
            )

    hpo_list = _resolve_hpo_terms(hpo_list_ini, data, hp_onto)

    if hpo_list:
        with st.expander("See HPO inputs"):
            st.write(get_hpo_name_list(hpo_list_ini, hp_onto))

        # ---- Clinical description projected onto groups of symptoms
        st.header("Clinical description with symptom interaction modeling")

        # The "witness" is an empty description (no HPO term), used as baseline.
        witness_nmf = np.matmul(pheno_NMF.components_, np.zeros(len(data.columns)))
        patient = data.columns.isin(hpo_list).astype(float)
        patient_nmf = np.matmul(pheno_NMF.components_, patient)

        reduced_df = pd.DataFrame(reduced).set_index(data.index)
        # Squared distance of every gene to the patient / witness, per group.
        patient_sq = (reduced_df - patient_nmf) ** 2
        witness_sq = (reduced_df - witness_nmf) ** 2

        case_sugg_df = (patient_sq - witness_sq).sum()
        # Per-gene totals, reused by the "groups of symptoms" ranking below.
        patient_df = patient_sq.sum(axis=1)
        witness_df = witness_sq.sum(axis=1)
        del patient_sq, witness_sq, reduced_df

        patient_df_info = pd.DataFrame(case_sugg_df).merge(
            topic, left_index=True, right_index=True
        )
        patient_df_info["mean_score"] = round(
            patient_df_info[0] / (patient_df_info["total_weight"] ** 2), 4
        )
        patient_df_info_write = patient_df_info[
            ["mean_score", "main_term", "n_hpo", "hpo_name", "hpo_list", "weight"]
        ].sort_values("mean_score", ascending=False)

        with st.expander("See projection in groups of symptoms dimension*"):
            st.dataframe(patient_df_info_write)
            st.write(
                "\\* For interpretability, we report only the top 10% of the 390 groups of interacting symptom associations"
            )
            match_proj_csv = convert_df(patient_df_info_write)
            st.download_button(
                "Download description projection",
                match_proj_csv,
                "clin_desc_projected.tsv",
                "text/csv",
                key="download-csv-proj",
            )

        # ---- Similar symptoms
        sim_dict, hpo_list_add = get_similar_terms(hpo_list, similarity_terms_dict)
        similar_list = list(set(hpo_list_add) - set(hpo_list))
        similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
        if similar_list_desc:
            with st.expander("See symptoms with similarity > 80%"):
                similar_list_desc_df = pd.DataFrame.from_dict(
                    similar_list_desc, orient="index"
                )
                similar_list_desc_df.columns = ["description"]
                st.write(similar_list_desc_df)

        # ---- Phenotype matching
        st.header("Phenotype matching")

        results_sum = score(hpo_list, data)
        results_sum["matchs"] = results_sum[hpo_list].astype(bool).sum(axis=1)
        results_sum["score"] = results_sum["matchs"] + results_sum["sum"]
        results_sum["rank"] = (
            results_sum["score"].rank(ascending=False, method="max").astype(int)
        )
        # Move the 4 summary columns (gene_symbol, matchs, score, rank) first.
        cols = results_sum.columns.tolist()
        cols = cols[-4:] + cols[:-4]
        match = results_sum[cols].sort_values(by=["score"], ascending=False)
        st.dataframe(match[match["score"] > 1.01].drop(columns=["sum"]))

        match_csv = convert_df(match)
        st.download_button(
            "Download matching results",
            match_csv,
            "match.tsv",
            "text/csv",
            key="download-csv-match",
        )

        if gene_diag:
            _show_gene_diagnosis(gene_diag, results_sum, "score", show_row=True)
        del results_sum, match

        # ---- Phenotype matching by similarity of symptoms
        st.header("Phenotype matching by similarity of symptoms")

        results_sum_add = score_sim_add(hpo_list_add, data, sim_dict)
        results_sum_add["rank"] = (
            results_sum_add["sum"].rank(ascending=False, method="max").astype(int)
        )
        # Move the 2 summary columns (gene_symbol, rank) first.
        cols = results_sum_add.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        match_sim = results_sum_add[cols].sort_values(by=["sum"], ascending=False)
        st.dataframe(match_sim[match_sim["sum"] > 0.01])

        match_sim_csv = convert_df(match_sim)
        st.download_button(
            "Download matching results",
            match_sim_csv,
            "match_sim.tsv",
            "text/csv",
            key="download-csv-match-sim",
        )

        if gene_diag:
            _show_gene_diagnosis(gene_diag, results_sum_add, "sum")
        del results_sum_add, match_sim

        # ---- Phenotype matching by groups of symptoms
        st.header("Phenotype matching by groups of symptoms")

        case_df = (patient_df - witness_df).to_frame("score")
        # Flip the scale so that the gene closest to the patient scores highest.
        case_df["score_norm"] = abs(case_df["score"] - case_df["score"].max())
        case_df["sum"] = case_df["score_norm"]
        case_df_sort = case_df.sort_values(by="sum", ascending=False)
        case_df_sort["rank"] = (
            case_df_sort["sum"].rank(ascending=False, method="max").astype(int)
        )
        case_df_sort["gene_symbol"] = case_df_sort.index.to_series().apply(get_symbol)
        match_nmf = case_df_sort[["gene_symbol", "rank", "sum"]]
        st.dataframe(match_nmf[match_nmf["sum"] > 0.01])

        match_nmf_csv = convert_df(match_nmf)
        st.download_button(
            "Download matching results",
            match_nmf_csv,
            "match_groups.tsv",
            "text/csv",
            key="download-csv-match-groups",
        )

        if gene_diag:
            _show_gene_diagnosis(gene_diag, case_df_sort, "sum")
        del case_df, case_df_sort, match_nmf
    else:
        st.write(
            "No HPO terms provided in correct format.",
        )