# PhenoGene interactive demo (Streamlit app).
#
# Loads pre-computed gene and HPO-term embeddings from CSV, lets the user
# enter a comma-separated list of HPO terms, averages their embeddings and
# ranks every gene by cosine similarity to that average.
#
# NOTE: Streamlit re-executes this script top to bottom on every user
# interaction, so module-level state such as `embs` starts fresh each run.
import os

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.metrics.pairwise import (
    cosine_similarity,
    euclidean_distances,
    manhattan_distances,
)

# Must be the first Streamlit command executed in the script.
st.set_page_config(
    page_title="PhenoGene",
    page_icon="🧑‍💻",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoGene v1.0",
    },
)

# Accumulator for the embeddings of the HPO terms entered by the user.
embs = []

# Heading
st.title('PhenoGene Interactive Demo')

with st.expander("About", expanded=True):
    st.write(
        """
- PhenoGene is a novel gene prioritization method capable of representing HPO terms into embeddings. Utilizing advanced graph embeddings methods, PhenoGene can learn an effective mapping between genes and HPO terms.
- Given a list of HPO terms, we compute the similarity with a Gene
- **Input:** List of HPO terms
- **Output:** Similarity score to the genes
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
        """
    )
    st.markdown("")

# Gene embeddings: one row per gene, indexed by the "gene" column.
gdf = pd.read_csv("data/diff2vec_gene_embd.csv").set_index("gene")
genes = gdf.index.tolist()
gene_emb = gdf.values
# Each vector is stored as a (1, dim) row so it can be passed directly to the
# scikit-learn pairwise functions; -1 lets the width follow the CSV instead of
# being hard-coded.
gene_emb_data = {g: vec.reshape(1, -1) for g, vec in zip(genes, gene_emb)}
st.session_state['gene_emb_data'] = gene_emb_data

# HPO-term embeddings: one row per term, indexed by the "hpo_id" column.
hdf = pd.read_csv("data/diff2vec_hpo_embd.csv").set_index("hpo_id")
hpos = hdf.index.tolist()
hpo_emb = hdf.values
hpo_emb_data = {h: vec.reshape(1, -1) for h, vec in zip(hpos, hpo_emb)}
st.session_state['hpo_emb_data'] = hpo_emb_data


@st.cache(allow_output_mutation=True)
def compute_similarity_with_gene(emb_src, genes, distance_metric='cosine'):
    """Rank genes by similarity between their embedding and ``emb_src``.

    Args:
        emb_src: Query embedding as a 2-D array of shape (1, dim).
        genes: Iterable of gene identifiers; each must be a key of the
            module-level ``gene_emb_data`` mapping. Repeated identifiers are
            scored once.
        distance_metric: Name of the metric. Only ``"cosine"`` is supported.

    Returns:
        A DataFrame with columns ``gene`` and ``score``, sorted by ``score``
        in descending order with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``distance_metric`` is not ``"cosine"``.
        KeyError: If a gene has no entry in ``gene_emb_data``.
    """
    if distance_metric != "cosine":
        # Previously an unknown metric silently produced an empty result.
        raise ValueError(
            f"Unsupported distance metric: {distance_metric!r}; "
            "only 'cosine' is implemented."
        )

    # Keep first-seen order while dropping repeats (one row per gene).
    unique_genes = list(dict.fromkeys(genes))
    if not unique_genes:
        return pd.DataFrame(columns=['gene', 'score'])

    # Stack the (1, dim) rows into one (n_genes, dim) matrix so similarity is
    # computed in a single call instead of one scikit-learn call per gene.
    gene_matrix = np.vstack([gene_emb_data[g] for g in unique_genes])
    scores = cosine_similarity(emb_src, gene_matrix).ravel()

    df = pd.DataFrame({'gene': unique_genes, 'score': scores})
    return df.sort_values(by='score', ascending=False).reset_index(drop=True)


@st.cache
def convert_df(df):
    """Serialize ``df`` to UTF-8 encoded CSV bytes for the download button."""
    # IMPORTANT: Cache the conversion to prevent computation on every rerun
    return df.to_csv().encode('utf-8')


st.subheader("🖮 Enter HPO terms separated by a comma")
hpo_terms_text = st.text_area('Example: HP_0000006, HP_0000006', "HP_0000006, HP_0000006")
# Drop empty tokens so a trailing comma or stray ",," is not treated as a
# (necessarily unknown) HPO term.
hpo_terms = [term for term in map(str.strip, hpo_terms_text.split(',')) if term]

st.subheader("💻 Hit Compute to calculate similarity to gene")
metrics = 'cosine'
no_emb = False

if st.button("Compute"):
    with st.spinner('Computing...'):
        if not hpo_terms:
            # Nothing usable was entered; there is no embedding to average.
            st.error("No Embeddings.")
            no_emb = True

        for h in hpo_terms:
            if h not in hpo_emb_data:
                st.error("No Embeddings.")
                no_emb = True
                break
            embs.append(hpo_emb_data[h])

        if not no_emb:
            # Only average and score when every term resolved; otherwise the
            # mean would be taken over a partial (or empty) list.
            # Stacking (1, dim) rows gives (n_terms, 1, dim); the mean over
            # axis 0 is the (1, dim) query vector.
            embs_mean = np.array(embs).mean(axis=0)
            result_df = compute_similarity_with_gene(
                embs_mean, genes, distance_metric=metrics
            )

    if no_emb:
        st.error("Embedding Error.")
    else:
        csv = convert_df(result_df)
        st.success("Done!")
        st.dataframe(result_df)
        st.download_button(
            label="Download results as CSV",
            data=csv,
            mime='text/csv',
        )