# PhenoGenius / phenogenius_app.py
# Author: Kévin Yauy
# Commit 86f91be — fix(dependencies): fix updates
from ast import Str
import streamlit as st
import numpy as np
import pandas as pd
from PIL import Image
import ujson as json
import pickle as pk
from collections import Counter
import math
import sklearn
from plotnine import *
# -- Set page config
apptitle = "PhenoGenius"
st.set_page_config(
page_title=apptitle,
page_icon=":genie:",
layout="wide",
initial_sidebar_state="auto",
)
# -- Set Sidebar
image_pg = Image.open("data/img/phenogenius.png")
st.sidebar.image(image_pg, caption=None, width=100)
st.sidebar.title("PhenoGenius")
st.sidebar.header(
"Learning phenotypic patterns in genetic diseases by symptom interaction modeling"
)
st.sidebar.markdown(
"""
This webapp presents symptom interaction models in genetic diseases to provide:
- Standardized clinical descriptions
- Interpretable matches between symptoms and genes
Code source is available in GitHub:
[https://github.com/kyauy/PhenoGenius](https://github.com/kyauy/PhenoGenius)
PhenoGenius is a collaborative project from:
"""
)
image_uga = Image.open("data/img/logo-uga.png")
st.sidebar.image(image_uga, caption=None, width=95)
image_seqone = Image.open("data/img/logo-seqone.png")
st.sidebar.image(image_seqone, caption=None, width=95)
image_miai = Image.open("data/img/logoMIAI-rvb.png")
st.sidebar.image(image_miai, caption=None, width=95)
image_chuga = Image.open("data/img/logo-chuga.png")
st.sidebar.image(image_chuga, caption=None, width=60)
@st.cache_data(max_entries=50)
def convert_df(df):
    """Serialize a dataframe to UTF-8 encoded TSV bytes for a download button."""
    tsv_text = df.to_csv(sep="\t")
    return tsv_text.encode("utf-8")
@st.cache_data(max_entries=50)
def load_data():
    """Load the weighted gene-by-HPO one-hot matrix (rows indexed by gene id)."""
    return pd.read_csv(
        "data/resources/ohe_all_thesaurus_weighted.tsv.gz",
        sep="\t",
        compression="gzip",
        index_col=0,
    )
@st.cache_data(max_entries=50)
def load_umap_cohort():
    """Load the precomputed UMAP coordinates of the cohort patients."""
    return pd.read_csv(
        "data/resources/umap_loc_cohort.tsv",
        sep="\t",
        index_col=0,
    )
@st.cache_data(max_entries=50)
def load_cohort():
    """Load the cohort diagnosis table (one row per diagnosed patient)."""
    return pd.read_csv(
        "data/resources/cohort_diag.tsv",
        sep="\t",
    )
@st.cache_data(hash_funcs={"Pickle": lambda _: None}, max_entries=50)
def load_nmf_model():
    """Load the fitted 390-component NMF model and the reduced gene matrix.

    Returns a tuple ``(pheno_NMF, reduced)`` unpickled from disk.
    """
    with open("data/resources/pheno_NMF_390_model_42.pkl", "rb") as fh:
        model = pk.load(fh)
    with open("data/resources/pheno_NMF_390_matrix_42.pkl", "rb") as fh:
        reduced_matrix = pk.load(fh)
    return model, reduced_matrix
@st.cache_data(max_entries=50)
def symbol_to_id_to_dict():
    """Build gene-name lookup tables from the NCBI gene_info dump.

    Returns ``(symbol_to_id, id_to_symbol)`` restricted to human genes
    (#tax_id == 9606).
    """
    ncbi_table = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
    human_genes = ncbi_table[ncbi_table["#tax_id"] == 9606]
    symbol_to_id = human_genes.set_index("Symbol")["GeneID"].to_dict()
    id_to_symbol = human_genes.set_index("GeneID")["Symbol"].to_dict()
    return symbol_to_id, id_to_symbol
@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_hp_ontology():
    """Load the HPO ontology dump (term id -> metadata such as name/parents)."""
    with open("data/resources/hpo_obo.json") as fh:
        return json.load(fh)
@st.cache_data(max_entries=50)
def hpo_description_to_id():
    """Invert the ontology: map human-readable HPO names to HPO term ids."""
    return {term["name"]: hpo_id for hpo_id, term in hp_onto.items()}
@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_cluster_data():
    """Load per-cluster metadata (genes, HPOs, counts) for the cohort clusters."""
    with open("data/resources/cluster_info.json") as fh:
        return json.load(fh)
@st.cache_data(max_entries=50)
def load_topic_data():
    """Load descriptions of the 390 NMF symptom groups (topics)."""
    return pd.read_csv(
        "data/resources/main_topics_hpo_390_42_filtered_norm_004.tsv",
        sep="\t",
        index_col=0,
    )
@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_similarity_dict():
    """Load the HPO-to-HPO similarity map (pairs with similarity above 0.80)."""
    with open("data/resources/similarity_dict_threshold_80.json") as fh:
        return json.load(fh)
# @st.cache_data(
# hash_funcs={"Pickle": lambda _: None}, max_entries=50
# )
# def load_projection():
# with open("data/resources/clustering_model.pkl", "rb") as pickle_file:
# cluster = pk.load(pickle_file)
# with open("data/resources/umap_projection.pkl", "rb") as pickle_file:
# umap = pk.load(pickle_file)
# return cluster, umap
def get_symbol(gene):
    """Return the HGNC symbol for an NCBI gene id, or None when unknown."""
    return symbol.get(gene)
def get_hpo_name(hpo):
    """Return ``{hpo: name}`` for a known HPO id, or an empty dict otherwise."""
    if hpo in hp_onto:
        return {hpo: hp_onto[hpo]["name"]}
    return {}
def get_hpo_name_only(hpo):
    """Return the readable name of an HPO id, or None if absent from hp_onto."""
    term = hp_onto.get(hpo)
    return term["name"] if term is not None else None
def get_hpo_name_list(hpo_list, hp_onto):
    """Map each HPO id in *hpo_list* to its name; ids missing from *hp_onto* are skipped."""
    return {hpo: hp_onto[hpo]["name"] for hpo in hpo_list if hpo in hp_onto}
def get_similar_terms(hpo_list, similarity_terms_dict):
    """Expand *hpo_list* with similar HPO terms and weight each term.

    Selected terms get weight 1. Each term whose similarity exceeds 0.8 is
    added with weight ``similarity / number_of_similar_terms``; when a term is
    reachable several ways, the highest weight wins.

    Returns ``(weights, terms)`` where *weights* maps term -> weight and
    *terms* is the list of all retained terms.
    """
    weights = {}
    for term in hpo_list:
        weights[term] = 1
        for similar, similarity in similarity_terms_dict.get(term, {}).items():
            if similarity <= 0.8:
                continue
            # Normalize by the size of the term's similarity neighbourhood.
            candidate = similarity / len(similarity_terms_dict[term])
            if similar not in weights or candidate > weights[similar]:
                weights[similar] = candidate
    return weights, list(weights)
def score(hpo_list, matrix):
    """Score every gene by summing its weights over the selected HPO columns.

    Returns a dataframe with the selected HPO columns plus a ``sum`` column
    (row-wise total) and a ``gene_symbol`` column, sorted by decreasing sum.
    """
    # .copy() avoids pandas chained-assignment warnings and guarantees the
    # cached source matrix is never mutated when the derived columns are added.
    matrix_filter = matrix[hpo_list].copy()
    matrix_filter["sum"] = matrix_filter.sum(axis=1)
    matrix_filter["gene_symbol"] = matrix_filter.index.to_series().apply(get_symbol)
    return matrix_filter.sort_values("sum", ascending=False)
def score_sim_add(hpo_list_add, matrix, sim_dict):
    """Score genes over the similarity-expanded HPO list.

    Each column is rescaled by its similarity weight from *sim_dict* before
    summing. Returns the weighted columns plus ``sum`` and ``gene_symbol``,
    sorted by decreasing sum.
    """
    # .copy() avoids pandas chained-assignment warnings and guarantees the
    # cached source matrix is never mutated by the in-place reweighting below.
    matrix_filter = matrix[hpo_list_add].copy()
    for term, weight in sim_dict.items():
        matrix_filter[term] = matrix_filter[term] * weight
    matrix_filter["sum"] = matrix_filter.sum(axis=1)
    matrix_filter["gene_symbol"] = matrix_filter.index.to_series().apply(get_symbol)
    return matrix_filter.sort_values("sum", ascending=False)
def get_phenotype_specificity(gene_diag, data_patient):
    """Translate the rank of *gene_diag* in *data_patient* into an A-D category.

    A: top 40, B: top 250, C: ranked but unspecific, D: worst (tied-last) rank.
    """
    gene_rank = data_patient.loc[int(ncbi[gene_diag]), "rank"]
    if gene_rank == data_patient["rank"].max():
        return "D - the reported phenotype is NOT consistent with what is expected for the gene/genomic region or not consistent in general."
    if gene_rank < 41:
        return "A - the reported phenotype is highly specific and relatively unique to the gene (top 40, 50 perc of diagnosis in PhenoGenius cohort)."
    if gene_rank < 250:
        return "B - the reported phenotype is consistent with the gene, is highly specific, but not necessarily unique to the gene (top 250, 75 perc of diagnosis in PhenoGenius cohort)."
    return "C - the phenotype is reported with limited association with the gene, not highly specific and/or with high genetic heterogeneity."
def get_relatives_list(hpo_list, hp_onto):
    """Return the deduplicated union of *hpo_list* with each term's direct
    parents and children as listed in *hp_onto*."""
    relatives = set(hpo_list)
    for hpo in hpo_list:
        if hpo in hp_onto:
            relatives.update(hp_onto[hpo]["parents"])
            relatives.update(hp_onto[hpo]["childrens"])
    return list(relatives)
def get_hpo_id(hpo_list):
    """Translate HPO term names into a comma-joined string of HPO ids.

    Raises KeyError if a name is unknown to the hp_desc_id lookup table.
    """
    return ",".join(hp_desc_id[description] for description in hpo_list)
# Lookup tables needed to render the form and validate its inputs.
hp_onto = load_hp_ontology()
hp_desc_id = hpo_description_to_id()
ncbi, symbol = symbol_to_id_to_dict()

# hpo = form.text_input(
#     label="Provide your HPOs (separated by comma)",
#     value="HP:0000107,HP:0000108,HP:0001407",
# )

# -- Input form: HPO terms (interactive multiselect OR free-text ids) and an
# optional gene symbol to evaluate against the matching results.
with st.form("my_form"):
    c1, c2 = st.columns(2)
    with c1:
        # Interactive selection by human-readable HPO name.
        hpo_raw = st.multiselect(
            "Select interactively your HPOs or...",
            list(hp_desc_id.keys()),
            ["Renal cyst", "Hepatic cysts"],
        )
    with c2:
        # Free-text alternative: comma-separated HPO identifiers.
        hpo = st.text_input(
            "copy/paste your HPOs, separated with comma",
            "HP:0000107,HP:0001407",
        )
    # Optional gene to test; st.multiselect returns a list (here 0 or 1 item).
    gene_diag_input = st.multiselect(
        "Optional: provide HGNC gene symbol to be tested",
        options=list(ncbi.keys()),
        default=["PKD1"],
        max_selections=1,
    )
    submit_button = st.form_submit_button(
        label="Submit",
    )

# form = st.form(key="my_form")
# gene_diag_input = form.text_input(
#     label="Optional: provide HGNC gene symbol to be tested (in CAPITAL format)",
#     value="PKD1",
# )
if submit_button:
    # The interactive multiselect overrides the free-text field, unless it is
    # still the untouched default example.
    if hpo_raw != ["Renal cyst", "Hepatic cysts"] and len(hpo_raw) > 0:
        hpo = get_hpo_id(hpo_raw)

    # Load all cached resources needed for matching.
    data = load_data()
    pheno_NMF, reduced = load_nmf_model()
    # cluster, umap = load_projection()
    # umap_cohort = load_umap_cohort()
    cohort = load_cohort()
    cluster_info = load_cluster_data()
    topic = load_topic_data()
    similarity_terms_dict = load_similarity_dict()

    hpo_list_ini = hpo.strip().split(",")

    # Validate the optional gene symbol to test.
    if gene_diag_input:
        if gene_diag_input[0] in ncbi.keys():
            gene_diag = gene_diag_input[0]
        else:
            # BUGFIX: gene_diag_input is a list (st.multiselect return value);
            # concatenating it with a str raised TypeError. Report the symbol.
            st.write(
                gene_diag_input[0]
                + " gene are not in our database. Please check gene name (need to be in CAPITAL format)."
            )
            gene_diag = None
    else:
        gene_diag = None

    # Sanitize the HPO list: drop the root term, reject malformed ids and
    # unknown terms, and replace terms without any gene association by their
    # closest informative ancestor.
    hpo_list_up = []
    for hpo in hpo_list_ini:
        if hpo in ["HP:0000001"]:
            pass
        elif len(hpo) != 10:
            st.write(
                "Incorrect HPO format: "
                + hpo
                + ". Please check (7-digits terms with prefix HP:, and separed by commas)."
            )
        elif hpo not in data.columns:
            st.write(hpo + " not available in current database. Please modify.")
        else:
            if data[hpo].astype(bool).sum(axis=0) != 0:
                hpo_list_up.append(hpo)
            else:
                # Walk up the ontology until an ancestor with at least one
                # gene association is found (or the root is reached).
                hpo_to_test = hp_onto[hpo]["direct_parent"][0]
                while data[hpo_to_test].astype(bool).sum(
                    axis=0
                ) == 0 and hpo_to_test not in ["HP:0000001"]:
                    hpo_to_test = hp_onto[hpo_to_test]["direct_parent"][0]
                if hpo_to_test in ["HP:0000001"]:
                    st.write(
                        "No gene-HPO associations was found for "
                        + hpo
                        + " and parents."
                    )
                else:
                    hpo_list_up.append(hpo_to_test)
                    # BUGFIX: report the term actually used (hpo_to_test),
                    # which may be higher than the direct parent when the
                    # while-loop above climbed several levels.
                    st.write(
                        "We replaced: ",
                        hpo,
                        " by ",
                        hpo_to_test,
                        "-",
                        get_hpo_name(hpo_to_test),
                    )
    hpo_list = list(set(hpo_list_up))
    del hpo_list_up

    if hpo_list:
        with st.expander("See HPO inputs"):
            st.write(get_hpo_name_list(hpo_list_ini, hp_onto))
        del hpo_list_ini

        hpo_list_name = get_relatives_list(hpo_list, hp_onto)

        st.header("Clinical description with symptom interaction modeling")

        # Project the patient into the 390-dimension NMF symptom-group space.
        # An all-zero "witness" vector is projected too and subtracted below,
        # centering the patient's projection scores.
        witness = np.zeros(len(data.columns))
        witness_nmf = np.matmul(pheno_NMF.components_, witness)
        patient = np.zeros(len(data.columns))
        for hpo in hpo_list:
            hpo_index = list(data.columns).index(hpo)
            patient[hpo_index] = 1
        patient_nmf = np.matmul(pheno_NMF.components_, patient)

        witness_sugg_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: (x - witness_nmf) ** 2, axis=1)
        )
        patient_sugg_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: (x - patient_nmf) ** 2, axis=1)
        )
        case_sugg_df = (patient_sugg_df - witness_sugg_df).sum()

        patient_df_info = pd.DataFrame(case_sugg_df).merge(
            topic, left_index=True, right_index=True
        )
        patient_df_info["mean_score"] = round(
            patient_df_info[0] / (patient_df_info["total_weight"] ** 2), 4
        )
        patient_df_info_write = patient_df_info[
            ["mean_score", "main_term", "n_hpo", "hpo_name", "hpo_list", "weight"]
        ].sort_values("mean_score", ascending=False)
        # Free large intermediates early to keep the session's memory low.
        del case_sugg_df
        del patient_sugg_df
        del witness_sugg_df
        del patient

        with st.expander("See projection in groups of symptoms dimension*"):
            st.dataframe(patient_df_info_write)
            st.write(
                "\* For interpretability, we report only the top 10% of the 390 groups of interacting symptom associations"
            )
            match_proj_csv = convert_df(patient_df_info_write)
            st.download_button(
                "Download description projection",
                match_proj_csv,
                "clin_desc_projected.tsv",
                "text/csv",
                key="download-csv-proj",
            )

        # NOTE: the experimental cohort-projection display (UMAP neighbours,
        # cluster gene lists) was dead commented-out code; see git history.

        # Similarity expansion of the selected terms (threshold > 0.8).
        sim_dict, hpo_list_add = get_similar_terms(hpo_list, similarity_terms_dict)
        similar_list = list(set(hpo_list_add) - set(hpo_list))
        similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
        if similar_list_desc:
            with st.expander("See symptoms with similarity > 80%"):
                similar_list_desc_df = pd.DataFrame.from_dict(
                    similar_list_desc, orient="index"
                )
                similar_list_desc_df.columns = ["description"]
                st.write(similar_list_desc_df)
                del similar_list_desc_df
        del similar_list
        del similar_list_desc

        # ---- Matching strategy 1: direct HPO-gene association scores. ----
        st.header("Phenotype matching")
        results_sum = score(hpo_list, data)
        results_sum["matchs"] = results_sum[hpo_list].astype(bool).sum(axis=1)
        results_sum["score"] = results_sum["matchs"] + results_sum["sum"]
        results_sum["rank"] = (
            results_sum["score"].rank(ascending=False, method="max").astype(int)
        )
        # Move the four derived columns (sum/gene_symbol/matchs... ) up front.
        cols = results_sum.columns.tolist()
        cols = cols[-4:] + cols[:-4]
        match = results_sum[cols].sort_values(by=["score"], ascending=False)
        st.dataframe(match[match["score"] > 1.01].drop(columns=["sum"]))
        match_csv = convert_df(match)
        st.download_button(
            "Download matching results",
            match_csv,
            "match.tsv",
            "text/csv",
            key="download-csv-match",
        )

        if gene_diag:
            if int(ncbi[gene_diag]) in results_sum.index:
                # Density of all gene scores with the tested gene highlighted.
                p = (
                    ggplot(match, aes("score"))
                    + geom_density()
                    + geom_vline(
                        xintercept=results_sum.loc[int(ncbi[gene_diag]), "score"],
                        linetype="dashed",
                        color="red",
                        size=1.5,
                    )
                    + ggtitle("Matching score distribution")
                    + xlab("Gene matching score")
                    + ylab("% of genes")
                    + theme_bw()
                    + theme(
                        text=element_text(size=12),
                        figure_size=(5, 5),
                        axis_ticks=element_line(colour="black", size=4),
                        axis_line=element_line(colour="black", size=2),
                        axis_text_x=element_text(angle=45, hjust=1),
                        axis_text_y=element_text(angle=60, hjust=1),
                        subplots_adjust={"wspace": 0.1},
                        legend_position=(0.7, 0.35),
                    )
                )
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.pyplot(ggplot.draw(p))
                st.write(
                    "Gene ID rank:",
                    results_sum.loc[int(ncbi[gene_diag]), "rank"],
                    " | ",
                    "Gene ID count:",
                    round(results_sum.loc[int(ncbi[gene_diag]), "sum"], 4),
                )
                st.write(results_sum.loc[[int(ncbi[gene_diag])]])
                st.write(
                    "Gene ID phenotype specificity:",
                    get_phenotype_specificity(gene_diag, results_sum),
                )
                del p
            else:
                st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
        del results_sum
        del match

        # ---- Matching strategy 2: similarity-expanded HPO list. ----
        st.header("Phenotype matching by similarity of symptoms")
        results_sum_add = score_sim_add(hpo_list_add, data, sim_dict)
        results_sum_add["rank"] = (
            results_sum_add["sum"].rank(ascending=False, method="max").astype(int)
        )
        cols = results_sum_add.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        match_sim = results_sum_add[cols].sort_values(by=["sum"], ascending=False)
        st.dataframe(match_sim[match_sim["sum"] > 0.01])
        match_sim_csv = convert_df(match_sim)
        st.download_button(
            "Download matching results",
            match_sim_csv,
            "match_sim.tsv",
            "text/csv",
            key="download-csv-match-sim",
        )

        if gene_diag:
            if int(ncbi[gene_diag]) in results_sum_add.index:
                p2 = (
                    ggplot(match_sim, aes("sum"))
                    + geom_density()
                    + geom_vline(
                        xintercept=results_sum_add.loc[int(ncbi[gene_diag]), "sum"],
                        linetype="dashed",
                        color="red",
                        size=1.5,
                    )
                    + ggtitle("Matching score distribution")
                    + xlab("Gene matching score")
                    + ylab("% of genes")
                    + theme_bw()
                    + theme(
                        text=element_text(size=12),
                        figure_size=(5, 5),
                        axis_ticks=element_line(colour="black", size=4),
                        axis_line=element_line(colour="black", size=2),
                        axis_text_x=element_text(angle=45, hjust=1),
                        axis_text_y=element_text(angle=60, hjust=1),
                        subplots_adjust={"wspace": 0.1},
                        legend_position=(0.7, 0.35),
                    )
                )
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.pyplot(ggplot.draw(p2))
                st.write(
                    "Gene ID rank:",
                    results_sum_add.loc[int(ncbi[gene_diag]), "rank"],
                    " | ",
                    "Gene ID count:",
                    round(results_sum_add.loc[int(ncbi[gene_diag]), "sum"], 4),
                )
                st.write(
                    "Gene ID phenotype specificity:",
                    get_phenotype_specificity(gene_diag, results_sum_add),
                )
                del p2
            else:
                st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
        del sim_dict
        del hpo_list_add
        del results_sum_add
        del match_sim

        # ---- Matching strategy 3: distance in NMF symptom-group space. ----
        st.header("Phenotype matching by groups of symptoms")
        patient_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: sum((x - patient_nmf) ** 2), axis=1)
        )
        witness_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: sum((x - witness_nmf) ** 2), axis=1)
        )
        del patient_nmf
        del witness
        del witness_nmf
        case_df = pd.DataFrame(patient_df - witness_df)
        case_df.columns = ["score"]
        # Lower distance means better match; flip it so higher is better.
        case_df["score_norm"] = abs(case_df["score"] - case_df["score"].max())
        # case_df["frequency"] = matrix_frequency["variant_number"]
        case_df["sum"] = case_df["score_norm"]  # + case_df["frequency"]
        case_df_sort = case_df.sort_values(by="sum", ascending=False)
        case_df_sort["rank"] = (
            case_df_sort["sum"].rank(ascending=False, method="max").astype(int)
        )
        case_df_sort["gene_symbol"] = case_df_sort.index.to_series().apply(get_symbol)
        match_nmf = case_df_sort[["gene_symbol", "rank", "sum"]]
        st.dataframe(match_nmf[match_nmf["sum"] > 0.01])
        match_nmf_csv = convert_df(match_nmf)
        st.download_button(
            "Download matching results",
            match_nmf_csv,
            "match_groups.tsv",
            "text/csv",
            key="download-csv-match-groups",
        )

        if gene_diag:
            if int(ncbi[gene_diag]) in case_df_sort.index:
                p3 = (
                    ggplot(match_nmf, aes("sum"))
                    + geom_density()
                    + geom_vline(
                        xintercept=case_df_sort.loc[int(ncbi[gene_diag]), "sum"],
                        linetype="dashed",
                        color="red",
                        size=1.5,
                    )
                    + ggtitle("Matching score distribution")
                    + xlab("Gene matching score")
                    + ylab("% of genes")
                    + theme_bw()
                    + theme(
                        text=element_text(size=12),
                        figure_size=(5, 5),
                        axis_ticks=element_line(colour="black", size=4),
                        axis_line=element_line(colour="black", size=2),
                        axis_text_x=element_text(angle=45, hjust=1),
                        axis_text_y=element_text(angle=60, hjust=1),
                        subplots_adjust={"wspace": 0.1},
                        legend_position=(0.7, 0.35),
                    )
                )
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.pyplot(ggplot.draw(p3))
                st.write(
                    "Gene ID rank:",
                    case_df_sort.loc[int(ncbi[gene_diag]), "rank"],
                    " | ",
                    "Gene ID count:",
                    round(case_df_sort.loc[int(ncbi[gene_diag]), "sum"], 4),
                )
                st.write(
                    "Gene ID phenotype specificity:",
                    get_phenotype_specificity(gene_diag, case_df_sort),
                )
                del p3
            else:
                st.write("Gene ID rank:", " Gene not available in PhenoGenius database")
        del case_df_sort
        del match_nmf
        del case_df
    else:
        st.write(
            "No HPO terms provided in correct format.",
        )