Kévin Yauy committed
Commit • 86f91be
1 Parent(s): 3d7977d
fix(dependencies): fix updates
Former-commit-id: 24d597c3e610a89214ed8c2113986c7b1326ff61
- phenogenius_app.py +107 -106
- pyproject.toml +2 -2
phenogenius_app.py
CHANGED
@@ -55,12 +55,12 @@ image_chuga = Image.open("data/img/logo-chuga.png")
 st.sidebar.image(image_chuga, caption=None, width=60)


-@st.
+@st.cache_data(max_entries=50)
 def convert_df(df):
     return df.to_csv(sep="\t").encode("utf-8")


-@st.
+@st.cache_data(max_entries=50)
 def load_data():
     matrix = pd.read_csv(
         "data/resources/ohe_all_thesaurus_weighted.tsv.gz",
@@ -71,7 +71,7 @@ def load_data():
     return matrix


-@st.
+@st.cache_data(max_entries=50)
 def load_umap_cohort():
     matrix = pd.read_csv(
         "data/resources/umap_loc_cohort.tsv",
@@ -81,7 +81,7 @@ def load_umap_cohort():
     return matrix


-@st.
+@st.cache_data( max_entries=50)
 def load_cohort():
     matrix = pd.read_csv(
         "data/resources/cohort_diag.tsv",
@@ -90,8 +90,8 @@ def load_cohort():
     return matrix


-@st.
-    hash_funcs={"Pickle": lambda _: None},
+@st.cache_data(
+    hash_funcs={"Pickle": lambda _: None}, max_entries=50
 )
 def load_nmf_model():
     with open("data/resources/pheno_NMF_390_model_42.pkl", "rb") as pickle_file:
@@ -101,7 +101,7 @@ def load_nmf_model():
     return pheno_NMF, reduced


-@st.
+@st.cache_data(max_entries=50)
 def symbol_to_id_to_dict():
     # from NCBI
     ncbi_df = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
@@ -113,8 +113,8 @@ def symbol_to_id_to_dict():
     return ncbi_to_dict_ncbi, ncbi_to_dict


-@st.
-    hash_funcs={"_json.Scanner": hash},
+@st.cache_data(
+    hash_funcs={"_json.Scanner": hash}, max_entries=50
 )
 def load_hp_ontology():
     with open("data/resources/hpo_obo.json") as json_data:
@@ -122,7 +122,7 @@ def load_hp_ontology():
     return data_dict


-@st.
+@st.cache_data(max_entries=50)
 def hpo_description_to_id():
     data_dict = {}
     for key, value in hp_onto.items():
@@ -130,8 +130,8 @@ def hpo_description_to_id():
     return data_dict


-@st.
-    hash_funcs={"_json.Scanner": hash},
+@st.cache_data(
+    hash_funcs={"_json.Scanner": hash}, max_entries=50
 )
 def load_cluster_data():
     with open("data/resources/cluster_info.json") as json_data:
@@ -139,7 +139,7 @@ def load_cluster_data():
     return data_dict


-@st.
+@st.cache_data(max_entries=50)
 def load_topic_data():
     topic = pd.read_csv(
         "data/resources/main_topics_hpo_390_42_filtered_norm_004.tsv",
@@ -149,8 +149,8 @@ def load_topic_data():
     return topic


-@st.
-    hash_funcs={"_json.Scanner": hash},
+@st.cache_data(
+    hash_funcs={"_json.Scanner": hash}, max_entries=50
 )
 def load_similarity_dict():
     with open("data/resources/similarity_dict_threshold_80.json") as json_data:
@@ -158,15 +158,15 @@ def load_similarity_dict():
     return data_dict


-@st.
-
-)
-def load_projection():
-
-
-
-
-
+# @st.cache_data(
+#     hash_funcs={"Pickle": lambda _: None}, max_entries=50
+# )
+# def load_projection():
+#     with open("data/resources/clustering_model.pkl", "rb") as pickle_file:
+#         cluster = pk.load(pickle_file)
+#     with open("data/resources/umap_projection.pkl", "rb") as pickle_file:
+#         umap = pk.load(pickle_file)
+#     return cluster, umap


 def get_symbol(gene):
@@ -308,8 +308,8 @@ if submit_button:
     hpo = get_hpo_id(hpo_raw)
     data = load_data()
     pheno_NMF, reduced = load_nmf_model()
-    cluster, umap = load_projection()
-    umap_cohort = load_umap_cohort()
+    # cluster, umap = load_projection()
+    # umap_cohort = load_umap_cohort()
     cohort = load_cohort()
     cluster_info = load_cluster_data()
     topic = load_topic_data()
@@ -435,88 +435,89 @@ if submit_button:
         key="download-csv-proj",
     )

-    patient_transposed = sklearn.preprocessing.normalize(
-        np.array(patient_df_info["mean_score"]).reshape(1, -1), norm="l1"
-    )
-    del patient_df_info
-
-    patient_nmf_umap = umap.transform(pd.DataFrame(patient_transposed))
-    del patient_transposed
-
-    with st.expander("See projection in cohort"):
-        umap_cohort["dist"] = abs(umap_cohort["x"] - patient_nmf_umap[0, 0]) + abs(
-            umap_cohort["y"] - patient_nmf_umap[0, 1]
-        )
-        del patient_nmf_umap
-        closest_patient = umap_cohort.nsmallest(3, "dist")
-        st.write("Closest patients in the cohort are: ", closest_patient)
-        st.write("Closest patient: ", cohort.loc[closest_patient.index[0]])
-        st.write(
-            get_hpo_name_list(
-                cohort.loc[closest_patient.index[0]].hpo_list.split(","),
-                hp_onto,
-            )
-        )
-
-        cluster_selected = cluster_info[str(closest_patient["cluster"].values[0])]
-        st.write("Selected cluster: ", closest_patient["cluster"].values[0])
-        st.write("Number of patient in cluster: ", cluster_selected["n_patients"])
-        del closest_patient
-
-        gene_in_cluster = pd.DataFrame.from_dict(
-            dict(Counter(cluster_selected["gene_list"])), orient="index"
-        )
-        gene_in_cluster.columns = ["count"]
-        if gene_diag:
-            if gene_diag in gene_in_cluster.index:
-                st.write("Gene diag in cluster", gene_in_cluster.loc[gene_diag, :])
-
-        st.write(
-            "Gene(s) involved in cluster: ",
-            gene_in_cluster.sort_values("count", ascending=False),
-        )
-        del gene_in_cluster
-
-        group_involved = cluster_selected["group"]
-        if (
-            isinstance(group_involved, float)
-            and math.isnan(float(group_involved)) == False
-        ):
-            topic_involved = topic.loc[topic_involved, :]
-            st.write(
-                "Group(s) of symptoms statistically enriched: ", topic_involved
-            )
-        elif isinstance(group_involved, str):
-            group_list = [int(x) for x in cluster_selected["group"].split(",")]
-            topic_involved = topic.loc[group_list, :]
-            st.write(
-                "Group(s) of symptoms statistically enriched: ", topic_involved
-            )
-        del topic_involved
-        del group_involved
-
-        dict_count_print = {}
-        dict_count = dict(Counter(cluster_selected["hpo_list"]))
-        dict_count_sorted = sorted(
-            dict_count.items(), key=lambda x: x[1], reverse=True
-        )
-        del cluster_selected
-        for element in dict_count_sorted:
-            dict_count_print[element[0]] = {
-                "description": hp_onto[element[0]]["name"],
-                "count": element[1],
-            }
-        st.write(
-            "HPOs declared in cluster:",
-            pd.DataFrame.from_dict(dict_count_print, orient="index"),
-        )
-        del dict_count
-        del dict_count_print
-        del dict_count_sorted
-
+    # patient_transposed = sklearn.preprocessing.normalize(
+    #     np.array(patient_df_info["mean_score"]).reshape(1, -1), norm="l1"
+    # )
+    # del patient_df_info
+    #
+    # patient_nmf_umap = umap.transform(pd.DataFrame(patient_transposed))
+    # del patient_transposed
+    #
+    # with st.expander("See projection in cohort"):
+    #     umap_cohort["dist"] = abs(umap_cohort["x"] - patient_nmf_umap[0, 0]) + abs(
+    #         umap_cohort["y"] - patient_nmf_umap[0, 1]
+    #     )
+    #     del patient_nmf_umap
+    #     closest_patient = umap_cohort.nsmallest(3, "dist")
+    #     st.write("Closest patients in the cohort are: ", closest_patient)
+    #     st.write("Closest patient: ", cohort.loc[closest_patient.index[0]])
+    #     st.write(
+    #         get_hpo_name_list(
+    #             cohort.loc[closest_patient.index[0]].hpo_list.split(","),
+    #             hp_onto,
+    #         )
+    #     )
+    #
+    #     cluster_selected = cluster_info[str(closest_patient["cluster"].values[0])]
+    #     st.write("Selected cluster: ", closest_patient["cluster"].values[0])
+    #     st.write("Number of patient in cluster: ", cluster_selected["n_patients"])
+    #     del closest_patient
+    #
+    #     gene_in_cluster = pd.DataFrame.from_dict(
+    #         dict(Counter(cluster_selected["gene_list"])), orient="index"
+    #     )
+    #     gene_in_cluster.columns = ["count"]
+    #     if gene_diag:
+    #         if gene_diag in gene_in_cluster.index:
+    #             st.write("Gene diag in cluster", gene_in_cluster.loc[gene_diag, :])
+    #
+    #     st.write(
+    #         "Gene(s) involved in cluster: ",
+    #         gene_in_cluster.sort_values("count", ascending=False),
+    #     )
+    #     del gene_in_cluster
+    #
+    #     group_involved = cluster_selected["group"]
+    #     if (
+    #         isinstance(group_involved, float)
+    #         and math.isnan(float(group_involved)) == False
+    #     ):
+    #         topic_involved = topic.loc[topic_involved, :]
+    #         st.write(
+    #             "Group(s) of symptoms statistically enriched: ", topic_involved
+    #         )
+    #     elif isinstance(group_involved, str):
+    #         group_list = [int(x) for x in cluster_selected["group"].split(",")]
+    #         topic_involved = topic.loc[group_list, :]
+    #         st.write(
+    #             "Group(s) of symptoms statistically enriched: ", topic_involved
+    #         )
+    #     del topic_involved
+    #     del group_involved
+    #
+    #     dict_count_print = {}
+    #     dict_count = dict(Counter(cluster_selected["hpo_list"]))
+    #     dict_count_sorted = sorted(
+    #         dict_count.items(), key=lambda x: x[1], reverse=True
+    #     )
+    #     del cluster_selected
+    #     for element in dict_count_sorted:
+    #         dict_count_print[element[0]] = {
+    #             "description": hp_onto[element[0]]["name"],
+    #             "count": element[1],
+    #         }
+    #     st.write(
+    #         "HPOs declared in cluster:",
+    #         pd.DataFrame.from_dict(dict_count_print, orient="index"),
+    #     )
+    #     del dict_count
+    #     del dict_count_print
+    #     del dict_count_sorted
+    #
     sim_dict, hpo_list_add = get_similar_terms(hpo_list, similarity_terms_dict)
     similar_list = list(set(hpo_list_add) - set(hpo_list))
-    similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
+    similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
+
     if similar_list_desc:
         with st.expander("See symptoms with similarity > 80%"):
             similar_list_desc_df = pd.DataFrame.from_dict(
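The recurring edit in phenogenius_app.py swaps each caching decorator for st.cache_data with max_entries=50 on every loader. A minimal sketch of that pattern (the file path and function name below are illustrative, not taken from the app):

import pandas as pd
import streamlit as st


@st.cache_data(max_entries=50)  # keep at most 50 cached results for this function
def load_example_table(path: str) -> pd.DataFrame:
    # Streamlit reruns the body only when the arguments or the function code change;
    # otherwise it returns the cached DataFrame.
    return pd.read_csv(path, sep="\t")

Where the default hasher cannot handle an object involved in a loader (the pickled NMF model, the parsed JSON scanner), the commit additionally passes hash_funcs, as shown in the decorators above.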
pyproject.toml
CHANGED
@@ -5,13 +5,12 @@ description = ""
 authors = ["kevin.yauy <kevin.yauy@seqone.fr>"]

 [tool.poetry.dependencies]
-python = "
+python = "~3.8"
 pandas = ">=1.3.0"
 pandarallel = "^1.6.1"
 scikit-learn = "^1.1.1"
 ujson = "^5.4.0"
 streamlit = "^1.11.1"
-umap-learn = "^0.5.3"
 matplotlib = "3.5"
 seaborn = "^0.11.2"
 plotnine = "^0.9.0"
@@ -19,6 +18,7 @@ obonet = "^0.3.0"
 multipy = "^0.16"
 psrecord = "^1.2"
 numpy = ">=1.20,<1.24"
+umap-learn = "^0.5.4"

 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
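For reference, the Poetry constraints touched here read as follows: python = "~3.8" allows any 3.8.x interpreter (>=3.8,<3.9), and umap-learn = "^0.5.4" expands to >=0.5.4,<0.6.0. A small sketch for checking an installed environment against those ranges, assuming the packaging library is available (it is not declared in this pyproject):

from importlib.metadata import version

from packaging.specifiers import SpecifierSet

# Caret shorthand expanded by hand, following Poetry's rules.
assert version("umap-learn") in SpecifierSet(">=0.5.4,<0.6.0")
assert version("numpy") in SpecifierSet(">=1.20,<1.24")
print("installed dependencies satisfy the declared ranges")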