lingbionlp commited on
Commit
4dc59ae
1 Parent(s): 82ee352

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -116
app.py CHANGED
@@ -1,157 +1,256 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- Created on Tue Nov 22 09:54:41 2022
4
 
5
  @author: luol2
6
  """
7
 
8
-
9
-
10
  import streamlit as st
11
- import argparse
12
- from src.nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
13
  from src.dic_ner import dic_ont
14
  from src.tagging_text import bioTag
15
  import os
16
- import time
17
  import json
18
- import sys
19
- import nltk
20
- nltk.download('punkt')
21
- nltk.download('averaged_perceptron_tagger')
22
- nltk.download('wordnet')
23
 
24
  st.set_page_config(
25
  page_title="PhenoTagger",
26
- page_icon=":shark:",
27
- # layout="wide",
28
- initial_sidebar_state="expanded",
29
  menu_items={
30
- 'Get Help': 'https://www.extremelycoolapp.com/help',
31
- 'Report a bug': "https://www.extremelycoolapp.com/bug",
32
- 'About': "# This is a header. This is an *extremely* cool app!"
33
  }
34
  )
35
- st.title('PhenoTagger Demo')
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
-
41
 
 
42
 
43
- # with st.spinner('Model is being loaded..'):
44
-
45
- # print('load model done!')
46
-
47
 
 
48
 
49
-
 
 
 
 
 
 
 
 
 
 
 
50
  with st.form(key="my_form"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- @st.cache(allow_output_mutation=True)
53
- def load_model():
54
- ontfiles={'dic_file':'./dict_new/noabb_lemma.dic',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  'word_hpo_file':'./dict_new/word_id_map.json',
56
  'hpo_word_file':'./dict_new/id_word_map.json'}
57
 
58
- # if para_set['model_type']=='cnn':
59
- # vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
60
- # 'charfile':'../vocab/char.vocab',
61
- # 'labelfile':'../dict_new/lable.vocab',
62
- # 'posfile':'../vocab/pos.vocab'}
63
- # modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
64
 
65
- # elif para_set['model_type']=='bioformer':
66
- vocabfiles={'labelfile':'./dict_new/lable.vocab',
67
- 'config_path':'./vocab/bioformer-cased-v1.0/bert_config.json',
68
- 'checkpoint_path':'./vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
69
- 'vocab_path':'./vocab/bioformer-cased-v1.0/vocab.txt'}
70
- modelfile='./vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
71
- # else:
72
- # print('Model type is wrong, please select cnn or bioformer.')
73
- # sys.exit()
74
 
75
-
76
- biotag_dic=dic_ont(ontfiles)
77
-
78
- # if para_set['model_type']=='cnn':
79
- # nn_model=bioTag_CNN(vocabfiles)
80
- # nn_model.load_model(modelfile)
81
- # elif para_set['model_type']=='bioformer':
82
- nn_model=bioTag_Bioformer(vocabfiles)
83
- session=nn_model.load_model(modelfile)
84
- test_tag='1232'
85
- return nn_model,biotag_dic,test_tag,session
86
-
87
-
88
- #hyper-parameter
89
- st.sidebar.header("Hyperparameter Settings")
90
- sbform = st.sidebar.form("Hyper-paramiters")
91
- # para_model=sbform.selectbox('Model', ['cnn', 'bioformer'])
92
- para_overlap=sbform.selectbox('Return overlapping concepts', ['True', 'False'])
93
- para_abbr=sbform.selectbox('Identify abbreviations', ['True', 'False'])
94
- para_threshold = sbform.slider('Threshold:', min_value=0.5, max_value=0.95, value=0.95, step=0.05)
95
- sbform.form_submit_button("Setting")
96
-
97
- st.write('parameters:', para_overlap,para_abbr,para_threshold)
98
- nn_model,biotag_dic,test_tag,session=load_model()
99
-
100
-
101
- input_text = st.text_area(
102
- "Paste your text below (max 500 words)",
103
- height=510,
104
- )
105
 
106
- MAX_WORDS = 500
107
- import re
108
- res = len(re.findall(r"\w+", input_text))
109
- if res > MAX_WORDS:
110
- st.warning(
111
- "⚠️ Your text contains "
112
- + str(res)
113
- + " words."
114
- + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
115
  )
116
 
117
- input_text = input_text[:MAX_WORDS]
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- submit_button = st.form_submit_button(label="✨ Get me the data!")
120
-
121
- if para_overlap=='True':
122
- para_overlap=True
123
- else:
124
- para_overlap=False
125
- if para_abbr=='True':
126
- para_abbr=True
127
- else:
128
- para_abbr=False
129
- para_set={
130
- #model_type':para_model, # cnn or bioformer
131
- 'onlyLongest':para_overlap, # False: return overlap concepts, True only longgest
132
- 'abbrRecog':para_abbr,# False: don't identify abbr, True: identify abbr
133
- 'ML_Threshold':para_threshold,# the Threshold of deep learning model
134
- }
135
-
136
-
137
 
138
  if not submit_button:
139
  st.stop()
140
-
141
 
142
- st.markdown(f"""**Results:**\n""")
 
 
 
 
 
 
 
 
 
 
 
 
143
  # print('dic...........:',biotag_dic.keys())
144
- print('........:',test_tag)
145
- print('........!!!!!!:',input_text)
146
- print('...input:',input_text)
147
- tag_result=bioTag(session,input_text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
148
- for ele in tag_result:
149
- start = ele[0]
150
- last = ele[1]
151
- mention = input_text[int(ele[0]):int(ele[1])]
152
- type='Phenotype'
153
- id=ele[2]
154
- score=ele[3]
155
- output=start+"\t"+last+"\t"+mention+"\t"+id+'\t'+score+"\n"
156
- st.info(output)
157
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ Created on Mon Nov 21 16:21:25 2022
4
 
5
  @author: luol2
6
  """
7
 
 
 
8
  import streamlit as st
9
+ from src.nn_model import bioTag_CNN,bioTag_Bioformer
 
10
  from src.dic_ner import dic_ont
11
  from src.tagging_text import bioTag
12
  import os
 
13
  import json
14
+ from pandas import DataFrame
 
 
 
 
15
 
16
  st.set_page_config(
17
  page_title="PhenoTagger",
18
+ page_icon="🎈",
19
+ layout="wide",
 
20
  menu_items={
21
+ 'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
22
+ 'About': "PhenoTagger v1.1"
 
23
  }
24
  )
 
25
 
26
 
27
+ # def _max_width_():
28
+ # max_width_str = f"max-width: 2400px;"
29
+ # st.markdown(
30
+ # f"""
31
+ # <style>
32
+ # .reportview-container .main .block-container{{
33
+ # {max_width_str}
34
+ # }}
35
+ # </style>
36
+ # """,
37
+ # unsafe_allow_html=True,
38
+ # )
39
 
40
 
41
+ # _max_width_()
42
 
43
+ # c30, c31, c32 = st.columns([2.5, 1, 3])
44
 
45
+ # with c30:
46
+ # # st.image("logo.png", width=400)
47
+ st.title("👨‍⚕️ PhenoTagger Demo")
 
48
 
49
+ with st.expander("ℹ️ - About this app", expanded=True):
50
 
51
+ st.write(
52
+ """
53
+ - This app is an easy-to-use interface built in Streamlit for [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library!
54
+ - PhenoTagger is a hybrid method that combines dictionary and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
55
+ - Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
56
+ """
57
+ )
58
+
59
+ st.markdown("")
60
+
61
+ st.markdown("")
62
+ st.markdown("## 📌 Paste document ")
63
  with st.form(key="my_form"):
64
+
65
+
66
+ ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 4, 0.07])
67
+ with c1:
68
+ ModelType = st.radio(
69
+ "Choose your model",
70
+ ["Bioformer(Default)", "CNN"],
71
+ help="Bioformer is more precise, CNN is more efficient",
72
+ )
73
+
74
+ if ModelType == "Bioformer(Default)":
75
+ # kw_model = KeyBERT(model=roberta)
76
+
77
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the HPO dictionary matcher and the Bioformer tagger.

    Cached by Streamlit so the (slow) model load happens once per session,
    not on every widget interaction / rerun.
    """
    # Ontology dictionary resources used by the dictionary-based matcher.
    ont_files = {
        'dic_file': './dict_new/noabb_lemma.dic',
        'word_hpo_file': './dict_new/word_id_map.json',
        'hpo_word_file': './dict_new/id_word_map.json',
    }

    # Bioformer vocabulary/config plus the fine-tuned HPO tagging weights.
    vocab_files = {
        'labelfile': './dict_new/lable.vocab',
        'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
        'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
        'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt',
    }
    model_file = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'

    dictionary = dic_ont(ont_files)
    tagger = bioTag_Bioformer(vocab_files)
    tagger.load_model(model_file)
    return tagger, dictionary

nn_model, biotag_dic = load_model()
98
+
99
+ else:
100
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the HPO dictionary matcher and the CNN tagger.

    Cached by Streamlit so the model is loaded once per session.
    """
    # Ontology dictionary resources used by the dictionary-based matcher.
    ont_files = {
        'dic_file': './dict_new/noabb_lemma.dic',
        'word_hpo_file': './dict_new/word_id_map.json',
        'hpo_word_file': './dict_new/id_word_map.json',
    }

    # CNN vocabularies (word embeddings, chars, labels, POS) and weights.
    vocab_files = {
        'w2vfile': './vocab/bio_embedding_intrinsic.d200',
        'charfile': './vocab/char.vocab',
        'labelfile': './dict_new/lable.vocab',
        'posfile': './vocab/pos.vocab',
    }
    model_file = './models/cnn_p5n5_b128_95_hponew1.h5'

    dictionary = dic_ont(ont_files)
    tagger = bioTag_CNN(vocab_files)
    tagger.load_model(model_file)
    return tagger, dictionary

nn_model, biotag_dic = load_model()
 
121
 
122
# Tagging options shown inside the form; values are read after submission.
para_overlap = st.checkbox(
    "Overlap concept",
    value=True,
    help="Tick this box to identify overlapping concepts",
)
para_abbr = st.checkbox(
    "Abbreviations",  # typo fix: label was "Abbreviaitons"
    value=True,
    help="Tick this box to identify abbreviations",
)

para_threshold = st.slider(
    "Threshold",
    min_value=0.5,
    max_value=0.95,
    value=0.95,
    step=0.05,
    # typo fix: was "Retrun the preditions which socre over the threshold."
    help="Return the predictions whose score is over the threshold.",
)
141
+
 
 
 
 
 
 
 
 
 
 
142
 
143
+
144
+
145
+ with c2:
146
+ doc = st.text_area(
147
+ "Paste your text below",
148
+ height=400,
 
 
 
149
  )
150
 
151
+ # MAX_WORDS = 500
152
+ # import re
153
+ # res = len(re.findall(r"\w+", doc))
154
+ # if res > MAX_WORDS:
155
+ # st.warning(
156
+ # "⚠️ Your text contains "
157
+ # + str(res)
158
+ # + " words."
159
+ # + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
160
+ # )
161
+
162
+ # doc = doc[:MAX_WORDS]
163
+
164
+ submit_button = st.form_submit_button(label="✨ Submit!")
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  if not submit_button:
168
  st.stop()
 
169
 
170
+
171
# Collect the tagging hyper-parameters chosen in the form.
para_set={
    # 'model_type':para_model, # cnn or bioformer
    'onlyLongest':para_overlap,    # False: return overlapping concepts, True: only longest
    'abbrRecog':para_abbr,         # False: don't identify abbreviations, True: identify them
    'ML_Threshold':para_threshold, # score threshold of the deep-learning model
    }
st.markdown("")
st.markdown("## 💡 Tagging results:")
with st.spinner('Wait for tagging...'):
    # bioTag returns a list of [start, end, hpo_id, score] spans over `doc`
    # (offsets arrive as strings — TODO confirm against src/tagging_text).
    tag_result=bioTag(doc,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])

st.markdown('<font style="color: rgb(128, 128, 128);">Move the mouse over the entity to display the HPO id.</font>', unsafe_allow_html=True)

html_results=''
text_results=doc+'\n'
entity_end=0
hpoid_count={}   # HPO id -> number of mentions found in the document
# BUG FIX: the original guard was `len(tag_result)>=0`, which is always true,
# so the `else` fallback below was dead code. Test for a non-empty result
# instead (the rendered output is identical for an empty result either way).
if tag_result:
    for ele in tag_result:
        entity_start=int(ele[0])
        html_results+=doc[entity_end:entity_start]   # untagged text before this entity
        entity_end=int(ele[1])
        entity_id=ele[2]
        # One TSV row per entity: start, end, mention, HPO id, score (2 d.p.).
        text_results+=ele[0]+'\t'+ele[1]+'\t'+doc[entity_start:entity_end]+'\t'+ele[2]+'\t'+format(float(ele[3]),'.2f')+'\n'
        hpoid_count[entity_id]=hpoid_count.get(entity_id,0)+1
        # Highlight the mention; the HPO id shows up as a hover tooltip.
        html_results+='<font style="background-color: rgb(255, 204, 0)'+';" title="'+entity_id+'">'+doc[entity_start:entity_end]+'</font>'
    html_results+=doc[entity_end:]   # trailing text after the last entity
else:
    html_results=doc

st.markdown('<table border="1"><tr><td>'+html_results+'</td></tr></table>', unsafe_allow_html=True)
210
+
211
+
212
# Summary table: one [hpo_id, preferred term name, mention count] row
# per distinct HPO concept found in the document.
data_entity = [
    [hpo_id, biotag_dic.hpo_word[hpo_id][0], hpoid_count[hpo_id]]
    for hpo_id in hpoid_count
]

st.markdown("")
st.markdown("")

# Most frequent concepts first; 1-based index for display.
df = (
    DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
    .sort_values(by="Frequency", ascending=False)
    .reset_index(drop=True)
)
df.index += 1

c1, c2, c3 = st.columns([1, 4, 1])
with c2:
    st.table(df)

# Centered download button for the TSV annotation dump built above.
c1, c2, c3 = st.columns([1, 1, 1])
with c2:
    st.download_button('Download annotations', text_results)
256
+