lingbionlp committed on
Commit
ae5152f
1 Parent(s): 0d793c3

Upload 23 files

app.py ADDED
@@ -0,0 +1,154 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Nov 22 09:54:41 2022
4
+
5
+ @author: luol2
6
+ """
7
+
8
+
9
+
10
+ import streamlit as st
11
+ import argparse
12
+ from src.nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
13
+ from src.dic_ner import dic_ont
14
+ from src.tagging_text import bioTag
15
+ import os
16
+ import time
17
+ import json
18
+ import sys
19
+
20
+
21
+ st.set_page_config(
22
+ page_title="PhenoTagger",
23
+ page_icon=":shark:",
24
+ # layout="wide",
25
+ initial_sidebar_state="expanded",
26
+ menu_items={
27
+ 'Get Help': 'https://www.extremelycoolapp.com/help',
28
+ 'Report a bug': "https://www.extremelycoolapp.com/bug",
29
+ 'About': "# This is a header. This is an *extremely* cool app!"
30
+ }
31
+ )
32
+ st.title('PhenoTagger Demo')
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+ # with st.spinner('Model is being loaded..'):
41
+
42
+ # print('load model done!')
43
+
44
+
45
+
46
+
47
+ with st.form(key="my_form"):
48
+
49
+ @st.cache(allow_output_mutation=True)
50
+ def load_model():
51
+ ontfiles={'dic_file':'./dict_new/noabb_lemma.dic',
52
+ 'word_hpo_file':'./dict_new/word_id_map.json',
53
+ 'hpo_word_file':'./dict_new/id_word_map.json'}
54
+
55
+ # if para_set['model_type']=='cnn':
56
+ # vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
57
+ # 'charfile':'../vocab/char.vocab',
58
+ # 'labelfile':'../dict_new/lable.vocab',
59
+ # 'posfile':'../vocab/pos.vocab'}
60
+ # modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
61
+
62
+ # elif para_set['model_type']=='bioformer':
63
+ vocabfiles={'labelfile':'./dict_new/lable.vocab',
64
+ 'config_path':'./vocab/bioformer-cased-v1.0/bert_config.json',
65
+ 'checkpoint_path':'./vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
66
+ 'vocab_path':'./vocab/bioformer-cased-v1.0/vocab.txt'}
67
+ modelfile='./vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
68
+ # else:
69
+ # print('Model type is wrong, please select cnn or bioformer.')
70
+ # sys.exit()
71
+
72
+
73
+ biotag_dic=dic_ont(ontfiles)
74
+
75
+ # if para_set['model_type']=='cnn':
76
+ # nn_model=bioTag_CNN(vocabfiles)
77
+ # nn_model.load_model(modelfile)
78
+ # elif para_set['model_type']=='bioformer':
79
+ nn_model=bioTag_Bioformer(vocabfiles)
80
+ session=nn_model.load_model(modelfile)
81
+ test_tag='1232'
82
+ return nn_model,biotag_dic,test_tag,session
83
+
84
+
85
+ #hyper-parameter
86
+ st.sidebar.header("Hyperparameter Settings")
87
+ sbform = st.sidebar.form("Hyper-parameters")
88
+ # para_model=sbform.selectbox('Model', ['cnn', 'bioformer'])
89
+ para_overlap=sbform.selectbox('Return overlapping concepts', ['True', 'False'])
90
+ para_abbr=sbform.selectbox('Identify abbreviations', ['True', 'False'])
91
+ para_threshold = sbform.slider('Threshold:', min_value=0.5, max_value=0.95, value=0.95, step=0.05)
92
+ sbform.form_submit_button("Apply settings")
93
+
94
+ st.write('parameters:', para_overlap,para_abbr,para_threshold)
95
+ nn_model,biotag_dic,test_tag,session=load_model()
96
+
97
+
98
+ input_text = st.text_area(
99
+ "Paste your text below (max 500 words)",
100
+ height=510,
101
+ )
102
+
103
+ MAX_WORDS = 500
104
+ import re
105
+ res = len(re.findall(r"\w+", input_text))
106
+ if res > MAX_WORDS:
107
+ st.warning(
108
+ "⚠️ Your text contains "
109
+ + str(res)
110
+ + " words."
111
+ + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
112
+ )
113
+
114
+ input_text = ' '.join(input_text.split()[:MAX_WORDS]) # keep only the first MAX_WORDS words
115
+
116
+ submit_button = st.form_submit_button(label="✨ Get me the data!")
117
+
118
+ if para_overlap=='True':
119
+ para_overlap=True
120
+ else:
121
+ para_overlap=False
122
+ if para_abbr=='True':
123
+ para_abbr=True
124
+ else:
125
+ para_abbr=False
126
+ para_set={
127
+ #model_type':para_model, # cnn or bioformer
128
+ 'onlyLongest': not para_overlap, # True: keep only the longest concept; False: also return overlapping concepts (matches the 'Return overlapping concepts' option)
129
+ 'abbrRecog':para_abbr,# False: don't identify abbr, True: identify abbr
130
+ 'ML_Threshold':para_threshold,# the Threshold of deep learning model
131
+ }
132
+
133
+
134
+
135
+ if not submit_button:
136
+ st.stop()
137
+
138
+
139
+ st.markdown(f"""**Results:**\n""")
140
+ # print('dic...........:',biotag_dic.keys())
141
+ print('........:',test_tag)
142
+ print('........!!!!!!:',input_text)
143
+ print('...input:',input_text)
144
+ tag_result=bioTag(session,input_text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
145
+ for ele in tag_result:
146
+ start = ele[0]
147
+ last = ele[1]
148
+ mention = input_text[int(ele[0]):int(ele[1])]
149
+ type='Phenotype'
150
+ id=ele[2]
151
+ score=ele[3]
152
+ output=start+"\t"+last+"\t"+mention+"\t"+id+'\t'+score+"\n"
153
+ st.info(output)
154
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ TensorFlow==2.3.0
2
+ Keras==2.4.3
3
+ nltk==3.5
4
+ keras-bert==0.86.0
5
+ bioc==1.3.4
6
+ streamlit==1.13.0
src/__pycache__/Transformer_keras.cpython-37.pyc ADDED
Binary file (3.77 kB).
 
src/__pycache__/abbre_resolution.cpython-37.pyc ADDED
Binary file (9.4 kB).
 
src/__pycache__/combine_result.cpython-37.pyc ADDED
Binary file (2.32 kB).
 
src/__pycache__/dic_ner.cpython-37.pyc ADDED
Binary file (5.35 kB).
 
src/__pycache__/ml_ner.cpython-37.pyc ADDED
Binary file (12 kB).
 
src/__pycache__/nn_model.cpython-37.pyc ADDED
Binary file (5.28 kB).
 
src/__pycache__/nn_represent.cpython-37.pyc ADDED
Binary file (7.6 kB).
 
src/__pycache__/post_processing.cpython-37.pyc ADDED
Binary file (1.13 kB).
 
src/__pycache__/restore_index.cpython-37.pyc ADDED
Binary file (2.33 kB).
 
src/__pycache__/ssplit_tokenzier.cpython-37.pyc ADDED
Binary file (1.46 kB).
 
src/__pycache__/tagging_text.cpython-37.pyc ADDED
Binary file (1.63 kB).
 
src/abbre_resolution.py ADDED
@@ -0,0 +1,434 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Aug 11 16:52:40 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import logging
9
+ import regex
10
+ import sys
11
+ import io
12
+
13
+ """
14
+ A Python 3 refactoring of Vincent Van Asch's Python 2 code at
15
+
16
+ http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py
17
+
18
+ Based on
19
+
20
+ A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
21
+ A. Schwartz and M. Hearst
22
+ Biocomputing, 2003, pp 451-462.
23
+
24
+ """
25
+
26
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
27
+ log = logging.getLogger('Abbre')
28
+
29
+
30
+ class Candidate(str):
31
+ def __init__(self, value):
32
+ super().__init__()
33
+ self.start = 0
34
+ self.stop = 0
35
+
36
+ def set_position(self, start, stop):
37
+ self.start = start
38
+ self.stop = stop
39
+
40
+
41
+ def yield_lines_from_file(file_path):
42
+ with open(file_path, 'rb') as f:
43
+ for line in f:
44
+ try:
45
+ line = line.decode('utf-8')
46
+ except UnicodeDecodeError:
47
+ line = line.decode('latin-1').encode('utf-8').decode('utf-8')
48
+ line = line.strip()
49
+ yield line
50
+ f.close()
51
+
52
+
53
+ def yield_lines_from_doc(doc_text):
54
+ for line in doc_text.split("\n"):
55
+ yield line.strip()
56
+
57
+
58
+ def best_candidates(sentence):
59
+ """
60
+ :param sentence: line read from input file
61
+ :return: a Candidate iterator
62
+ """
63
+
64
+ if '(' in sentence:
65
+ # Check some things first
66
+ if sentence.count('(') != sentence.count(')'):
67
+ raise ValueError("Unbalanced parentheses: {}".format(sentence))
68
+
69
+ if sentence.find('(') > sentence.find(')'):
70
+ raise ValueError("First parentheses is right: {}".format(sentence))
71
+
72
+ closeindex = -1
73
+ while 1:
74
+ # Look for open parenthesis
75
+ openindex = sentence.find('(', closeindex + 1)
76
+
77
+ if openindex == -1: break
78
+
79
+ # Look for closing parentheses
80
+ closeindex = openindex + 1
81
+ open = 1
82
+ skip = False
83
+ while open:
84
+ try:
85
+ char = sentence[closeindex]
86
+ except IndexError:
87
+ # We found an opening bracket but no associated closing bracket
88
+ # Skip the opening bracket
89
+ skip = True
90
+ break
91
+ if char == '(':
92
+ open += 1
93
+ elif char in [')', ';', ':']:
94
+ open -= 1
95
+ closeindex += 1
96
+
97
+ if skip:
98
+ closeindex = openindex + 1
99
+ continue
100
+
101
+ # Output if conditions are met
102
+ start = openindex + 1
103
+ stop = closeindex - 1
104
+ candidate = sentence[start:stop]
105
+
106
+ # Take into account whitespace that should be removed
107
+ start = start + len(candidate) - len(candidate.lstrip())
108
+ stop = stop - len(candidate) + len(candidate.rstrip())
109
+ candidate = sentence[start:stop]
110
+
111
+ if conditions(candidate):
112
+ new_candidate = Candidate(candidate)
113
+ new_candidate.set_position(start, stop)
114
+ yield new_candidate
115
+
116
+
117
+ def conditions(candidate):
118
+ """
119
+ Based on Schwartz&Hearst
120
+
121
+ 2 <= len(str) <= 10
122
+ len(tokens) <= 2
123
+ re.search('\p{L}', str)
124
+ str[0].isalnum()
125
+
126
+ and extra:
127
+ if it matches (\p{L}\.?\s?){2,}
128
+ it is a good candidate.
129
+
130
+ :param candidate: candidate abbreviation
131
+ :return: True if this is a good candidate
132
+ """
133
+ viable = True
134
+ if regex.match(r'(\p{L}\.?\s?){2,}', candidate.lstrip()):
135
+ viable = True
136
+ if len(candidate) < 2 or len(candidate) > 10:
137
+ viable = False
138
+ if len(candidate.split()) > 2:
139
+ viable = False
140
+ if not regex.search(r'\p{L}', candidate):
141
+ viable = False
142
+ if not candidate[0].isalnum():
143
+ viable = False
144
+
145
+ return viable
146
+
147
+
148
+ def get_definition(candidate, sentence):
149
+ """
150
+ Takes a candidate and a sentence and returns the definition candidate.
151
+
152
+ The definition candidate is the set of tokens (in front of the candidate)
153
+ that starts with a token starting with the first character of the candidate
154
+
155
+ :param candidate: candidate abbreviation
156
+ :param sentence: current sentence (single line from input file)
157
+ :return: candidate definition for this abbreviation
158
+ """
159
+ # Take the tokens in front of the candidate
160
+ tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())
161
+ #print(tokens)
162
+ # the char that we are looking for
163
+ key = candidate[0].lower()
164
+
165
+ # Count the number of tokens that start with the same character as the candidate
166
+ # print(tokens)
167
+ firstchars = [t[0] for t in tokens]
168
+ # print(firstchars)
169
+ definition_freq = firstchars.count(key)
170
+ candidate_freq = candidate.lower().count(key)
171
+
172
+ # Look for the list of tokens in front of candidate that
173
+ # have a sufficient number of tokens starting with key
174
+ if candidate_freq <= definition_freq:
175
+ # we should at least have a good number of starts
176
+ count = 0
177
+ start = 0
178
+ startindex = len(firstchars) - 1
179
+
180
+ while count < candidate_freq:
181
+ if abs(start) > len(firstchars):
182
+ raise ValueError("candidate {} not found".format(candidate))
183
+ start -= 1
184
+ # Look up key in the definition
185
+ try:
186
+ startindex = firstchars.index(key, len(firstchars) + start)
187
+ except ValueError:
188
+ pass
189
+
190
+ # Count the number of keys in definition
191
+ count = firstchars[startindex:].count(key)
192
+
193
+ # We found enough keys in the definition so return the definition as a definition candidate
194
+ start = len(' '.join(tokens[:startindex]))
195
+ stop = candidate.start - 1
196
+ candidate = sentence[start:stop]
197
+
198
+ # Remove whitespace
199
+ start = start + len(candidate) - len(candidate.lstrip())
200
+ stop = stop - len(candidate) + len(candidate.rstrip())
201
+ candidate = sentence[start:stop]
202
+
203
+ new_candidate = Candidate(candidate)
204
+ new_candidate.set_position(start, stop)
205
+ #print('new_candidate:')
206
+ #print(new_candidate,start,stop)
207
+ return new_candidate
208
+
209
+ else:
210
+ raise ValueError('There are fewer keys in the tokens in front of the candidate than there are in the candidate')
211
+
212
+
213
+ def select_definition(definition, abbrev):
214
+ """
215
+ Takes a definition candidate and an abbreviation candidate
216
+ and returns True if the chars in the abbreviation occur in the definition
217
+
218
+ Based on
219
+ A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst
220
+ :param definition: candidate definition
221
+ :param abbrev: candidate abbreviation
222
+ :return:
223
+ """
224
+
225
+
226
+ if len(definition) < len(abbrev):
227
+ raise ValueError('Abbreviation is longer than definition')
228
+
229
+ if abbrev in definition.split():
230
+ raise ValueError('Abbreviation is full word of definition')
231
+
232
+ sindex = -1
233
+ lindex = -1
234
+
235
+ while 1:
236
+ try:
237
+ longchar = definition[lindex].lower()
238
+ except IndexError:
239
+ raise
240
+
241
+ shortchar = abbrev[sindex].lower()
242
+
243
+ if not shortchar.isalnum():
244
+ sindex -= 1
245
+
246
+ if sindex == -1 * len(abbrev):
247
+ if shortchar == longchar:
248
+ if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
249
+ break
250
+ else:
251
+ lindex -= 1
252
+ else:
253
+ lindex -= 1
254
+ if lindex == -1 * (len(definition) + 1):
255
+ raise ValueError("abbreviation {} was not found in definition {}".format(abbrev, definition))
256
+
257
+ else:
258
+ if shortchar == longchar:
259
+ sindex -= 1
260
+ lindex -= 1
261
+ else:
262
+ lindex -= 1
263
+ # print('lindex:',lindex,len(definition),definition[lindex:len(definition)])
264
+ new_candidate = Candidate(definition[lindex:len(definition)])
265
+ new_candidate.set_position(definition.start+lindex+len(definition), definition.stop)
266
+ definition = new_candidate
267
+
268
+ tokens = len(definition.split())
269
+ length = len(abbrev)
270
+
271
+ if tokens > min([length + 5, length * 2]):
272
+ raise ValueError("did not meet min(|A|+5, |A|*2) constraint")
273
+
274
+ # Do not return definitions that contain unbalanced parentheses
275
+ if definition.count('(') != definition.count(')'):
276
+ raise ValueError("Unbalanced parentheses not allowed in a definition")
277
+ # print('select:')
278
+ # print(definition,definition.start, definition.stop)
279
+ new_definition_dict={'definition':definition,'start':definition.start,'stop':definition.stop}
280
+ return new_definition_dict
281
+
282
+
283
+ def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
284
+ abbrev_map = []
285
+ omit = 0
286
+ written = 0
287
+ if file_path:
288
+ sentence_iterator = enumerate(yield_lines_from_file(file_path))
289
+ elif doc_text:
290
+ sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
291
+ else:
292
+ return abbrev_map
293
+
294
+ for i, sentence in sentence_iterator:
295
+ #print(sentence)
296
+ try:
297
+ for candidate in best_candidates(sentence):
298
+ #print(candidate)
299
+ try:
300
+ #print('begin get definition')
301
+ definition = get_definition(candidate, sentence)
302
+ #print('get_definition:')
303
+ #print(definition)
304
+
305
+ except (ValueError, IndexError) as e:
306
+ #log.debug("{} Omitting candidate {}. Reason: {}".format(i, candidate, e.args[0]))
307
+ omit += 1
308
+ else:
309
+ try:
310
+ definition_dict = select_definition(definition, candidate)
311
+ except (ValueError, IndexError) as e:
312
+ #log.debug("{} Omitting definition {} for candidate {}. Reason: {}".format(i, definition_dict, candidate, e.args[0]))
313
+ omit += 1
314
+ else:
315
+ definition_dict['abbre']=candidate
316
+ abbrev_map.append(definition_dict)
317
+ written += 1
318
+ except (ValueError, IndexError) as e:
319
+ log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
320
+ log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
321
+ return abbrev_map
322
+
323
+ def postprocess_abbr(ner_result,ori_text):
324
+
325
+ final_result={}
326
+ if len(ner_result)==0:
327
+ return []
328
+ # abbr recognition
329
+ abbr_result=extract_abbreviation_definition_pairs(doc_text=ori_text)
330
+
331
+ # read ner results
332
+ nor_loc_list={} #{entity_name_location:entity_information}
333
+
334
+ for ele in ner_result:
335
+ nor_loc_list[str(ele[0])+' '+str(ele[1])]=ele
336
+ final_result['\t'.join(ele)]=[int(ele[0]),int(ele[1])]
337
+
338
+ #abbr matching
339
+ for abbr in abbr_result:
340
+ abbr_index=str(abbr['start'])+' '+str(abbr['stop'])
341
+ if abbr_index in nor_loc_list.keys():
342
+
343
+ line=ori_text
344
+ abbr_text=abbr['abbre']
345
+ abbr_eid=0
346
+ while line.find(abbr_text)>=0:
347
+ abbr_sid=line.find(abbr_text)+abbr_eid
348
+ abbr_eid=abbr_sid+len(abbr_text)
349
+ # print(abbr_sid,abbr_eid)
350
+ if abbr_sid>0 and abbr_eid<len(ori_text):
351
+ if ori_text[abbr_sid-1].isalnum()==False and ori_text[abbr_eid].isalnum()==False:
352
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
353
+ elif abbr_sid==0 and abbr_eid<len(ori_text):
354
+ if ori_text[abbr_eid].isalnum()==False:
355
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
356
+ elif abbr_sid>0 and abbr_eid==len(ori_text):
357
+ if ori_text[abbr_sid-1].isalnum()==False :
358
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
359
+ line=ori_text[abbr_eid:]
360
+ # print(final_result)
361
+ sorted_final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
362
+ final_result=[]
363
+ for ele in sorted_final_result:
364
+ final_result.append(ele[0].split('\t'))
365
+ return final_result
366
+
367
+ def ner_abbr(ner_result,abbr_result,ori_text):
368
+ # read ner results
369
+ nor_name_list={} #{entity_name:entity_information}
370
+ nor_loc_list={} #{entity_name_location:entity_information}
371
+ final_result={} #{entity_information:location} use to sort
372
+ for ele in ner_result:
373
+ temp_seg=ele.split('\t')
374
+ nor_loc_list[temp_seg[0]+' '+temp_seg[1]]=temp_seg
375
+ nor_name_list[temp_seg[2].lower()]=temp_seg
376
+ final_result['\t'.join(temp_seg[0:4])]=[int(temp_seg[0]),int(temp_seg[1])]
377
+
378
+ #abbr matching
379
+ for abbr in abbr_result:
380
+ abbr_index=str(abbr['start'])+' '+str(abbr['stop'])
381
+ if abbr_index in nor_loc_list.keys():
382
+
383
+ line=ori_text
384
+ abbr_text=abbr['abbre']
385
+ abbr_eid=0
386
+ while line.find(abbr_text)>=0:
387
+ abbr_sid=line.find(abbr_text)+abbr_eid
388
+ abbr_eid=abbr_sid+len(abbr_text)
389
+ # print(abbr_sid,abbr_eid)
390
+ if abbr_sid>0 and abbr_eid<len(ori_text):
391
+ if ori_text[abbr_sid-1].isalnum()==False and ori_text[abbr_eid].isalnum()==False:
392
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
393
+ elif abbr_sid==0 and abbr_eid<len(ori_text):
394
+ if ori_text[abbr_eid].isalnum()==False:
395
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
396
+ elif abbr_sid>0 and abbr_eid==len(ori_text):
397
+ if ori_text[abbr_sid-1].isalnum()==False :
398
+ final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
399
+ line=ori_text[abbr_eid:]
400
+ # print(final_result)
401
+ final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
402
+
403
+ return final_result
404
+
405
+
406
+
407
+
408
+ if __name__ == '__main__':
409
+ path='//panfs/pan1/bionlp/lulab/luoling/HPO_project/diseaseTag/data/test/results/'
410
+ fin=open(path+'NCBI_test_phecr_95.tsv','r',encoding='utf-8')
411
+ context=fin.read().strip().split('\n\n')
412
+ fin.close()
413
+ fout=open(path+'NCBI_test_phecr_abbre_95.tsv','w',encoding='utf-8')
414
+ for doc in context:
415
+ lines=doc.split('\n')
416
+ ori_text=lines[1]
417
+ # print(ori_text)
418
+ fout.write(lines[0]+'\n'+lines[1]+'\n')
419
+ if len(lines)>2:
420
+ abbr_result=extract_abbreviation_definition_pairs(doc_text=ori_text)
421
+ print(abbr_result)
422
+ abbr_out=ner_abbr(lines[2:],abbr_result,ori_text)
423
+ else:
424
+ abbr_out=[]
425
+ # print('final:',abbr_out)
426
+ for ele in abbr_out:
427
+ fout.write(ele[0]+'\n')
428
+ fout.write('\n')
429
+ # sys.exit()
430
+ fout.close()
431
+ #last_out=combine_ml_dict_fn(abbr_out,infile)
432
+ #print(last_out)
433
+
434
+
src/combine_result.py ADDED
@@ -0,0 +1,102 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Jun 15 11:24:45 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import io
9
+ def nest_overlap_entity(nest_list):
10
+ temp_result_list={}
11
+ for i in range(0, len(nest_list)):
12
+ hpoid=nest_list[i][3]
13
+ if hpoid not in temp_result_list.keys():
14
+ temp_result_list[hpoid]=nest_list[i]
15
+ else:
16
+ score=float(nest_list[i][4])
17
+ old_score=float(temp_result_list[hpoid][4])
18
+ if score>old_score: # retain the higher-scoring concept
19
+ temp_result_list[hpoid]=nest_list[i]
20
+ new_list=[]
21
+ for hpoid in temp_result_list.keys():
22
+ new_list.append(temp_result_list[hpoid])
23
+
24
+ temp_result_list={} #same index, different ids
25
+ for i in range(0, len(new_list)):
26
+ ids=new_list[i][0]+' '+new_list[i][1]
27
+ if ids not in temp_result_list.keys():
28
+ temp_result_list[ids]=new_list[i]
29
+ else:
30
+ score=float(new_list[i][4]) # score of the current new_list candidate
31
+ old_score=float(temp_result_list[ids][4])
32
+ if score>old_score:
33
+ temp_result_list[ids]=new_list[i]
34
+ final_list=[]
35
+ for ids in temp_result_list.keys():
36
+ final_list.append(temp_result_list[ids])
37
+ return final_list
38
+ def combine_ml_dict(dict_tsv,ml_tsv,nest=True):
39
+ fin_dic=io.StringIO(dict_tsv)
40
+ fin_ml=io.StringIO(ml_tsv)
41
+ fout=io.StringIO()
42
+ all_dic=fin_dic.read().strip().split('\n\n')
43
+ all_ml=fin_ml.read().strip().split('\n\n')
44
+ fin_dic.close()
45
+ fin_ml.close()
46
+
47
+ for i in range(0,len(all_dic)):
48
+ lines_dic=all_dic[i].split('\n')
49
+ lines_ml=all_ml[i].split('\n')
50
+ entity_list={}
51
+ for j in range(1,len(lines_dic)):
52
+ seg=lines_dic[j].split('\t')
53
+ entity_list[lines_dic[j]]=[int(seg[0]),int(seg[1])] #dict results score 1.00
54
+ for j in range(1,len(lines_ml)):
55
+ seg=lines_ml[j].split('\t')
56
+ entity_list[lines_ml[j]]=[int(seg[0]),int(seg[1])]
57
+
58
+ entity_list=sorted(entity_list.items(), key=lambda kv:(kv[1]), reverse=False)
59
+ entity_list_sort=[]
60
+ for ele in entity_list:
61
+ entity_list_sort.append(ele[0])
62
+
63
+ final_entity=[]
64
+ if len(entity_list_sort)!=0:
65
+ first_entity=entity_list_sort[0].split('\t')
66
+ nest_list=[first_entity]
67
+ max_eid=int(first_entity[1])
68
+
69
+ for i in range(1,len(entity_list_sort)):
70
+ segs=entity_list_sort[i].split('\t')
71
+ if int(segs[0])> max_eid:
72
+ if len(nest_list)==1:
73
+ final_entity.append(nest_list[0])
74
+ nest_list=[]
75
+ nest_list.append(segs)
76
+ if int(segs[1])>max_eid:
77
+ max_eid=int(segs[1])
78
+ else:
79
+ tem=nest_overlap_entity(nest_list)
80
+ final_entity.extend(tem)
81
+ nest_list=[]
82
+ nest_list.append(segs)
83
+ if int(segs[1])>max_eid:
84
+ max_eid=int(segs[1])
85
+ else:
86
+ nest_list.append(segs)
87
+ if int(segs[1])>max_eid:
88
+ max_eid=int(segs[1])
89
+ if nest_list!=[]:
90
+ if len(nest_list)==1:
91
+ final_entity.append(nest_list[0])
92
+
93
+ else:
94
+ tem=nest_overlap_entity(nest_list)#find max entity
95
+ final_entity.extend(tem)
96
+
97
+ fout.write(lines_ml[0]+'\n')
98
+ for ele in final_entity:
99
+ fout.write('\t'.join(ele)+'\n')
100
+ fout.write('\n')
101
+ return fout.getvalue()
102
+
src/dic_ner.py ADDED
@@ -0,0 +1,164 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 15:05:00 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import sys
8
+ import json
9
+ import io
10
+ from src.ssplit_tokenzier import ssplit_token_pos_lemma
11
+ class Trie(object):
12
+ class Node(object):
13
+ def __init__(self):
14
+ self.term = None
15
+ self.next = {}
16
+
17
+ def __init__(self, terms=[]):
18
+ self.root = Trie.Node()
19
+ for term in terms:
20
+ self.add(term)
21
+
22
+ def add(self, term):
23
+ node = self.root
24
+ for char in term:
25
+ if not char in node.next:
26
+ node.next[char] = Trie.Node()
27
+ node = node.next[char]
28
+ node.term = term
29
+
30
+ def match(self, query):
31
+ results = []
32
+ for i in range(len(query)):
33
+ node = self.root
34
+ for j in range(i, len(query)):
35
+ node = node.next.get(query[j])
36
+ if not node:
37
+ break
38
+ if node.term:
39
+ results.append((i, len(node.term)))
40
+ return results
41
+
42
+ def __repr__(self):
43
+ output = []
44
+ def _debug(output, char, node, depth=0):
45
+ output.append('%s[%s][%s]' % (' '*depth, char, node.term))
46
+ for (key, n) in node.next.items():
47
+ _debug(output, key, n, depth+1)
48
+ _debug(output, '', self.root)
49
+ return '\n'.join(output)
50
+
51
+ class dic_ont():
52
+
53
+ def __init__(self, ont_files):
54
+
55
+ dicin=open(ont_files['dic_file'],'r',encoding='utf-8')
56
+ win_size=50000
57
+ Dic=[]
58
+ print("loading dict!")
59
+ for line in dicin:
60
+ line=line.strip()
61
+ if len(line.split())<=win_size:
62
+ words=line.split()
63
+ for i in range(len(words)):
64
+ if len(words[i])>3 and (not words[i].isupper()):
65
+ words[i]=words[i].lower()
66
+ line=' '.join(words[0:])
67
+ Dic.append(line.strip())
68
+ print("Dic_len:",len(Dic))
69
+ dicin.close()
70
+
71
+ self.dic_trie = Trie(Dic)
72
+ print("load dic done!")
73
+
74
+ #load word hpo mapping
75
+ fin_map=open(ont_files['word_hpo_file'],'r',encoding='utf-8')
76
+ self.word_hpo=json.load(fin_map)
77
+ fin_map.close()
78
+
79
+ #load hpo word mapping
80
+ fin_map=open(ont_files['hpo_word_file'],'r',encoding='utf-8')
81
+ self.hpo_word=json.load(fin_map)
82
+ fin_map.close()
83
+
84
+ def matching(self, source):
85
+
86
+ fin=io.StringIO(source)
87
+ fout=io.StringIO()
88
+
89
+ sent_list=[]
90
+ sent = []
91
+ sent_ori_list=[]
92
+ sent_ori=[]
93
+
94
+ for line in fin:
95
+ line=line.strip()
96
+ if line=="":
97
+ sent_list.append(sent)
98
+ sent_ori_list.append(sent_ori)
99
+ sent=[]
100
+ sent_ori=[]
101
+ else:
102
+ words=line.split('\t')
103
+ words[1]=words[1].lower()
104
+ sent.append(words[1]) # word lemma
105
+ sent_ori.append(words[0])
106
+ sent=[]
107
+ fin.close()
108
+
109
+ for k in range(len(sent_list)):
110
+ sent = sent_list[k]
111
+ sentence=' '.join(sent[0:])+" "
112
+ sentence_ori=' '.join(sent_ori_list[k])
113
+ # print('sentence:',sentence)
114
+ result=self.dic_trie.match(sentence)
115
+ # print('result:',result)
116
+ new_result=[]
117
+ for i in range(0,len(result)):
118
+ if result[i][0]==0 and sentence[result[i][1]]==" ":
119
+ new_result.append([result[i][0],result[i][0]+result[i][1]])
120
+ elif result[i][0]>0 and sentence[result[i][0]-1]==' ' and sentence[result[i][0]+result[i][1]]==' ':
121
+ new_result.append([result[i][0],result[i][0]+result[i][1]])
122
+ # print('new result:',new_result)
123
+
124
+
125
+
126
+ if len(new_result)==0:
127
+ fout.write(sentence_ori+'\n\n')
128
+
129
+ else:
130
+ fout.write(sentence_ori+'\n')
131
+ for ele in new_result:
132
+ entity_text=sentence[ele[0]:ele[1]]
133
+ if entity_text in self.word_hpo.keys():
134
+ hpoid=self.word_hpo[entity_text]
135
+ else:
136
+ print('no id:', entity_text)
137
+ hpoid=['None']
138
+ if ele[0]==0:
139
+ sid="0"
140
+ else:
141
+ temp_sent=sentence[0:ele[0]]
142
+ sid=str(len(temp_sent.rstrip().split(' ')))
143
+ temp_sent=sentence[0:ele[1]]
144
+ eid=str(len(temp_sent.rstrip().split(' '))-1)
145
+ # print(sid,eid,entity_text,hpoid[0])
146
+ fout.write(sid+'\t'+eid+'\t'+entity_text+'\t'+";".join(hpoid)+'\t1.00\n')
147
+ fout.write('\n')
148
+
149
+ return fout.getvalue()
150
+
151
+
152
+ if __name__=='__main__':
153
+
154
+ ontfiles={'dic_file':'//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/dict/hpo_noabb_lemma.dic',
155
+ 'word_hpo_file':'//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/dict/word_hpoid_map.json',
156
+ 'hpo_word_file':'//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/dict/hpoid_word_map.json'}
157
+ biotag_dic=dic_ont(ontfiles)
158
+ text='Nevoid basal cell carcinoma syndrome (NBCCS) is a hereditary condition transmitted as an autosomal dominant trait with complete penetrance and variable expressivity. The syndrome is characterised by numerous basal cell carcinomas (BCCs), odontogenic keratocysts of the jaws, palmar and/or plantar pits, skeletal abnormalities and intracranial calcifications. In this paper, the clinical features of 37 Italian patients are reviewed. Jaw cysts and calcification of falx cerebri were the most frequently observed anomalies, followed by BCCs and palmar/plantar pits. Similar to the case of African Americans, the relatively low frequency of BCCs in the Italian population is probably due to protective skin pigmentation. A future search based on mutation screening might establish a possible genotype phenotype correlation in Italian patients.'
159
+ ssplit_token=ssplit_token_pos_lemma(text)
160
+ # print(ssplit_token)
161
+ dic_result=biotag_dic.matching(ssplit_token)
162
+ print(dic_result)
163
+
164
+
src/ml_ner.py ADDED
@@ -0,0 +1,587 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 16:41:54 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import io
9
+ import time
10
+ import numpy as np
11
+ from keras import backend as K
12
+ def ml_intext(infile):
13
+ fin=open(infile,'r',encoding='utf-8')
14
+ alltexts=fin.read().strip().split('\n\n')
15
+ fin.close()
16
+ data_list=[]
17
+ label_list=[]
18
+ for sents in alltexts:
19
+ lines=sents.split('\n')
20
+ temp_sentece=[]
21
+ label=lines[0].split('\t')[0]
22
+ label_list.append(label)
23
+ for i in range(1,len(lines)):
24
+ seg=lines[i].split('\t')
25
+ temp_sentece.append(seg)
26
+ data_list.append(temp_sentece)
27
+ return data_list,label_list
28
+ def ml_intext_fn(ml_input):
29
+ fin=io.StringIO(ml_input)
30
+ alltexts=fin.read().strip().split('\n\n')
31
+ fin.close()
32
+ data_list=[]
33
+ label_list=[]
34
+ for sents in alltexts:
35
+ lines=sents.split('\n')
36
+ temp_sentece=[]
37
+ label=lines[0].split('\t')[0]
38
+ label_list.append(label)
39
+ for i in range(1,len(lines)):
40
+ seg=lines[i].split('\t')
41
+ temp_sentece.append(seg)
42
+ data_list.append(temp_sentece)
43
+ return data_list,label_list
44
+ def pun_filter(temp_entity):
45
+ pun_list=[',','.','!',';',':','?','(',')','[',']','{','}']
46
+ filter_flag=0
47
+ for ele in temp_entity:
48
+ if ele in pun_list:
49
+ filter_flag=1
50
+ break
51
+ return filter_flag
52
+ def pos_filter(temp_pos,temp_entity):
53
+ pos_list_l=['PRP']
54
+ pos_list=['IN','DT','CC','O','MD','EX','POS','WDT','WP','WP$','WRB','TO','PRP$']
55
+ verb_word=['is','are','was','were','had','have','has','be','been','also']
56
+ filter_flag=0
57
+
58
+ if (temp_entity[0] in verb_word) or (temp_entity[-1] in verb_word):
59
+ filter_flag=1
60
+ if (temp_pos[0] in pos_list) or (temp_pos[-1] in pos_list) or (temp_pos[0] in pos_list_l):
61
+ filter_flag=1
62
+ return filter_flag
63
+
64
+ def build_ngram_testset_filted(conll_input,Ngram=8):
65
+
66
+ fin_genia=io.StringIO(conll_input)
67
+ fout_context=io.StringIO()
68
+ fout_txt=io.StringIO()
69
+
70
+ index_dict={}
71
+ allentity=[]
72
+ alltext=fin_genia.read().strip().split('\n\n')
73
+ fin_genia.close()
74
+ num_total=0
75
+ for i in range(0,len(alltext)):
76
+
77
+ lines=alltext[i].split('\n')
78
+ ori_txt=[]
79
+ for ele in lines:
80
+ seg=ele.split('\t')
81
+ ori_txt.append(seg[0])
82
+ fout_txt.write(' '.join(ori_txt)+'\n')
83
+
84
+ if Ngram>len(lines):
85
+ Ngram=len(lines)
86
+
87
+ fout_context_list=[]
88
+ temp_entity=[]
89
+ temp_pos=[]
90
+ for ngram in range(2,Ngram+1):
91
+ if ngram==1:
92
+ for j in range(0, len(lines)):
93
+ sid=0
94
+ eid=0
95
+ for m in range(0,len(lines)):
96
+ if m==j:
97
+ sid=m
98
+ eid=m
99
+ fout_context_list.append(lines[m]+'\tO\tB')
100
+ temp_seg=lines[m].split('\t')
101
+ temp_entity.append(temp_seg[0])
102
+ temp_pos.append(temp_seg[3])
103
+ else:
104
+ pass
105
+ # print(sentence[m])
106
+ # fout_context_list.append(lines[m]+'\tO\tO')
107
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
108
+ num_total+=1
109
+ if ' '.join(temp_entity) not in allentity:
110
+ allentity.append(' '.join(temp_entity))
111
+ fout_context.write('HP:None\t'+' '.join(temp_entity)+'\n')
112
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
113
+ index_dict[str(num_total)]=[i,sid,eid]
114
+ temp_entity=[]
115
+ temp_pos=[]
116
+ fout_context_list=[]
117
+ elif ngram==2:
118
+ for j in range(0, len(lines)-1):
119
+ sid=0
120
+ eid=0
121
+ for m in range(0,len(lines)):
122
+ if m==j:
123
+ fout_context_list.append(lines[m]+'\tO\tB')
124
+ sid=m
125
+ temp_seg=lines[m].split('\t')
126
+ temp_entity.append(temp_seg[0])
127
+ temp_pos.append(temp_seg[3])
128
+ elif m==j+1:
129
+ fout_context_list.append(lines[m]+'\tO\tB')
130
+ eid=m
131
+ temp_seg=lines[m].split('\t')
132
+ temp_entity.append(temp_seg[0])
133
+ temp_pos.append(temp_seg[3])
134
+ else:
135
+ pass
136
+ # fout_context_list.append(lines[m]+'\tO\tO')
137
+
138
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
139
+ num_total+=1
140
+ if ' '.join(temp_entity) not in allentity:
141
+ allentity.append(' '.join(temp_entity))
142
+ fout_context.write('HP:None\t'+' '.join(temp_entity)+'\n')
143
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
144
+ index_dict[str(num_total)]=[i,sid,eid]
145
+ temp_entity=[]
146
+ temp_pos=[]
147
+ fout_context_list=[]
148
+ else :
149
+ for j in range(0, len(lines)-ngram+1):
150
+ sid=0
151
+ eid=0
152
+ for m in range(0,len(lines)):
153
+ if m==j:
154
+ fout_context_list.append(lines[m]+'\tO\tB')
155
+ sid=m
156
+ temp_seg=lines[m].split('\t')
157
+ temp_entity.append(temp_seg[0])
158
+ temp_pos.append(temp_seg[3])
159
+ elif m>j and m<j+ngram-1:
160
+ fout_context_list.append(lines[m]+'\tO\tB')
161
+ temp_seg=lines[m].split('\t')
162
+ temp_entity.append(temp_seg[0])
163
+ temp_pos.append(temp_seg[2])
164
+ elif m==j+ngram-1:
165
+ fout_context_list.append(lines[m]+'\tO\tB')
166
+ eid=m
167
+ temp_seg=lines[m].split('\t')
168
+ temp_entity.append(temp_seg[0])
169
+ temp_pos.append(temp_seg[3])
170
+ else:
171
+ pass
172
+ # fout_context_list.append(lines[m]+'\tO\tO')
173
+
174
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
175
+ num_total+=1
176
+ if ' '.join(temp_entity) not in allentity:
177
+ allentity.append(' '.join(temp_entity))
178
+ fout_context.write('HP:None\t'+' '.join(temp_entity)+'\n')
179
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
180
+ index_dict[str(num_total)]=[i,sid,eid]
181
+
182
+ temp_entity=[]
183
+ temp_pos=[]
184
+ fout_context_list=[]
185
+
186
+ return fout_context.getvalue(),fout_txt.getvalue(),index_dict
187
+
188
+ def build_all_ngram_testset_filted(conll_input,Ngram=8):
189
+
190
+ fin_genia=io.StringIO(conll_input)
191
+ fout_context=io.StringIO()
192
+ fout_txt=io.StringIO()
193
+
194
+ index_dict={}
195
+ allentity=[]
196
+ alltext=fin_genia.read().strip().split('\n\n')
197
+ fin_genia.close()
198
+ num_total=0
199
+ for i in range(0,len(alltext)):
200
+
201
+ lines=alltext[i].split('\n')
202
+ ori_txt=[]
203
+ for ele in lines:
204
+ seg=ele.split('\t')
205
+ ori_txt.append(seg[0])
206
+ fout_txt.write(' '.join(ori_txt)+'\n')
207
+
208
+ if Ngram>len(lines):
209
+ Ngram=len(lines)
210
+
211
+ fout_context_list=[]
212
+ temp_entity=[]
213
+ temp_pos=[]
214
+ for ngram in range(1,Ngram+1):
215
+ if ngram==1:
216
+ for j in range(0, len(lines)):
217
+ sid=0
218
+ eid=0
219
+ for m in range(0,len(lines)):
220
+ if m==j:
221
+ sid=m
222
+ eid=m
223
+ fout_context_list.append(lines[m]+'\tO\tB')
224
+ temp_seg=lines[m].split('\t')
225
+ temp_entity.append(temp_seg[0])
226
+ temp_pos.append(temp_seg[3])
227
+ else:
228
+ pass
229
+ # print(sentence[m])
230
+ # fout_context_list.append(lines[m]+'\tO\tO')
231
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
232
+ num_total+=1
233
+ if ' '.join(temp_entity) not in allentity:
234
+ allentity.append(' '.join(temp_entity))
235
+ fout_context.write('HP:None\t'+' '.join(temp_entity)+'\n')
236
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
237
+ index_dict[str(num_total)]=[i,sid,eid]
238
+ temp_entity=[]
239
+ temp_pos=[]
240
+ fout_context_list=[]
241
+ elif ngram==2:
242
+ for j in range(0, len(lines)-1):
243
+ sid=0
244
+ eid=0
245
+ for m in range(0,len(lines)):
246
+ if m==j:
247
+ fout_context_list.append(lines[m]+'\tO\tB')
248
+ sid=m
249
+ temp_seg=lines[m].split('\t')
250
+ temp_entity.append(temp_seg[0])
251
+ temp_pos.append(temp_seg[3])
252
+ elif m==j+1:
253
+ fout_context_list.append(lines[m]+'\tO\tB')
254
+ eid=m
255
+ temp_seg=lines[m].split('\t')
256
+ temp_entity.append(temp_seg[0])
257
+ temp_pos.append(temp_seg[3])
258
+ else:
259
+ pass
260
+ # fout_context_list.append(lines[m]+'\tO\tO')
261
+
262
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
263
+ num_total+=1
264
+ if ' '.join(temp_entity) not in allentity:
265
+ allentity.append(' '.join(temp_entity))
266
+ fout_context.write('HP:None\t'+' '.join(temp_entity)+'\n')
267
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
268
+ index_dict[str(num_total)]=[i,sid,eid]
269
+ temp_entity=[]
270
+ temp_pos=[]
271
+ fout_context_list=[]
272
+ else :
273
+ for j in range(0, len(lines)-ngram+1):
274
+ sid=0
275
+ eid=0
276
+ for m in range(0,len(lines)):
277
+ if m==j:
278
+ fout_context_list.append(lines[m]+'\tO\tB')
279
+ sid=m
280
+ temp_seg=lines[m].split('\t')
281
+ temp_entity.append(temp_seg[0])
282
+ temp_pos.append(temp_seg[3])
283
+ elif m>j and m<j+ngram-1:
284
+ fout_context_list.append(lines[m]+'\tO\tB')
285
+ temp_seg=lines[m].split('\t')
286
+ temp_entity.append(temp_seg[0])
287
+ temp_pos.append(temp_seg[2])
288
+ elif m==j+ngram-1:
289
+ fout_context_list.append(lines[m]+'\tO\tB')
290
+ eid=m
291
+ temp_seg=lines[m].split('\t')
292
+ temp_entity.append(temp_seg[0])
293
+ temp_pos.append(temp_seg[3])
294
+ else:
295
+ pass
296
+ # fout_context_list.append(lines[m]+'\tO\tO')
297
+
298
+ if pun_filter(temp_entity)==0 and pos_filter(temp_pos,temp_entity)==0:
299
+ num_total+=1
300
+ if ' '.join(temp_entity) not in allentity:
301
+ allentity.append(' '.join(temp_entity))
302
+ fout_context.write('HP:None\t'+' '.join(temp_entity)+'\n')
303
+ fout_context.write('\n'.join(fout_context_list)+'\n\n')
304
+ index_dict[str(num_total)]=[i,sid,eid]
305
+
306
+ temp_entity=[]
307
+ temp_pos=[]
308
+ fout_context_list=[]
309
+
310
+ return fout_context.getvalue(),fout_txt.getvalue(),index_dict
311
+
312
+ def output_result(result,label_2_index,Top_N=5):
313
+
314
+ fout=io.StringIO()
315
+ hpo_label={}
316
+
317
+ for key in label_2_index.keys():
318
+ hpo_label[label_2_index[key]]=key
319
+
320
+
321
+ for line in result:
322
+ #Top_index=line.argsort()[-1*Top_N:][::-1]
323
+ index_top_unsort=np.argpartition(line,-Top_N)[-Top_N:]
324
+ values_top=line[index_top_unsort]
325
+ Top_index=index_top_unsort[np.argsort(-values_top)]
326
+ temp_list=[]
327
+ for max_index in Top_index:
328
+ hpo_id=hpo_label[max_index]
329
+ hpo_id_value=round(line[max_index],5)
330
+ temp_list.append(str(hpo_id)+'|'+str(hpo_id_value))
331
+ fout.write('\t'.join(temp_list)+'\n')
332
+
333
+ return fout.getvalue()
334
+
335
+ def decode_tsv(test_score, ml_input_index, ml_input_txt, T=0.8):
336
+
337
+ fin_predict=io.StringIO(test_score)
338
+ fin_text=io.StringIO(ml_input_txt)
339
+ fout=io.StringIO()
340
+
341
+ test_txt=fin_text.read().strip().split('\n')
342
+ test_index=ml_input_index
343
+ test_pre=fin_predict.read().strip().split('\n')
344
+
345
+ fin_text.close()
346
+ fin_predict.close()
347
+
348
+ sent_result={}
349
+ for i in range(0,len(test_pre)):
350
+ seg_pre=test_pre[i].split('\t')[0].split('|')
351
+ #print(seg_pre,T)
352
+ if float(seg_pre[1])>T and seg_pre[0]!='HP:None':
353
+ term_id=str(i+1)
354
+ pre_result=[test_index[term_id][1],test_index[term_id][2],seg_pre[0],seg_pre[1]]
355
+ sent_id=str(test_index[term_id][0])
356
+ if sent_id not in sent_result.keys():
357
+ sent_result[sent_id]=[pre_result]
358
+ else:
359
+ sent_result[sent_id].append(pre_result)
360
+
361
+ for i in range(0,len(test_txt)):
362
+ fout.write(test_txt[i]+'\n')
363
+ if str(i) in sent_result.keys():
364
+ temp_result={}
365
+ for ele in sent_result[str(i)]:
366
+ temp_line=str(ele[0])+'\t'+str(ele[1])+'\t'+' '.join(test_txt[i].split()[ele[0]:ele[1]+1])+'\t'+ele[2]+'\t'+ele[3]
367
+ temp_result[temp_line]=[ele[0],ele[1]]
368
+ if len(temp_result)>=1:
369
+ temp_result=sorted(temp_result.items(), key=lambda d: (d[1][0],d[1][1]), reverse=False)
370
+ for ent in temp_result:
371
+ fout.write(ent[0]+'\n')
372
+ fout.write('\n')
373
+
374
+ return fout.getvalue()
375
+
376
+ def score_filter(temp_entity, T=0.1):
377
+
378
+ result_list=[]
379
+ for i in range(0,len(temp_entity)):
380
+ if float (temp_entity[i][-1])>T:
381
+ result_list.append(temp_entity[i])
382
+ return(result_list)
383
+ def find_max_entity_nest(nest_list):
384
+ temp_result_list={}
385
+ for i in range(0, len(nest_list)):
386
+ hpoid=nest_list[i][-2]
387
+ score=float(nest_list[i][-1])
388
+ if hpoid not in temp_result_list.keys():
389
+ temp_result_list[hpoid]=nest_list[i]
390
+ else:
391
+ if score>float(temp_result_list[hpoid][-1]):
392
+ temp_result_list[hpoid]=nest_list[i]
393
+ new_list=[]
394
+ for hpoid in temp_result_list.keys():
395
+ new_list.append(temp_result_list[hpoid])
396
+ return new_list
397
+ def duplicate_filter(temp_entity):
398
+ result_list=[]
399
+ if len(temp_entity)>1:
400
+ first_entity=temp_entity[0]
401
+ nest_list=[first_entity]
402
+ max_eid=int(first_entity[1])
403
+
404
+ for i in range(1,len(temp_entity)):
405
+ segs=temp_entity[i]
406
+ if int(segs[0])> max_eid:
407
+ if len(nest_list)==1:
408
+ result_list.append(nest_list[0])
409
+ nest_list=[segs]
410
+ if int(segs[1])>max_eid:
411
+ max_eid=int(segs[1])
412
+ else:
413
+ result_list.extend(find_max_entity_nest(nest_list))
414
+ nest_list=[segs]
415
+
416
+ if int(segs[1])>max_eid:
417
+ max_eid=int(segs[1])
418
+
419
+ else:
420
+ nest_list.append(segs)
421
+ if int(segs[1])>max_eid:
422
+ max_eid=int(segs[1])
423
+ if nest_list!=[]:
424
+ if len(nest_list)==1:
425
+ result_list.append(nest_list[0])
426
+
427
+ else:
428
+ result_list.extend(find_max_entity_nest(nest_list))
429
+ else:
430
+ result_list=temp_entity
431
+ return result_list
432
+ def combine_strategy(test_decode_temp, T=0.8):
433
+ fin=io.StringIO(test_decode_temp)
434
+ fout=io.StringIO()
435
+
436
+ documents=fin.read().strip().split('\n\n')
437
+ fin.close()
438
+
439
+ for doc in documents:
440
+ lines=doc.split('\n')
441
+ context=lines[0]
442
+ final_entity_list=[]
443
+ if len(lines)>1:
444
+ # all entity candidates
445
+ temp_entity=[]
446
+ for i in range(1,len(lines)):
447
+ temp_entity.append(lines[i].split('\t'))
448
+ #print('all entity candidates: ',len(temp_entity))
449
+
450
+ # filter out candidates whose score is below the threshold T
451
+ filter1=score_filter(temp_entity,T)
452
+ # print('filter1:', len(filter1))
453
+ filter2=duplicate_filter(filter1)
454
+ #print('filter2:', filter2)
455
+ final_entity_list=filter2
456
+
457
+ fout.write(context+'\n')
458
+ for ele in final_entity_list:
459
+ fout.write('\t'.join(ele)+'\n')
460
+ fout.write('\n')
461
+
462
+ return fout.getvalue()
463
+
464
+
465
+ def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
466
+ if nn_model.model_type=='cnn':
467
+ #startTime=time.time()
468
+ test_set,test_label = ml_intext_fn(ml_input)
469
+ test_x, test_y = nn_model.rep.represent_instances_all_feas(test_set,test_label,word_max_len=nn_model.hyper['sen_max'],char_max_len=nn_model.hyper['word_max'])
470
+ input_test = []
471
+
472
+ if nn_model.fea_dict['word'] == 1:
473
+ input_test.append(test_x[0])
474
+
475
+ if nn_model.fea_dict['char'] == 1:
476
+ input_test.append(test_x[1])
477
+
478
+ if nn_model.fea_dict['lemma'] == 1:
479
+ input_test.append(test_x[2])
480
+
481
+ if nn_model.fea_dict['pos'] == 1:
482
+ input_test.append(test_x[3])
483
+ # print('ml-model-represent:',time.time()-startTime)
484
+ # startTime=time.time()
485
+ K.set_session(session)
486
+ test_pre = nn_model.model.predict(input_test)
487
+ # print('ml-model-predict:',time.time()-startTime)
488
+
489
+ elif nn_model.model_type=='bert' or nn_model.model_type=='bioformer':
490
+ #startTime=time.time()
491
+ test_set,test_label = ml_intext_fn(ml_input)
492
+ test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
493
+ #print('ml-model-represent:',time.time()-startTime)
494
+ #startTime=time.time()
495
+ #K.set_session(session)
496
+ #with session.as_default():
497
+ #with session.graph.as_default():
498
+ #print('......session')
499
+ test_pre = nn_model.model.predict(test_x)
500
+ #print('ml-model-modedpred:',time.time()-startTime)
501
+ # startTime=time.time()
502
+ test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
503
+ # print('ml-model-output:',time.time()-startTime)
504
+ #print('test_score:',test_score)
505
+ # startTime=time.time()
506
+ test_decode_temp=decode_tsv(test_score, ml_input_index, ml_input_txt, T=Threshold)
507
+ # print('ml-model-decode:',time.time()-startTime)
508
+ #print('decode_temp:\n',test_decode_temp)
509
+ # test_pre_tsv=combine_strategy(test_decode_temp,T=Threshold)
510
+ return test_decode_temp
511
+
512
+ def model_predict_old(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
513
+ if nn_model.model_type=='cnn':
514
+
515
+ test_set,test_label = ml_intext_fn(ml_input)
516
+ test_x, test_y = nn_model.rep.represent_instances_all_feas(test_set,test_label,word_max_len=nn_model.hyper['sen_max'],char_max_len=nn_model.hyper['word_max'])
517
+ input_test = []
518
+
519
+ if nn_model.fea_dict['word'] == 1:
520
+ input_test.append(test_x[0])
521
+
522
+ if nn_model.fea_dict['char'] == 1:
523
+ input_test.append(test_x[1])
524
+
525
+ if nn_model.fea_dict['lemma'] == 1:
526
+ input_test.append(test_x[2])
527
+
528
+ if nn_model.fea_dict['pos'] == 1:
529
+ input_test.append(test_x[3])
530
+ K.set_session(nn_model.session)
531
+ with nn_model.session.as_default():
532
+ with nn_model.session.graph.as_default():
533
+ test_pre = nn_model.model.predict(input_test,batch_size=256)
534
+
535
+ elif nn_model.model_type=='bert' or nn_model.model_type=='bioformer':
536
+
537
+ test_set,test_label = ml_intext_fn(ml_input)
538
+ test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
539
+ K.set_session(nn_model.session)
540
+ with nn_model.session.as_default():
541
+ with nn_model.session.graph.as_default():
542
+ test_pre = nn_model.model.predict(test_x,batch_size=128)
543
+
544
+ test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
545
+ #print('test_score:',test_score)
546
+ test_decode_temp=decode_tsv(test_score, ml_input_index, ml_input_txt, T=0.0)
547
+ #print('decode_temp:\n',test_decode_temp)
548
+ test_pre_tsv=combine_strategy(test_decode_temp,T=Threshold)
549
+ return test_pre_tsv
550
+
551
+ def output_txt(ml_input_txt):
552
+ fin_text=io.StringIO(ml_input_txt)
553
+ fout=io.StringIO()
554
+
555
+ test_txt=fin_text.read().strip().split('\n')
556
+
557
+ fin_text.close()
558
+
559
+ for i in range(0,len(test_txt)):
560
+ fout.write(test_txt[i]+'\n')
561
+ fout.write('\n')
562
+
563
+ return fout.getvalue()
564
+
565
+ def ml_tagging(session,ssplit_token,ml_model,Threshold):
566
+ # startTime=time.time()
567
+ ml_input, ml_input_txt,ml_input_index=build_ngram_testset_filted(ssplit_token)
568
+ # print('ml-ngrambuild:',time.time()-startTime)
569
+ #print('ml_input:')
570
+ #print(ml_input)
571
+ # startTime=time.time()
572
+ if len(ml_input_index)>0:
573
+ ml_pre_tsv=model_predict(session,ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
574
+ else:
575
+ ml_pre_tsv=output_txt(ml_input_txt)
576
+ # print('ml-modelpred:',time.time()-startTime)
577
+ return ml_pre_tsv
578
+
579
+ def ml_tagging_allngram(ssplit_token,ml_model,Threshold):
580
+ ml_input, ml_input_txt,ml_input_index=build_all_ngram_testset_filted(ssplit_token)
581
+ #print('ml_input:')
582
+ #print(ml_input)
583
+ if len(ml_input_index)>0:
584
+ ml_pre_tsv=model_predict_old(ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
585
+ else:
586
+ ml_pre_tsv=output_txt(ml_input_txt)
587
+ return ml_pre_tsv
src/nn_model.py ADDED
@@ -0,0 +1,162 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Thu Mar 26 09:04:13 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import time
9
+ import sys
10
+ import numpy as np
11
+ import tensorflow as tf
12
+ import keras
13
+ from src.nn_represent import CNN_RepresentationLayer,BERT_RepresentationLayer
14
+ from keras.layers import *
15
+ from keras.models import Model
16
+ from keras import backend as K
17
+ from keras_bert import load_trained_model_from_checkpoint
18
+
19
+
20
+
21
+
22
+ class bioTag_CNN():
23
+ def __init__(self, model_files):
24
+ self.model_type='cnn'
25
+ model_test_type='cnn'
26
+ self.fea_dict = {'word': 1,
27
+ 'char': 1,
28
+ 'lemma':0,
29
+ 'pos':0}
30
+
31
+ self.hyper = {'sen_max' :20,
32
+ 'word_max' :40,
33
+ 'charvec_size' :50,
34
+ 'pos_size' :50}
35
+
36
+ self.w2vfile=model_files['w2vfile']
37
+ self.charfile=model_files['charfile']
38
+ self.labelfile=model_files['labelfile']
39
+ self.posfile=model_files['posfile']
40
+ self.session = K.get_session()
41
+ vocab={'char':self.charfile,'label':self.labelfile,'pos':self.posfile}
42
+ print('loading w2v model.....')
43
+ self.rep = CNN_RepresentationLayer(self.w2vfile,vocab_file=vocab, frequency=400000)
44
+
45
+ print('building model......')
46
+ all_fea = []
47
+ fea_list = []
48
+
49
+ if self.fea_dict['word'] == 1:
50
+ word_input = Input(shape=(self.hyper['sen_max'],), dtype='int32', name='word_input')
51
+ all_fea.append(word_input)
52
+ word_fea = Embedding(self.rep.vec_table.shape[0], self.rep.vec_table.shape[1], weights=[self.rep.vec_table], trainable=True,mask_zero=False, input_length=self.hyper['sen_max'], name='word_emd')(word_input)
53
+ fea_list.append(word_fea)
54
+
55
+ if self.fea_dict['char'] == 1:
56
+ char_input = Input(shape=(self.hyper['sen_max'],self.hyper['word_max']), dtype='int32', name='char_input')
57
+ all_fea.append(char_input)
58
+ char_fea = TimeDistributed(Embedding(self.rep.char_table_size, self.hyper['charvec_size'], trainable=True,mask_zero=False), name='char_emd')(char_input)
59
+ char_fea = TimeDistributed(Conv1D(self.hyper['charvec_size']*2, 3, padding='same',activation='relu'), name="char_cnn")(char_fea)
60
+ char_fea_max = TimeDistributed(GlobalMaxPooling1D(), name="char_pooling_max")(char_fea)
61
+ fea_list.append(char_fea_max)
62
+
63
+ if self.fea_dict['lemma'] == 1:
64
+ lemma_input = Input(shape=(self.hyper['sen_max'],), dtype='int32', name='lemma_input')
65
+ all_fea.append(lemma_input)
66
+ lemma_fea = Embedding(self.rep.vec_table.shape[0], self.rep.vec_table.shape[1], weights=[self.rep.vec_table], trainable=True,mask_zero=False, input_length=self.hyper['sen_max'], name='lemma_emd')(lemma_input)
67
+ fea_list.append(lemma_fea)
68
+
69
+ if self.fea_dict['pos'] == 1:
70
+ pos_input = Input(shape=(self.hyper['sen_max'],), dtype='int32', name='pos_input')
71
+ all_fea.append(pos_input)
72
+ pos_fea = Embedding(self.rep.pos_table_size, self.hyper['pos_size'], trainable=True,mask_zero=False, input_length=self.hyper['sen_max'], name='pos_emd')(pos_input)
73
+ fea_list.append(pos_fea)
74
+
75
+ if len(fea_list) == 1:
76
+ concate_vec = fea_list[0]
77
+ else:
78
+ concate_vec = Concatenate()(fea_list)
79
+
80
+ concate_vec = Dropout(0.4)(concate_vec)
81
+
82
+ # model
83
+ if model_test_type=='cnn':
84
+ cnn = Conv1D(1024, 1, padding='valid', activation='relu',name='cnn1')(concate_vec)
85
+ cnn = GlobalMaxPooling1D()(cnn)
86
+ elif model_test_type=='lstm':
87
+ bilstm = Bidirectional(LSTM(200, return_sequences=True, implementation=2, dropout=0.4, recurrent_dropout=0.4), name='bilstm1')(concate_vec)
88
+ cnn = GlobalMaxPooling1D()(bilstm)
89
+
90
+
91
+ dense = Dense(1024, activation='relu')(cnn)
92
+ dense= Dropout(0.4)(dense)
93
+ output = Dense(self.rep.label_table_size, activation='softmax')(dense)
94
+ self.model = Model(inputs=all_fea, outputs=output)
95
+ def load_model(self,model_file):
96
+ self.model.load_weights(model_file)
97
+ self.session = K.get_session()
98
+ print(self.session)
99
+ #self.model.summary()
100
+ print('load cnn model done!')
101
+
102
+ class bioTag_BERT():
103
+ def __init__(self, model_files):
104
+ self.model_type='bert'
105
+ self.maxlen = 64
106
+ config_path = model_files['config_path']
107
+ checkpoint_path = model_files['checkpoint_path']
108
+ vocab_path = model_files['vocab_path']
109
+ self.label_file=model_files['labelfile']
110
+ self.session = tf.Session()
111
+
112
+ self.rep = BERT_RepresentationLayer( vocab_path, self.label_file)
113
+
114
+
115
+ bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=False, trainable=True,seq_len=self.maxlen)
116
+
117
+ x1_in = Input(shape=(None,))
118
+ x2_in = Input(shape=(None,))
119
+ x = bert_model([x1_in, x2_in])
120
+ x = Lambda(lambda x: x[:, 0])(x)
121
+ outputs = Dense(self.rep.label_table_size, activation='softmax')(x)
122
+
123
+ self.model = Model(inputs=[x1_in,x2_in], outputs=outputs)
124
+
125
+ def load_model(self,model_file):
126
+ self.model.load_weights(model_file)
127
+ self.session = K.get_session()
128
+ print(self.session)
129
+ #self.model.summary()
130
+
131
+ class bioTag_Bioformer():
132
+ def __init__(self, model_files):
133
+ self.model_type='bioformer'
134
+ self.maxlen = 32
135
+ config_path = model_files['config_path']
136
+ checkpoint_path = model_files['checkpoint_path']
137
+ vocab_path = model_files['vocab_path']
138
+ self.label_file=model_files['labelfile']
139
+
140
+ self.rep = BERT_RepresentationLayer( vocab_path, self.label_file)
141
+
142
+
143
+ bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=False, trainable=True,seq_len=self.maxlen)
144
+
145
+ x1_in = Input(shape=(None,))
146
+ x2_in = Input(shape=(None,))
147
+ x = bert_model([x1_in, x2_in])
148
+ x = Lambda(lambda x: x[:, 0])(x)
149
+ outputs = Dense(self.rep.label_table_size, activation='softmax')(x)
150
+
151
+ self.model = Model(inputs=[x1_in,x2_in], outputs=outputs)
152
+
153
+ def load_model(self,model_file):
154
+ self.model.load_weights(model_file)
155
+ #self.model._make_predict_function()
156
+ #session = K.get_session()
157
+ #print(session)
158
+ #self.model.summary()
159
+ session=''
160
+ print('load bioformer model done!')
161
+ return session
162
+
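Editorial note (not part of the uploaded file): all three wrappers above end in a softmax layer over the label vocabulary, so inference is a plain Keras predict call on the encoded inputs. A minimal sketch, assuming `tagger` is an already-loaded bioTag_Bioformer instance and `x1_np`/`x2_np` are the token-id and segment-id arrays produced by BERT_RepresentationLayer.load_data:

import numpy as np

def top_label(tagger, x1_np, x2_np):
    # probs has shape (n_candidates, label_table_size); each row is a softmax
    # distribution over the HPO label vocabulary.
    probs = tagger.model.predict([x1_np, x2_np])
    best = np.argmax(probs, axis=-1)   # index of the most likely label per candidate
    conf = np.max(probs, axis=-1)      # its probability, later compared with the threshold
    return best, conf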
src/nn_represent.py ADDED
@@ -0,0 +1,338 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 10:02:20 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import time
8
+ import os, sys
9
+ import numpy as np
10
+ from keras.preprocessing.sequence import pad_sequences
11
+ from keras_bert import Tokenizer
12
+
13
+
14
+ class CNN_RepresentationLayer(object):
15
+
16
+
17
+ def __init__(self, wordvec_file, vocab_file={},\
18
+ vec_size=50, word_size=10000, frequency=10000):
19
+
20
+ '''
21
+ wordvec_file : the path of the word embedding file
22
+ vec_size : the dimension of the word vectors
23
+ learned by the word2vec tool
24
+
25
+ word_size : the size of word vocabulary
26
+
27
+ frequency : the threshold on the number of words kept according to
28
+ how frequently they appear in the text;
29
+ for example, when frequency is 10000, the 10000 most
30
+ frequent words are considered
31
+
32
+ '''
33
+ #load word embedding
34
+ file = open(wordvec_file)
35
+ first_line = file.readline().strip()
36
+ file.close()
37
+ self.word_size = int(first_line.split()[0])
38
+ self.vec_size = int(first_line.split()[1])
39
+ self.frequency = frequency
40
+
41
+ if self.frequency>self.word_size:
42
+ self.vec_table = np.zeros((self.word_size + 2, self.vec_size))
43
+ else:
44
+ self.vec_table = np.zeros((self.frequency + 2, self.vec_size))
45
+ self.word_2_index = {}
46
+ self.load_wordvecs(wordvec_file)
47
+
48
+ #other fea
49
+ self.char_2_index={}
50
+ self.char_table_size=0
51
+ if 'char' in vocab_file.keys():
52
+ self.load_fea_vocab(vocab_file['char'],self.char_2_index)
53
+ self.char_table_size=len(self.char_2_index)
54
+ #print(self.char_table_size)
55
+ #print(self.char_2_index)
56
+
57
+ self.label_2_index={}
58
+ self.label_table_size=0
59
+ if 'label' in vocab_file.keys():
60
+ self.load_label_vocab(vocab_file['label'],self.label_2_index)
61
+ self.label_table_size=len(self.label_2_index)
62
+ #print(self.label_table_size)
63
+ #print(self.char_2_index)
64
+
65
+ self.pos_2_index={}
66
+ self.pos_table_size=0
67
+ if 'pos' in vocab_file.keys():
68
+ self.load_fea_vocab(vocab_file['pos'],self.pos_2_index)
69
+ self.pos_table_size=len(self.pos_2_index)
70
+ #print(self.pos_table_size)
71
+
72
+
73
+
74
+ def load_wordvecs(self, wordvec_file):
75
+
76
+ file = open(wordvec_file,'r',encoding='utf-8')
77
+ file.readline()
78
+ #print(self.word_size)
79
+ #print(self.vec_size)
80
+ row = 0
81
+ self.word_2_index['padding_0'] = row #padding: zero vector
82
+ row+=1
83
+ for line in file:
84
+ if row <= self.word_size and row <= self.frequency:
85
+ line_split = line.strip().split(' ')
86
+ self.word_2_index[line_split[0]] = row
87
+ for col in range(self.vec_size):
88
+ self.vec_table[row][col] = float(line_split[col + 1])
89
+ row += 1
90
+ else:
91
+ break
92
+
93
+ self.word_2_index['sparse_vectors'] = row #oov-zero vector
94
+ file.close()
95
+
96
+ def load_fea_vocab(self,fea_file,fea_index):
97
+ fin=open(fea_file,'r',encoding='utf-8')
98
+ i=0
99
+ fea_index['padding_0']=i
100
+ i+=1
101
+ fea_index['oov_padding']=i
102
+ i+=1
103
+ for line in fin:
104
+ fea_index[line.strip()]=i
105
+ i+=1
106
+ fin.close()
107
+
108
+ def load_label_vocab(self,fea_file,fea_index):
109
+ fin=open(fea_file,'r',encoding='utf-8')
110
+ i=0
111
+ for line in fin:
112
+ fea_index[line.strip()]=i
113
+ i+=1
114
+ fin.close()
115
+
116
+ def generate_label_list(self,labels):
117
+ label_list=[]
118
+
119
+ for label in labels:
120
+ temp_label=[0]*self.label_table_size
121
+ temp_label[self.label_2_index[label]]=1
122
+ label_list.append(temp_label)
123
+ return label_list
124
+
125
+ def represent_instances_all_feas(self, instances, labels, word_max_len=100, char_max_len=50):
126
+
127
+ x_text_list=[]
128
+ x_word_list=[]
129
+ x_char_list=[]
130
+
131
+ x_lemma_np=[]
132
+ x_pos_np=[]
133
+ y_np=[]
134
+ startTime=time.time()
135
+ for sentence in instances:
136
+ sentence_list=[]
137
+ sentence_word_list=[]
138
+ sentence_lemma_list=[]
139
+ sentence_pos_list=[]
140
+ sentence_text=[]
141
+ for j in range(0,len(sentence)):
142
+ word=sentence[j]
143
+ #char fea
144
+ char_list=[0]*char_max_len
145
+ for i in range(len(word[0])):
146
+ if i<char_max_len:
147
+ if word[0][i] in self.char_2_index.keys():
148
+ char_list[i]=self.char_2_index[word[0][i]]
149
+ else:
150
+ char_list[i]=self.char_2_index['oov_padding']
151
+ sentence_word_list.append(char_list)
152
+
153
+ #word fea
154
+ sentence_text.append(word[0].lower())
155
+ if word[0].lower() in self.word_2_index.keys():
156
+ sentence_list.append(self.word_2_index[word[0].lower()])
157
+ else:
158
+ sentence_list.append(self.word_2_index['sparse_vectors'])
159
+ """
160
+ #lemma fea
161
+ if word[1].lower() in self.word_2_index.keys():
162
+ sentence_lemma_list.append(self.word_2_index[word[1].lower()])
163
+ else:
164
+ sentence_lemma_list.append(self.word_2_index['sparse_vectors'])
165
+
166
+ #pos fea
167
+ if word[3] in self.pos_2_index.keys():
168
+ sentence_pos_list.append(self.pos_2_index[word[3]])
169
+ else:
170
+ sentence_pos_list.append(self.pos_2_index['oov_padding'])
171
+ """
172
+ x_text_list.append(sentence_text)
173
+ x_word_list.append(sentence_list)
174
+ x_char_list.append(sentence_word_list)
175
+ # x_lemma_list.append(sentence_lemma_list)
176
+ # x_pos_list.append(sentence_pos_list)
177
+
178
+
179
+ #print('\nword:',x_word_list)
180
+ #print('\nchar:',x_char_list)
181
+ #print('\nlemma:',x_lemma_list)
182
+ #print('\npos:',x_pos_list)
183
+ #y_list=self.generate_label_list(labels)
184
+ #print('\ny_list:',y_list)
185
+
186
+ x_word_np = pad_sequences(x_word_list, word_max_len, value=0, padding='post',truncating='post') # right padding
187
+ x_char_np = pad_sequences(x_char_list, word_max_len, value=0, padding='post',truncating='post')
188
+ #x_lemma_np = pad_sequences(x_lemma_list, word_max_len, value=0, padding='post',truncating='post')
189
+ #x_pos_np = pad_sequences(x_pos_list, word_max_len, value=0, padding='post',truncating='post')
190
+ #y_np = np.array(y_list)
191
+ return [x_word_np, x_char_np, x_lemma_np, x_pos_np, x_text_list], y_np
192
+
193
+ def represent_instances_all_feas_myself(self, instances, labels, word_max_len=100, char_max_len=50):
194
+
195
+ x_text_list=[]
196
+ x_word_list=[]
197
+ x_char_list=[]
198
+ x_lemma_list=[]
199
+ x_pos_list=[]
200
+
201
+ y_list=[]
202
+ startTime=time.time()
203
+ for sentence in instances:
204
+ sentence_list=[0]*word_max_len
205
+ sentence_word_list=[[0]*char_max_len for i in range(word_max_len)]
206
+ sentence_lemma_list=[0]*word_max_len
207
+ sentence_pos_list=[0]*word_max_len
208
+ sentence_text=[]
209
+ for j in range(0,len(sentence)):
210
+ word=sentence[j]
211
+
212
+ sentence_text.append(word[0].lower())
213
+
214
+ if j<word_max_len:
215
+ #char fea
216
+ for i in range(len(word[0])):
217
+ if i<char_max_len:
218
+ if word[0][i] in self.char_2_index.keys():
219
+ sentence_word_list[j][i]=self.char_2_index[word[0][i]]
220
+ else:
221
+ sentence_word_list[j][i]=self.char_2_index['oov_padding']
222
+
223
+ #word fea
224
+ if word[0].lower() in self.word_2_index.keys():
225
+ sentence_list[j]=self.word_2_index[word[0].lower()]
226
+ else:
227
+ sentence_list[j]=self.word_2_index['sparse_vectors']
228
+
229
+ #lemma fea
230
+ if word[1].lower() in self.word_2_index.keys():
231
+ sentence_lemma_list[j]=self.word_2_index[word[1].lower()]
232
+ else:
233
+ sentence_lemma_list[j]=self.word_2_index['sparse_vectors']
234
+
235
+ #pos fea
236
+ if word[3] in self.pos_2_index.keys():
237
+ sentence_pos_list[j]=self.pos_2_index[word[3]]
238
+ else:
239
+ sentence_pos_list[j]=self.pos_2_index['oov_padding']
240
+
241
+ x_text_list.append(sentence_text)
242
+ x_word_list.append(sentence_list)
243
+ x_char_list.append(sentence_word_list)
244
+ x_lemma_list.append(sentence_lemma_list)
245
+ x_pos_list.append(sentence_pos_list)
246
+
247
+ print('ml-model-represent-list:',time.time()-startTime)
248
+ startTime=time.time()
249
+ #print('\nword:',x_word_list)
250
+ #print('\nchar:',x_char_list)
251
+ #print('\nlemma:',x_lemma_list)
252
+ #print('\npos:',x_pos_list)
253
+ y_list=self.generate_label_list(labels)
254
+ #print('\ny_list:',y_list)
255
+ # x_word_np = pad_sequences2(x_word_list, word_max_len, value=0, padding='post',truncating='post') # right padding
256
+ # x_char_np = pad_sequences2(x_char_list, word_max_len, value=0, padding='post',truncating='post')
257
+ # x_lemma_np = pad_sequences2(x_lemma_list, word_max_len, value=0, padding='post',truncating='post')
258
+ # x_pos_np = pad_sequences2(x_pos_list, word_max_len, value=0, padding='post',truncating='post')
259
+
260
+ x_word_np = np.array(x_word_list) # right padding
261
+ x_char_np = np.array(x_char_list) # already padded to fixed size above
262
+ x_lemma_np = np.array(x_lemma_list)
263
+ x_pos_np = np.array(x_pos_list)
264
+ y_np = np.array(y_list)
265
+ print('ml-model-represent-pad:',time.time()-startTime)
266
+ return [x_word_np, x_char_np, x_lemma_np, x_pos_np, x_text_list], y_np
267
+
268
+
269
+
270
+ class BERT_RepresentationLayer(object):
271
+
272
+
273
+ def __init__(self, vocab_path, label_file):
274
+
275
+
276
+ #load vocab
277
+ self.bert_vocab_dict = {}
278
+ self.load_bert_vocab(vocab_path,self.bert_vocab_dict)
279
+ self.tokenizer = Tokenizer(self.bert_vocab_dict)
280
+
281
+ #load label
282
+ self.label_2_index={}
283
+ self.label_table_size=0
284
+ self.load_label_vocab(label_file,self.label_2_index)
285
+ self.label_table_size=len(self.label_2_index)
286
+
287
+ def load_label_vocab(self,fea_file,fea_index):
288
+ fin=open(fea_file,'r',encoding='utf-8')
289
+ i=0
290
+ for line in fin:
291
+ fea_index[line.strip()]=i
292
+ i+=1
293
+ fin.close()
294
+ def load_bert_vocab(self,vocab_file,vocab_dict):
295
+ fin=open(vocab_file,'r',encoding='utf-8')
296
+ i=0
297
+ for line in fin:
298
+ vocab_dict[line.strip()]=i
299
+ i+=1
300
+ fin.close()
301
+
302
+ def generate_label_list(self,labels):
303
+ label_list=[]
304
+
305
+ for label in labels:
306
+ temp_label=[0]*self.label_table_size
307
+ temp_label[self.label_2_index[label]]=1
308
+ label_list.append(temp_label)
309
+ return label_list
310
+
311
+ def load_data(self,instances, labels, word_max_len=100):
312
+
313
+ x_index=[]
314
+ x_seg=[]
315
+ y_np=[]
316
+
317
+ for sentence in instances:
318
+ sentence_text_list=[]
319
+ for j in range(0,len(sentence)):
320
+ sentence_text_list.append(sentence[j][0])
321
+ sentence_text=' '.join(sentence_text_list)
322
+ #print(self.tokenizer.tokenize(first=sentence_text))
323
+ x1, x2 = self.tokenizer.encode(first=sentence_text)
324
+ x_index.append(x1)
325
+ x_seg.append(x2)
326
+
327
+ # y_list=self.generate_label_list(labels)
328
+
329
+ x1_np = pad_sequences(x_index, word_max_len, value=0, padding='post',truncating='post') # right padding
330
+ x2_np = pad_sequences(x_seg, word_max_len, value=0, padding='post',truncating='post')
331
+ # y_np = np.array(y_list)
332
+
333
+ return [x1_np, x2_np], y_np
334
+
335
+ if __name__ == '__main__':
336
+ pass
337
+
338
+
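Editorial note (not part of the uploaded file): both representation layers expect `instances` to be a list of sentences, where each sentence is a list of per-token feature tuples and index 0 holds the surface form. A minimal sketch of encoding one sentence with BERT_RepresentationLayer, using placeholder file paths:

# Placeholder paths; the real vocab and label files ship with the model.
rep = BERT_RepresentationLayer('./vocab.txt', './label.vocab')

# One sentence, one tuple per token; only index 0 (the token text) is used here.
sentence = [('Short',), ('stature',), ('and',), ('seizures',)]

(x1_np, x2_np), _ = rep.load_data([sentence], labels=[], word_max_len=32)
print(x1_np.shape, x2_np.shape)   # (1, 32) token ids and (1, 32) segment ids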
src/post_processing.py ADDED
@@ -0,0 +1,58 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Thu Jun 18 20:08:30 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ def combine_overlap(mention_list):
9
+
10
+ entity_list=[]
11
+ if len(mention_list)>2:
12
+
13
+ first_entity=mention_list[0]
14
+ nest_list=[first_entity]
15
+ max_eid=int(first_entity[1])
16
+ for i in range(1,len(mention_list)):
17
+ segs=mention_list[i]
18
+ if int(segs[0])> max_eid:
19
+ if len(nest_list)==1:
20
+ entity_list.append(nest_list[0])
21
+ nest_list=[]
22
+ nest_list.append(segs)
23
+ if int(segs[1])>max_eid:
24
+ max_eid=int(segs[1])
25
+ else:
26
+ tem=find_max_entity(nest_list)#find max entity
27
+ entity_list.append(tem)
28
+ nest_list=[]
29
+ nest_list.append(segs)
30
+ if int(segs[1])>max_eid:
31
+ max_eid=int(segs[1])
32
+
33
+ else:
34
+ nest_list.append(segs)
35
+ if int(segs[1])>max_eid:
36
+ max_eid=int(segs[1])
37
+ if nest_list!=[]:
38
+ if len(nest_list)==1:
39
+ entity_list.append(nest_list[0])
40
+
41
+ else:
42
+ tem=find_max_entity(nest_list)#find max entity
43
+ entity_list.append(tem)
44
+ else:
45
+ entity_list=mention_list
46
+
47
+ return entity_list
48
+
49
+ def find_max_entity(nest_list):
50
+ max_len=0
51
+ max_entity=[]
52
+ for i in range(0, len(nest_list)):
53
+ length=int(nest_list[i][1])-int(nest_list[i][0])
54
+ if length>max_len:
55
+ max_len=length
56
+ max_entity=nest_list[i]
57
+
58
+ return max_entity
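Editorial note (not part of the uploaded file): an illustrative call of combine_overlap, assuming the mention format produced upstream, i.e. each mention is [start offset, end offset, concept id, score] with string offsets (ids and offsets below are made up):

mentions = [['0', '13', 'HP:0004322', '0.98'],   # longest span of the first group
            ['6', '13', 'HP:0000002', '0.96'],   # nested inside the first span
            ['20', '28', 'HP:0001250', '0.99']]  # separate, non-overlapping span
print(combine_overlap(mentions))
# -> [['0', '13', 'HP:0004322', '0.98'], ['20', '28', 'HP:0001250', '0.99']]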
src/restore_index.py ADDED
@@ -0,0 +1,109 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sun Jun 14 17:19:02 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import io
9
+ import sys
10
+
11
+ def restore_index_nest_fn(ori_text,file_pre):
12
+
13
+
14
+ fin_pre=io.StringIO(file_pre)
15
+ #print(file_pre)
16
+ all_pre=fin_pre.read().strip().split('\n\n')
17
+ fin_pre.close()
18
+ #print(len(all_pre))
19
+
20
+ new_sentence=''
21
+ restore_result=[]
22
+
23
+ sentence_ori=ori_text.lower().replace('``','" ')
24
+ sentence_ori=sentence_ori.replace("''",'" ')
25
+ for pre_i in range(0,len(all_pre)):
26
+ pre_lines=all_pre[pre_i].split('\n')
27
+ #print(pre_lines)
28
+ # print(sentence_ori)
29
+ if len(pre_lines)>1:
30
+ #print(pre_lines)
31
+ sentence_pre=pre_lines[0].lower().replace('``','"')
32
+ sentence_pre=sentence_pre.replace("''",'"')
33
+ sentence_pre=sentence_pre.split()
34
+ pre_result=[]
35
+ for i in range(1,len(pre_lines)):
36
+ pre_result.append(pre_lines[i].split('\t'))
37
+
38
+ restore_sid=0
39
+ restore_eid=0
40
+ each_word_id=[]
41
+
42
+ for i in range(0,len(sentence_pre)):
43
+
44
+ temp_id=sentence_ori.find(sentence_pre[i])
45
+ if temp_id<0:
46
+ if sentence_pre[i].find('"')>=0:
47
+ temp_id = sentence_ori.find(sentence_pre[i].replace('"','" '))
48
+ else:
49
+ #print('ori:',sentence_ori)
50
+ print('restore index error:',sentence_pre[i])
51
+ new_sentence+=sentence_ori[0:temp_id]
52
+
53
+ restore_sid=len(new_sentence)
54
+ restore_eid=len(new_sentence)+len(sentence_pre[i])
55
+ each_word_id.append([str(restore_sid),str(restore_eid)])
56
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
57
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
58
+ # print('each_word:',each_word_id)
59
+ for pre_ele in pre_result:
60
+ # if len(pre_ele)>4:
61
+ # temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[3].split('|')[0],pre_ele[4]]
62
+ # else:
63
+ # temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[3].split('|')[0],'1.00']
64
+ temp_pre_result=[each_word_id[int(pre_ele[0])][0],each_word_id[int(pre_ele[1])][1],pre_ele[3].split('|')[0],pre_ele[4]]
65
+ if temp_pre_result not in restore_result:
66
+ restore_result.append(temp_pre_result)
67
+ else:
68
+ sentence_pre=pre_lines[0].lower().replace('``','"')
69
+ sentence_pre=sentence_pre.replace("''",'"')
70
+ sentence_pre=sentence_pre.split()
71
+
72
+ for i in range(0,len(sentence_pre)):
73
+
74
+ temp_id=sentence_ori.find(sentence_pre[i])
75
+ if temp_id<0:
76
+ if sentence_pre[i].find('"')>=0:
77
+ temp_id = sentence_ori.find(sentence_pre[i].replace('"','" '))
78
+ else:
79
+ print('restore index error:',sentence_pre[i])
80
+ new_sentence+=sentence_ori[0:temp_id]
81
+ new_sentence+=sentence_ori[temp_id:temp_id+len(sentence_pre[i])]
82
+ sentence_ori=sentence_ori[temp_id+len(sentence_pre[i]):]
83
+ # print('restore:',restore_result)
84
+ return restore_result
85
+
86
+ if __name__=='__main__':
87
+ path='//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/data/test/gsc/result/'
88
+ fin=open(path+'GSCplus_Nest_biobert.tsv','r',encoding='utf-8')
89
+ fout=open(path+'GSCplus_Nest_restore_biobert.tsv','w',encoding='utf-8')
90
+ all_context=fin.read().strip().split('\n\n\n\n')
91
+ fin.close()
92
+ file_num=0
93
+ for doc in all_context:
94
+ file_num+=1
95
+ print('file_num:',file_num)
96
+ doc_ele=doc.split('\n\n')
97
+ first_line = doc_ele[0].split('\n')
98
+ pmid=first_line[0]
99
+ ori_text=first_line[1]
100
+ pre_result='\n\n'.join(doc_ele[1:])
101
+ # print('pmid:',pmid)
102
+ # print('ori:',ori_text)
103
+ # print('pre:',pre_result)
104
+ final_result=restore_index_nest_fn(ori_text,pre_result)
105
+ fout.write(pmid+'\n'+ori_text+'\n')
106
+ for ele in final_result:
107
+ fout.write('\t'.join(ele)+'\t'+ori_text[int(ele[0]):int(ele[1])]+'\n')
108
+ fout.write('\n')
109
+ fout.close()
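Editorial note (not part of the uploaded file): restore_index_nest_fn expects `file_pre` to contain one block per sentence, separated by blank lines. The first line of a block is the tokenized, lower-cased sentence; each following line is a tab-separated prediction whose first two fields are start and end token indices (end inclusive), whose fourth field holds the '|'-separated candidate ids and whose fifth field is the score (the third field is not used here). A minimal sketch with made-up values:

ori_text = "Short stature was noted."
file_pre = ("short stature was noted .\n"
            "0\t1\tunused\tHP:0004322|HP:0000002\t0.97")
print(restore_index_nest_fn(ori_text, file_pre))
# -> [['0', '13', 'HP:0004322', '0.97']]  (character offsets into ori_text)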
src/ssplit_tokenzier.py ADDED
@@ -0,0 +1,45 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 15:26:44 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import nltk
9
+ from nltk.stem import WordNetLemmatizer
10
+ from nltk.corpus import wordnet
11
+ from nltk.stem.porter import PorterStemmer
12
+ lemmatizer = WordNetLemmatizer()
13
+ stemmer = PorterStemmer()
14
+ import io
15
+
16
+ def get_wordnet_pos(treebank_tag):
17
+ if treebank_tag.startswith('J'):
18
+ return wordnet.ADJ
19
+ elif treebank_tag.startswith('V'):
20
+ return wordnet.VERB
21
+ elif treebank_tag.startswith('N'):
22
+ return wordnet.NOUN
23
+ elif treebank_tag.startswith('R') or treebank_tag=='IN':
24
+ return wordnet.ADV
25
+ else:
26
+ return wordnet.NOUN
27
+
28
+ def ssplit_token_pos_lemma(in_text):
29
+
30
+ fout=io.StringIO()
31
+
32
+ line=in_text.strip()
33
+ line=line.replace('-',' - ').replace('/',' / ')
34
+ sentences = nltk.sent_tokenize(line)
35
+ sentences = [nltk.word_tokenize(sent) for sent in sentences]
36
+ # print(sentences)
37
+ for sent in sentences:
38
+ token_pos = nltk.pos_tag(sent)
39
+ for token in token_pos:
40
+ lemma = lemmatizer.lemmatize(token[0].lower(), get_wordnet_pos(token[1]))
41
+ stem = stemmer.stem(token[0].lower())
42
+ fout.write(token[0]+'\t'+lemma+'\t'+stem+'\t'+token[1]+'\n')
43
+ fout.write('\n')
44
+
45
+ return fout.getvalue()
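Editorial note (not part of the uploaded file): ssplit_token_pos_lemma returns one token per line as tab-separated "token, lemma, stem, POS tag", with a blank line after every sentence; the NLTK punkt, POS-tagger and WordNet resources must already be downloaded. A quick sketch:

print(ssplit_token_pos_lemma("The patients had short stature."))
# Expected shape of the output (exact tags depend on the NLTK models):
# The        the       the       DT
# patients   patient   patient   NNS
# had        have      had       VBD
# ...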
src/tagging_text.py ADDED
@@ -0,0 +1,102 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Aug 24 16:21:23 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import argparse
9
+ from src.ssplit_tokenzier import ssplit_token_pos_lemma
10
+ from src.ml_ner import ml_tagging,ml_tagging_allngram
11
+ from src.combine_result import combine_ml_dict
12
+ from src.restore_index import restore_index_nest_fn
13
+ from src.dic_ner import dic_ont
14
+ from src.post_processing import combine_overlap
15
+ from src.abbre_resolution import postprocess_abbr
16
+ import os
17
+ import time
18
+ import json
19
+
20
+ #hybrid method
21
+ def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
22
+
23
+ # startTime=time.time()
24
+ ssplit_token=ssplit_token_pos_lemma(text)
25
+ # print(ssplit_token)
26
+ # print('ssplit token:',time.time()-startTime)
27
+
28
+ # startTime=time.time()
29
+ dict_tsv=biotag_dic.matching(ssplit_token)
30
+ # print('dict tsv:\n',dict_tsv)
31
+ # print('dict ner:',time.time()-startTime)
32
+
33
+ # startTime=time.time()
34
+ ml_tsv=ml_tagging(session,ssplit_token,ml_model,Threshold)
35
+ #print('ml_tsv:\n',ml_tsv)
36
+ # print('ml ner:',time.time()-startTime)
37
+
38
+ # startTime=time.time()
39
+ combine_tsv=combine_ml_dict(dict_tsv,ml_tsv)
40
+ #combine_tsv=combine_ml_dict_fn(ml_tsv,dict_tsv)
41
+ #print('combine:\n',combine_tsv)
42
+ # print('combine:',time.time()-startTime)
43
+
44
+ # startTime=time.time()
45
+ final_result= restore_index_nest_fn(text,combine_tsv)
46
+ # print('final ner:',time.time()-startTime)
47
+ if onlyLongest==True:
48
+ final_result=combine_overlap(final_result)
49
+ if abbrRecog==True:
50
+ final_result=postprocess_abbr(final_result,text)
51
+ # print('final result:')
52
+ # print(final_result)
53
+ # print('final ner:',time.time()-startTime)
54
+
55
+ return final_result
56
+
57
+ # only machine learning-based method
58
+ def bioTag_ml(text,ml_model,onlyLongest=False,abbrRecog=False, Threshold=0.95):
59
+
60
+ # startTime=time.time()
61
+ ssplit_token=ssplit_token_pos_lemma(text)
62
+ # print(ssplit_token)
63
+ # print('ssplit token:',time.time()-startTime)
64
+
65
+ # startTime=time.time()
66
+ ml_tsv=ml_tagging_allngram(ssplit_token,ml_model,Threshold)
67
+ # print('ml_tsv:\n',ml_tsv)
68
+ # print('ml ner:',time.time()-startTime)
69
+
70
+ final_result= restore_index_nest_fn(text,ml_tsv)
71
+ # print('final ner:',time.time()-startTime)
72
+ if onlyLongest==True:
73
+ final_result=combine_overlap(final_result)
74
+
75
+ if abbrRecog==True:
76
+ final_result=postprocess_abbr(final_result,text)
77
+
78
+ return final_result
79
+
80
+ # only dict method
81
+ def bioTag_dic(text,biotag_dic,onlyLongest=False, abbrRecog=False):
82
+
83
+ # startTime=time.time()
84
+ ssplit_token=ssplit_token_pos_lemma(text)
85
+ # print(ssplit_token)
86
+ # print('ssplit token:',time.time()-startTime)
87
+
88
+ # startTime=time.time()
89
+ dict_tsv=biotag_dic.matching(ssplit_token)
90
+ # print('dict tsv:\n',dict_tsv)
91
+ # print('dict ner:',time.time()-startTime)
92
+
93
+ final_result= restore_index_nest_fn(text,dict_tsv)
94
+ # print('final ner:',time.time()-startTime)
95
+ if onlyLongest==True:
96
+ final_result=combine_overlap(final_result)
97
+
98
+ if abbrRecog==True:
99
+ final_result=postprocess_abbr(final_result,text)
100
+
101
+ return final_result
102
+
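Editorial note (not part of the uploaded file): tagging_text.py therefore exposes three entry points, the hybrid bioTag, the model-only bioTag_ml and the dictionary-only bioTag_dic. A hedged sketch of the dictionary-only path, assuming `biotag_dic` is a dic_ont instance built from the dictionary files and that each returned mention follows the [start offset, end offset, HPO id, score] layout produced by restore_index_nest_fn:

text = "The patient presented with short stature and recurrent seizures."
results = bioTag_dic(text, biotag_dic, onlyLongest=True, abbrRecog=False)
for mention in results:
    start, end = int(mention[0]), int(mention[1])
    print(text[start:end], mention[2:])   # matched span plus concept id and score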