lingbionlp commited on
Commit
8ab6ceb
1 Parent(s): 5e753a2

Upload 2 files

Browse files
Files changed (2) hide show
  1. AIO_label.vocab +21 -0
  2. postprocessing.py +551 -0
AIO_label.vocab ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ O
2
+ B-Gene
3
+ I-Gene
4
+ O-Gene
5
+ B-FamilyName
6
+ I-FamilyName
7
+ B-Disease
8
+ I-Disease
9
+ O-Disease
10
+ B-Chemical
11
+ I-Chemical
12
+ O-Chemical
13
+ B-Mutation
14
+ I-Mutation
15
+ O-Mutation
16
+ B-Species
17
+ I-Species
18
+ O-Species
19
+ B-CellLine
20
+ I-CellLine
21
+ O-CellLine
postprocessing.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Nov 03 20:08:30 2022
4
+
5
+ @author: luol2
6
+ """
7
+
8
+
9
+ import logging
10
+ import regex
11
+ import sys
12
+ import io
13
+
14
+ """
15
+ A Python 3 refactoring of Vincent Van Asch's Python 2 code at
16
+
17
+ http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py
18
+
19
+ Based on
20
+
21
+ A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text
22
+ A. Schwartz and M. Hearst
23
+ Biocomputing, 2003, pp 451-462.
24
+
25
+ """
26
+
27
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
28
+ log = logging.getLogger('Abbre')
29
+
30
+
31
class Candidate(str):
    """A string that also remembers its character span inside a sentence.

    The text itself lives in the ``str`` base class (consumed by
    ``__new__``); ``start``/``stop`` hold the candidate's offsets in the
    original sentence and default to ``(0, 0)`` until ``set_position``
    is called.
    """

    def __init__(self, value):
        super().__init__()
        self.set_position(0, 0)

    def set_position(self, start, stop):
        """Record the (start, stop) character offsets of this candidate."""
        self.start = start
        self.stop = stop
40
+
41
+
42
def yield_lines_from_file(file_path):
    """Yield stripped text lines read from *file_path*.

    The file is read as bytes and each line decoded as UTF-8; lines that
    are not valid UTF-8 fall back to latin-1 (which can decode any byte
    sequence, so the fallback never fails).

    :param file_path: path of the file to read
    :return: generator of stripped unicode lines
    """
    with open(file_path, 'rb') as f:
        for raw in f:
            try:
                line = raw.decode('utf-8')
            except UnicodeDecodeError:
                # BUGFIX: the original did decode('latin-1').encode('utf-8')
                # .decode('utf-8'), a round-trip identical to a plain
                # latin-1 decode.  The redundant f.close() inside the
                # ``with`` block was also removed.
                line = raw.decode('latin-1')
            yield line.strip()
52
+
53
+
54
def yield_lines_from_doc(doc_text):
    """Yield every line of *doc_text*, stripped of surrounding whitespace."""
    yield from (raw.strip() for raw in doc_text.split("\n"))
57
+
58
+
59
def best_candidates(sentence):
    """Yield abbreviation candidates found inside parentheses of *sentence*.

    Scans the sentence for parenthesised spans (a span may also be closed
    by ';' or ':'), trims surrounding whitespace, and yields each span
    that satisfies :func:`conditions` as a positioned :class:`Candidate`.

    :param sentence: line read from input file
    :return: a Candidate iterator
    :raises ValueError: if the parentheses are unbalanced or the first
        ')' precedes the first '('
    """

    if '(' in sentence:
        # Check some things first
        if sentence.count('(') != sentence.count(')'):
            raise ValueError("Unbalanced parentheses: {}".format(sentence))

        if sentence.find('(') > sentence.find(')'):
            raise ValueError("First parentheses is right: {}".format(sentence))

        closeindex = -1
        while 1:
            # Look for open parenthesis
            openindex = sentence.find('(', closeindex + 1)

            if openindex == -1:
                break

            # Look for closing parenthesis; ';' and ':' also close a span.
            closeindex = openindex + 1
            depth = 1  # BUGFIX: renamed from ``open`` (shadowed the builtin)
            skip = False
            while depth:
                try:
                    char = sentence[closeindex]
                except IndexError:
                    # We found an opening bracket but no associated closing
                    # bracket: skip the opening bracket.
                    skip = True
                    break
                if char == '(':
                    depth += 1
                elif char in [')', ';', ':']:
                    depth -= 1
                closeindex += 1

            if skip:
                closeindex = openindex + 1
                continue

            # Output if conditions are met
            start = openindex + 1
            stop = closeindex - 1
            candidate = sentence[start:stop]

            # Take into account whitespace that should be removed
            start = start + len(candidate) - len(candidate.lstrip())
            stop = stop - len(candidate) + len(candidate.rstrip())
            candidate = sentence[start:stop]

            if conditions(candidate):
                new_candidate = Candidate(candidate)
                new_candidate.set_position(start, stop)
                yield new_candidate
116
+
117
+
118
def conditions(candidate):
    r"""
    Decide whether *candidate* is a viable abbreviation candidate.

    Based on Schwartz & Hearst:

        2 <= len(str) <= 10
        len(tokens) <= 2
        regex.search(r'\p{L}', str)
        str[0].isalnum()

    :param candidate: candidate abbreviation
    :return: True if this is a good candidate
    """
    viable = True
    # NOTE: the original first evaluated
    # regex.match('(\p{L}\.?\s?){2,}', candidate.lstrip()) and re-assigned
    # ``viable = True`` on a match -- a no-op, removed here.
    if len(candidate) < 2 or len(candidate) > 10:
        viable = False
    if len(candidate.split()) > 2:
        viable = False
    # \p{L} (any Unicode letter) needs the third-party ``regex`` module;
    # stdlib ``re`` does not support Unicode property classes.
    if not regex.search(r'\p{L}', candidate):
        viable = False
    if not candidate[0].isalnum():
        viable = False

    return viable
147
+
148
+
149
def get_definition(candidate, sentence):
    """
    Takes a candidate and a sentence and returns the definition candidate.

    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate.

    :param candidate: candidate abbreviation (a positioned Candidate)
    :param sentence: current sentence (single line from input file)
    :return: candidate definition for this abbreviation, as a positioned Candidate
    :raises ValueError: when not enough tokens in front of the candidate
        start with the candidate's first character
    """
    # Take the tokens in front of the candidate, lowercased and split on
    # whitespace or hyphens.  The "- 2" presumably skips the " (" that
    # precedes the candidate -- TODO confirm against best_candidates offsets.
    tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())
    # the char that we are looking for
    key = candidate[0].lower()

    # Count the number of tokens that start with the same character as the candidate
    firstchars = [t[0] for t in tokens]
    definition_freq = firstchars.count(key)
    # How many times the key occurs inside the candidate itself
    candidate_freq = candidate.lower().count(key)

    # Look for the list of tokens in front of candidate that
    # have a sufficient number of tokens starting with key
    if candidate_freq <= definition_freq:
        # we should at least have a good number of starts
        count = 0
        start = 0                      # negative offset walked back from the end of tokens
        startindex = len(firstchars) - 1

        # Walk backwards until the suffix tokens[startindex:] contains at
        # least candidate_freq tokens starting with key.
        while count < candidate_freq:
            if abs(start) > len(firstchars):
                raise ValueError("candiate {} not found".format(candidate))
            start -= 1
            # Look up key in the definition (first occurrence at or after
            # the current backward offset); keep the previous startindex
            # if the key is not found there.
            try:
                startindex = firstchars.index(key, len(firstchars) + start)
            except ValueError:
                pass

            # Count the number of keys in definition
            count = firstchars[startindex:].count(key)

        # We found enough keys in the definition so return the definition as a definition candidate
        # NOTE(review): this maps the token index back to a character offset
        # by joining with single spaces -- assumes single-space separation;
        # multi-space/hyphen runs may shift the offset. TODO confirm.
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start - 1
        candidate = sentence[start:stop]

        # Remove whitespace
        start = start + len(candidate) - len(candidate.lstrip())
        stop = stop - len(candidate) + len(candidate.rstrip())
        candidate = sentence[start:stop]

        new_candidate = Candidate(candidate)
        new_candidate.set_position(start, stop)
        return new_candidate

    else:
        raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate')
212
+
213
+
214
def select_definition(definition, abbrev):
    """
    Takes a definition candidate and an abbreviation candidate and checks
    that the characters of the abbreviation occur, in order, in the
    definition (matching backwards from the end of both strings).

    Based on:
    A simple algorithm for identifying abbreviation definitions in
    biomedical texts, Schwartz & Hearst.

    :param definition: candidate definition (a positioned Candidate)
    :param abbrev: candidate abbreviation
    :return: dict {'definition': trimmed Candidate, 'start': int, 'stop': int}
    :raises ValueError: if the abbreviation is longer than the definition,
        is a full word of it, cannot be matched inside it, fails the
        min(|A|+5, |A|*2) token constraint, or the trimmed definition has
        unbalanced parentheses
    :raises IndexError: if the backwards scan runs past the start of the
        definition
    """


    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')

    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')

    # sindex/lindex walk backwards (negative indices) through the
    # abbreviation and the definition respectively.
    sindex = -1
    lindex = -1

    while 1:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            # Ran off the front of the definition: caller treats this as
            # "no match" (IndexError is caught by the caller).
            raise

        shortchar = abbrev[sindex].lower()

        # Skip non-alphanumeric characters of the abbreviation.
        if not shortchar.isalnum():
            sindex -= 1

        if sindex == -1 * len(abbrev):
            # At the first character of the abbreviation: it must match the
            # start of a word in the definition.
            if shortchar == longchar:
                if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
                    break
                else:
                    lindex -= 1
            else:
                lindex -= 1
                if lindex == -1 * (len(definition) + 1):
                    raise ValueError("definition {} was not found in {}".format(abbrev, definition))

        else:
            # Interior character: advance both on a match, otherwise keep
            # scanning the definition backwards.
            if shortchar == longchar:
                sindex -= 1
                lindex -= 1
            else:
                lindex -= 1
    # Trim the definition to the matched suffix; lindex is negative, so the
    # new start is definition.start + (len(definition) + lindex).
    new_candidate = Candidate(definition[lindex:len(definition)])
    new_candidate.set_position(definition.start+lindex+len(definition), definition.stop)
    definition = new_candidate

    tokens = len(definition.split())
    length = len(abbrev)

    # Schwartz & Hearst length constraint on the definition.
    if tokens > min([length + 5, length * 2]):
        raise ValueError("did not meet min(|A|+5, |A|*2) constraint")

    # Do not return definitions that contain unbalanced parentheses
    if definition.count('(') != definition.count(')'):
        raise ValueError("Unbalanced parentheses not allowed in a definition")
    new_definition_dict={'definition':definition,'start':definition.start,'stop':definition.stop}
    return new_definition_dict
282
+
283
+
284
def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
    """Extract (abbreviation, definition) pairs from a file or a text blob.

    Exactly one of *file_path* / *doc_text* should be provided; when both
    are falsy, empty results are returned.

    :param file_path: optional path of a text file, read line by line
    :param doc_text: optional document text, split on newlines
    :return: 3-tuple of
        - abbrev_map: list of {'definition', 'start', 'stop', 'abbre'} dicts
        - abbr_full_dict: {abbreviation: (fullname_start, fullname_stop)}
        - fullloc_abbr_dict: {"fullname_start fullname_stop": abbreviation}
    """
    abbrev_map = []          # [{definition, start, stop, abbre}]
    abbr_full_dict = {}      # {abbre: (fullname_start, fullname_stop)}
    fullloc_abbr_dict = {}   # {"fullname_s fullname_e": abbr}
    omit = 0
    written = 0
    if file_path:
        sentence_iterator = enumerate(yield_lines_from_file(file_path))
    elif doc_text:
        sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
    else:
        # BUGFIX: the original returned only ``abbrev_map`` here while every
        # other path returns a 3-tuple; callers unpack three values.
        return abbrev_map, abbr_full_dict, fullloc_abbr_dict

    for i, sentence in sentence_iterator:
        try:
            for candidate in best_candidates(sentence):
                try:
                    definition = get_definition(candidate, sentence)
                except (ValueError, IndexError):
                    # No plausible definition in front of this candidate.
                    omit += 1
                else:
                    try:
                        definition_dict = select_definition(definition, candidate)
                    except (ValueError, IndexError):
                        # Definition failed the Schwartz & Hearst constraints.
                        omit += 1
                    else:
                        definition_dict['abbre'] = candidate
                        abbrev_map.append(definition_dict)
                        abbr_full_dict[definition_dict['abbre']] = (definition_dict['start'], definition_dict['stop'])
                        fullloc_abbr_dict[str(definition_dict['start']) + ' ' + str(definition_dict['stop'])] = definition_dict['abbre']
                        written += 1
        except (ValueError, IndexError) as e:
            log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
    log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
    return abbrev_map, abbr_full_dict, fullloc_abbr_dict
327
+
328
+
329
def postprocess_abbr(ner_result, ori_text):
    """Reconcile NER predictions with abbreviation/definition pairs.

    Abbreviations whose full name was also tagged adopt the full name's
    entity type; full names get their (possibly missed) abbreviation
    occurrence added; lone-punctuation entities are dropped.

    :param ner_result: {'entity_start entity_end': [start, end, text, ..., type]}
        with offsets stored as strings
    :param ori_text: the original document text
    :return: list of [start, end, text, type] entity records
    """
    final_result = []
    if len(ner_result) == 0:
        # BUGFIX: the original returned {} here while every other path
        # returns a list; keep the return type consistent.
        return final_result

    # abbreviation recognition over the raw text
    abbr_list, abbr_full_dict, fullloc_abbr_dict = extract_abbreviation_definition_pairs(doc_text=ori_text)

    # index the entities by their end offset
    ner_loc_result = {}
    for ele in ner_result.keys():
        ner_loc_result[ner_result[ele][1]] = ner_result[ele]

    # remove the wrong abbr, add missed abbr
    for entity_loc in ner_result.keys():

        if (ner_result[entity_loc][-1] != 'CellLine') and (ner_result[entity_loc][2] in abbr_full_dict.keys()):
            # the entity is an abbreviation: use the fullname's entity type
            fullname_loc_e = str(abbr_full_dict[ner_result[entity_loc][2]][1])

            if fullname_loc_e in ner_loc_result.keys():  # fullname is an entity
                final_result.append([ner_result[entity_loc][0], ner_result[entity_loc][1], ner_result[entity_loc][2], ner_loc_result[fullname_loc_e][-1]])
            # otherwise the abbreviation is dropped: its full name was not
            # recognised as an entity

        elif entity_loc in fullloc_abbr_dict.keys():  # the entity is a fullname
            abbr_loc_s = ori_text.find(fullloc_abbr_dict[entity_loc], int(ner_result[entity_loc][1]))
            final_result.append(ner_result[entity_loc])
            if abbr_loc_s >= 0:
                abbr_loc_e = abbr_loc_s + len(fullloc_abbr_dict[entity_loc])
                abbr_loc = str(abbr_loc_s) + ' ' + str(abbr_loc_e)
                if abbr_loc not in ner_result.keys():  # add the missed abbreviation
                    final_result.append([str(abbr_loc_s), str(abbr_loc_e), ori_text[abbr_loc_s:abbr_loc_e], ner_result[entity_loc][-1]])

        else:
            # drop entities that are a single punctuation character
            if len(ner_result[entity_loc][2]) == 1 and (not ner_result[entity_loc][2].isalpha()):
                pass
            else:
                final_result.append(ner_result[entity_loc])

    return final_result
394
+
395
+
396
def entity_consistency(ner_result,ori_text): #ner_result=[[start, end, text, ..., type], ...], offsets as strings
    """Enforce document-level consistency of entity typing.

    For each distinct entity surface form (case-sensitive for all-caps
    abbreviations, lowercased otherwise), determines its majority type
    among the predictions, then tags every other word-boundary occurrence
    of that surface form in *ori_text* with the majority type.  If new
    mentions were added, the merged list is re-sorted by offset and
    overlapping mentions are resolved via combine_overlap.

    :param ner_result: list of [start, end, text, ..., type] records
    :param ori_text: the original document text
    :return: list of [start, end, text, type] records, non-overlapping
    """
    final_result={}
    entity_loc_set=set()
    entity_type={} #{entity:{type1:num,type2:num}}

    # Tally, per surface form, how often each entity type was predicted.
    for segs in ner_result:
        entity_loc_set.add(segs[0]+' '+segs[1])
        final_result['\t'.join(segs)]=[int(segs[0]),int(segs[1])]
        if len(segs[2])>1:  # single-character mentions are not tallied
            if segs[2].isupper():#entity is an all-uppercase abbreviation: keep case
                if segs[2] not in entity_type.keys():
                    entity_type[segs[2]]={segs[-1]:1}
                else:
                    if segs[-1] in entity_type[segs[2]]:
                        entity_type[segs[2]][segs[-1]]+=1
                    else:
                        entity_type[segs[2]][segs[-1]]=1
            else: #not an abbreviation: tally case-insensitively
                if segs[2].lower() not in entity_type.keys():
                    entity_type[segs[2].lower()]={segs[-1]:1}
                else:
                    if segs[-1] in entity_type[segs[2].lower()]:
                        entity_type[segs[2].lower()][segs[-1]]+=1
                    else:
                        entity_type[segs[2].lower()][segs[-1]]=1


    # Majority type per surface form (ties broken by the
    # lexicographically greatest type name, via tuple comparison).
    entity_type_major={}
    for ele in entity_type.keys():
        entity_type_major[ele]=max(zip(entity_type[ele].values(), entity_type[ele].keys()))[1]


    #find missed entity occurrences in the text
    for entity_text in entity_type_major.keys():

        if entity_text.isupper():#all-uppercase abbreviation: match case-sensitively
            new_text=ori_text
        else:
            new_text=ori_text.lower()
        ent_eid=0
        while new_text.find(entity_text,ent_eid)>=0:
            ent_sid=new_text.find(entity_text,ent_eid)
            ent_eid=ent_sid+len(entity_text)
            entity_loc=str(ent_sid)+' '+str(ent_eid)
            if entity_loc not in entity_loc_set:
                # Only accept matches on word boundaries (neighbouring
                # characters must be non-alphanumeric or text edges).
                if ent_sid>0 and ent_eid<len(new_text):
                    if new_text[ent_sid-1].isalnum()==False and new_text[ent_eid].isalnum()==False:
                        final_result[str(ent_sid)+'\t'+str(ent_eid)+'\t'+ori_text[ent_sid:ent_eid]+'\t'+entity_type_major[entity_text]]=[ent_sid,ent_eid]
                        entity_loc_set.add(entity_loc)
                elif ent_sid==0 and ent_eid<len(new_text):
                    if new_text[ent_eid].isalnum()==False:
                        final_result[str(ent_sid)+'\t'+str(ent_eid)+'\t'+ori_text[ent_sid:ent_eid]+'\t'+entity_type_major[entity_text]]=[ent_sid,ent_eid]
                        entity_loc_set.add(entity_loc)
                elif ent_sid>0 and ent_eid==len(new_text):
                    if new_text[ent_sid-1].isalnum()==False :
                        final_result[str(ent_sid)+'\t'+str(ent_eid)+'\t'+ori_text[ent_sid:ent_eid]+'\t'+entity_type_major[entity_text]]=[ent_sid,ent_eid]
                        entity_loc_set.add(entity_loc)

    if len(final_result)!=len(ner_result):#new entities were added: sort and remove overlapping
        final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
        mention_list=[]
        for ele in final_result:
            mention_list.append(ele[0].split('\t'))
        final_ner_result=combine_overlap(mention_list)
    else:
        final_ner_result=ner_result
    return final_ner_result
468
+
469
def combine_overlap(mention_list):
    """Resolve overlapping mentions, keeping the longest of each nested group.

    Mentions are scanned in start-offset order; mentions whose start lies
    before the running maximum end offset are grouped, and each group is
    reduced to its longest member via find_max_entity.

    :param mention_list: [[start, end, text, type], ...] sorted by offset,
        with offsets stored as strings
    :return: list of non-overlapping mentions
    """
    entity_list = []
    # BUGFIX: the original tested ``len(mention_list) > 2``, which skipped
    # overlap resolution entirely for lists of exactly two mentions.
    if len(mention_list) > 1:

        first_entity = mention_list[0]
        nest_list = [first_entity]
        max_eid = int(first_entity[1])
        for i in range(1, len(mention_list)):
            segs = mention_list[i]
            if int(segs[0]) >= max_eid:
                # current mention starts after the open group ends: flush it
                if len(nest_list) == 1:
                    entity_list.append(nest_list[0])
                else:
                    entity_list.append(find_max_entity(nest_list))  # keep longest
                nest_list = [segs]
                if int(segs[1]) > max_eid:
                    max_eid = int(segs[1])
            else:
                # overlaps the open group: accumulate
                nest_list.append(segs)
                if int(segs[1]) > max_eid:
                    max_eid = int(segs[1])
        if nest_list != []:
            # flush the final group
            if len(nest_list) == 1:
                entity_list.append(nest_list[0])
            else:
                entity_list.append(find_max_entity(nest_list))  # keep longest
    else:
        entity_list = mention_list

    return entity_list
509
+
510
def find_max_entity(nest_list):
    """Return the mention covering the most characters in *nest_list*.

    Ties keep the earliest mention; an empty list (or one whose spans are
    all zero-width) yields the initial accumulator ``[]``, matching the
    original behaviour.
    """
    best = []
    widest = 0
    for seg in nest_list:
        width = int(seg[1]) - int(seg[0])
        if width > widest:
            widest, best = width, seg
    return best
520
+
521
+
522
+
523
+
524
if __name__ == '__main__':

    # Batch post-processing of a PubTator-style prediction file: each doc
    # is "pmid|t|title", "pmid|a|abstract", then one tab-separated entity
    # line per mention; docs are separated by blank lines.
    path = '//panfs/pan1/bionlplab/luol2/PubTator3/example/post-out/'
    # BUGFIX: the input file was never closed in the original; both files
    # are now managed with ``with`` blocks.
    with open(path + 'PubmedBERT-CRF-AIO_ALL.test_preds', 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')
    with open(path + 'PubmedBERT-CRF-AIO_ALL-post4.test_preds', 'w', encoding='utf-8') as fout:
        for doc in all_in:
            lines = doc.split('\n')
            pmid = lines[0].split('|t|')[0]
            # title + ' ' + abstract reconstructs the offset space used by NER
            ori_text = lines[0].split('|t|')[1] + ' ' + lines[1].split('|a|')[1]
            ner_result = {}
            for i in range(2, len(lines)):
                seg = lines[i].split('\t')
                ner_result[seg[1] + ' ' + seg[2]] = seg[1:]
            # abbreviation-based correction
            final_ner = postprocess_abbr(ner_result, ori_text)
            # document-level entity-type consistency
            final_ner = entity_consistency(final_ner, ori_text)
            fout.write(lines[0] + '\n' + lines[1] + '\n')
            for ele in final_ner:
                fout.write(pmid + '\t' + '\t'.join(ele) + '\n')
            fout.write('\n')