PhenoTagger-Demo / src /abbre_resolution.py
lingbionlp's picture
Upload 23 files
ae5152f
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 16:52:40 2020
@author: luol2
"""
import logging
import regex
import sys
import io
"""
A Python 3 refactoring of Vincent Van Asch's Python 2 code at
http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py
Based on
A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text
A. Schwartz and M. Hearst
Biocomputing, 2003, pp 451-462.
"""
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
log = logging.getLogger('Abbre')
class Candidate(str):
def __init__(self, value):
super().__init__()
self.start = 0
self.stop = 0
def set_position(self, start, stop):
self.start = start
self.stop = stop
def yield_lines_from_file(file_path):
with open(file_path, 'rb') as f:
for line in f:
try:
line = line.decode('utf-8')
except UnicodeDecodeError:
line = line.decode('latin-1').encode('utf-8').decode('utf-8')
line = line.strip()
yield line
f.close()
def yield_lines_from_doc(doc_text):
for line in doc_text.split("\n"):
yield line.strip()
def best_candidates(sentence):
"""
:param sentence: line read from input file
:return: a Candidate iterator
"""
if '(' in sentence:
# Check some things first
if sentence.count('(') != sentence.count(')'):
raise ValueError("Unbalanced parentheses: {}".format(sentence))
if sentence.find('(') > sentence.find(')'):
raise ValueError("First parentheses is right: {}".format(sentence))
closeindex = -1
while 1:
# Look for open parenthesis
openindex = sentence.find('(', closeindex + 1)
if openindex == -1: break
# Look for closing parentheses
closeindex = openindex + 1
open = 1
skip = False
while open:
try:
char = sentence[closeindex]
except IndexError:
# We found an opening bracket but no associated closing bracket
# Skip the opening bracket
skip = True
break
if char == '(':
open += 1
elif char in [')', ';', ':']:
open -= 1
closeindex += 1
if skip:
closeindex = openindex + 1
continue
# Output if conditions are met
start = openindex + 1
stop = closeindex - 1
candidate = sentence[start:stop]
# Take into account whitespace that should be removed
start = start + len(candidate) - len(candidate.lstrip())
stop = stop - len(candidate) + len(candidate.rstrip())
candidate = sentence[start:stop]
if conditions(candidate):
new_candidate = Candidate(candidate)
new_candidate.set_position(start, stop)
yield new_candidate
def conditions(candidate):
"""
Based on Schwartz&Hearst
2 <= len(str) <= 10
len(tokens) <= 2
re.search('\p{L}', str)
str[0].isalnum()
and extra:
if it matches (\p{L}\.?\s?){2,}
it is a good candidate.
:param candidate: candidate abbreviation
:return: True if this is a good candidate
"""
viable = True
if regex.match('(\p{L}\.?\s?){2,}', candidate.lstrip()):
viable = True
if len(candidate) < 2 or len(candidate) > 10:
viable = False
if len(candidate.split()) > 2:
viable = False
if not regex.search('\p{L}', candidate):
viable = False
if not candidate[0].isalnum():
viable = False
return viable
def get_definition(candidate, sentence):
"""
Takes a candidate and a sentence and returns the definition candidate.
The definintion candidate is the set of tokens (in front of the candidate)
that starts with a token starting with the first character of the candidate
:param candidate: candidate abbreviation
:param sentence: current sentence (single line from input file)
:return: candidate definition for this abbreviation
"""
# Take the tokens in front of the candidate
tokens = regex.split(r'[\s\-]+', sentence[:candidate.start - 2].lower())
#print(tokens)
# the char that we are looking for
key = candidate[0].lower()
# Count the number of tokens that start with the same character as the candidate
# print(tokens)
firstchars = [t[0] for t in tokens]
# print(firstchars)
definition_freq = firstchars.count(key)
candidate_freq = candidate.lower().count(key)
# Look for the list of tokens in front of candidate that
# have a sufficient number of tokens starting with key
if candidate_freq <= definition_freq:
# we should at least have a good number of starts
count = 0
start = 0
startindex = len(firstchars) - 1
while count < candidate_freq:
if abs(start) > len(firstchars):
raise ValueError("candiate {} not found".format(candidate))
start -= 1
# Look up key in the definition
try:
startindex = firstchars.index(key, len(firstchars) + start)
except ValueError:
pass
# Count the number of keys in definition
count = firstchars[startindex:].count(key)
# We found enough keys in the definition so return the definition as a definition candidate
start = len(' '.join(tokens[:startindex]))
stop = candidate.start - 1
candidate = sentence[start:stop]
# Remove whitespace
start = start + len(candidate) - len(candidate.lstrip())
stop = stop - len(candidate) + len(candidate.rstrip())
candidate = sentence[start:stop]
new_candidate = Candidate(candidate)
new_candidate.set_position(start, stop)
#print('new_candidate:')
#print(new_candidate,start,stop)
return new_candidate
else:
raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate')
def select_definition(definition, abbrev):
"""
Takes a definition candidate and an abbreviation candidate
and returns True if the chars in the abbreviation occur in the definition
Based on
A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst
:param definition: candidate definition
:param abbrev: candidate abbreviation
:return:
"""
if len(definition) < len(abbrev):
raise ValueError('Abbreviation is longer than definition')
if abbrev in definition.split():
raise ValueError('Abbreviation is full word of definition')
sindex = -1
lindex = -1
while 1:
try:
longchar = definition[lindex].lower()
except IndexError:
raise
shortchar = abbrev[sindex].lower()
if not shortchar.isalnum():
sindex -= 1
if sindex == -1 * len(abbrev):
if shortchar == longchar:
if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
break
else:
lindex -= 1
else:
lindex -= 1
if lindex == -1 * (len(definition) + 1):
raise ValueError("definition {} was not found in {}".format(abbrev, definition))
else:
if shortchar == longchar:
sindex -= 1
lindex -= 1
else:
lindex -= 1
# print('lindex:',lindex,len(definition),definition[lindex:len(definition)])
new_candidate = Candidate(definition[lindex:len(definition)])
new_candidate.set_position(definition.start+lindex+len(definition), definition.stop)
definition = new_candidate
tokens = len(definition.split())
length = len(abbrev)
if tokens > min([length + 5, length * 2]):
raise ValueError("did not meet min(|A|+5, |A|*2) constraint")
# Do not return definitions that contain unbalanced parentheses
if definition.count('(') != definition.count(')'):
raise ValueError("Unbalanced parentheses not allowed in a definition")
# print('select:')
# print(definition,definition.start, definition.stop)
new_definition_dict={'definition':definition,'start':definition.start,'stop':definition.stop}
return new_definition_dict
def extract_abbreviation_definition_pairs(file_path=None, doc_text=None):
abbrev_map = []
omit = 0
written = 0
if file_path:
sentence_iterator = enumerate(yield_lines_from_file(file_path))
elif doc_text:
sentence_iterator = enumerate(yield_lines_from_doc(doc_text))
else:
return abbrev_map
for i, sentence in sentence_iterator:
#print(sentence)
try:
for candidate in best_candidates(sentence):
#print(candidate)
try:
#print('begin get definition')
definition = get_definition(candidate, sentence)
#print('get_definition:')
#print(definition)
except (ValueError, IndexError) as e:
#log.debug("{} Omitting candidate {}. Reason: {}".format(i, candidate, e.args[0]))
omit += 1
else:
try:
definition_dict = select_definition(definition, candidate)
except (ValueError, IndexError) as e:
#log.debug("{} Omitting definition {} for candidate {}. Reason: {}".format(i, definition_dict, candidate, e.args[0]))
omit += 1
else:
definition_dict['abbre']=candidate
abbrev_map.append(definition_dict)
written += 1
except (ValueError, IndexError) as e:
log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0]))
log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit))
return abbrev_map
def postprocess_abbr(ner_result,ori_text):
final_result={}
if len(ner_result)==0:
return []
# abbr recognition
abbr_result=extract_abbreviation_definition_pairs(doc_text=ori_text)
# read ner results
nor_loc_list={} #{entity_name_location:entity_information}
for ele in ner_result:
nor_loc_list[str(ele[0])+' '+str(ele[1])]=ele
final_result['\t'.join(ele)]=[int(ele[0]),int(ele[1])]
#abbr matching
for abbr in abbr_result:
abbr_index=str(abbr['start'])+' '+str(abbr['stop'])
if abbr_index in nor_loc_list.keys():
line=ori_text
abbr_text=abbr['abbre']
abbr_eid=0
while line.find(abbr_text)>=0:
abbr_sid=line.find(abbr_text)+abbr_eid
abbr_eid=abbr_sid+len(abbr_text)
# print(abbr_sid,abbr_eid)
if abbr_sid>0 and abbr_eid<len(ori_text):
if ori_text[abbr_sid-1].isalnum()==False and ori_text[abbr_eid].isalnum()==False:
final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
elif abbr_sid==0 and abbr_eid<len(ori_text):
if ori_text[abbr_eid].isalnum()==False:
final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
elif abbr_sid>0 and abbr_eid==len(ori_text):
if ori_text[abbr_sid-1].isalnum()==False :
final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+nor_loc_list[abbr_index][2]+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
line=ori_text[abbr_eid:]
# print(final_result)
sorted_final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
final_result=[]
for ele in sorted_final_result:
final_result.append(ele[0].split('\t'))
return final_result
def ner_abbr(ner_result,abbr_result,ori_text):
# read ner results
nor_name_list={} #{entity_name:entity_information}
nor_loc_list={} #{entity_name_location:entity_information}
final_result={} #{entity_information:location} use to sort
for ele in ner_result:
temp_seg=ele.split('\t')
nor_loc_list[temp_seg[0]+' '+temp_seg[1]]=temp_seg
nor_name_list[temp_seg[2].lower()]=temp_seg
final_result['\t'.join(temp_seg[0:4])]=[int(temp_seg[0]),int(temp_seg[1])]
#abbr matching
for abbr in abbr_result:
abbr_index=str(abbr['start'])+' '+str(abbr['stop'])
if abbr_index in nor_loc_list.keys():
line=ori_text
abbr_text=abbr['abbre']
abbr_eid=0
while line.find(abbr_text)>=0:
abbr_sid=line.find(abbr_text)+abbr_eid
abbr_eid=abbr_sid+len(abbr_text)
# print(abbr_sid,abbr_eid)
if abbr_sid>0 and abbr_eid<len(ori_text):
if ori_text[abbr_sid-1].isalnum()==False and ori_text[abbr_eid].isalnum()==False:
final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
elif abbr_sid==0 and abbr_eid<len(ori_text):
if ori_text[abbr_eid].isalnum()==False:
final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
elif abbr_sid>0 and abbr_eid==len(ori_text):
if ori_text[abbr_sid-1].isalnum()==False :
final_result[str(abbr_sid)+'\t'+str(abbr_eid)+'\t'+abbr_text+'\t'+nor_loc_list[abbr_index][3]]=[abbr_sid,abbr_eid]
line=ori_text[abbr_eid:]
# print(final_result)
final_result=sorted(final_result.items(), key=lambda kv:(kv[1]), reverse=False)
return final_result
if __name__ == '__main__':
path='//panfs/pan1/bionlp/lulab/luoling/HPO_project/diseaseTag/data/test/results/'
fin=open(path+'NCBI_test_phecr_95.tsv','r',encoding='utf-8')
context=fin.read().strip().split('\n\n')
fin.close()
fout=open(path+'NCBI_test_phecr_abbre_95.tsv','w',encoding='utf-8')
for doc in context:
lines=doc.split('\n')
ori_text=lines[1]
# print(ori_text)
fout.write(lines[0]+'\n'+lines[1]+'\n')
if len(lines)>2:
abbr_result=extract_abbreviation_definition_pairs(doc_text=ori_text)
print(abbr_result)
abbr_out=ner_abbr(lines[2:],abbr_result,ori_text)
else:
abbr_out=[]
# print('final:',abbr_out)
for ele in abbr_out:
fout.write(ele[0]+'\n')
fout.write('\n')
# sys.exit()
fout.close()
#last_out=combine_ml_dict_fn(abbr_out,infile)
#print(last_out)