MedNERN-CR-JA / EntityNormalizer.py
gabrielandrade2's picture
Update model with additional negative examples, improve support scripts
576d564
raw history blame
No virus
2.04 kB
import mojimoji
import pandas as pd
from rapidfuzz import fuzz, process
class EntityDictionary:
    """Tabular dictionary mapping candidate surface forms to normalized terms.

    The table is loaded from a CSV file with ``pandas.read_csv`` and its
    columns are addressed by integer position.
    """

    def __init__(self, path, candidate_column, normalization_column):
        """Load the dictionary table.

        Args:
            path: Location of the CSV file, passed to ``pandas.read_csv``.
            candidate_column: Positional index of the column holding the
                surface forms to match against.
            normalization_column: Positional index of the column holding
                the normalized form of each surface form.

        Raises:
            ValueError: If any of the three arguments is ``None``.
        """
        if path is None:
            raise ValueError('Path to dictionary file is not specified.')
        if candidate_column is None:
            raise ValueError('Candidate column is not specified.')
        if normalization_column is None:
            raise ValueError('Normalization column is not specified.')

        self.df = pd.read_csv(path)
        self.candidate_column = candidate_column
        self.normalization_column = normalization_column

    def get_candidates_list(self):
        """Return every value of the candidate column, in row order."""
        return self.df.iloc[:, self.candidate_column].to_list()

    def get_normalization_list(self):
        """Return every value of the normalization column, in row order."""
        return self.df.iloc[:, self.normalization_column].to_list()

    def get_normalized_term(self, term):
        """Return the normalized form stored for the surface form ``term``.

        When the same surface form appears on several rows, the first row
        wins. The returned value may be NaN if the normalization cell is
        empty in the table.

        Raises:
            ValueError: If ``term`` is not present in the candidate column.
        """
        rows = self.df[self.df.iloc[:, self.candidate_column] == term]
        if rows.empty:
            raise ValueError(f'Term {term!r} was not found in the dictionary.')
        # Slice to a single row before .item(): it only accepts size-1 data,
        # so duplicated surface forms would otherwise raise.
        return rows.iloc[:1, self.normalization_column].item()
class DefaultDiseaseDict(EntityDictionary):
    """Preset dictionary that loads ``dictionaries/disease_dict.csv``.

    Surface forms are taken from column 0 and normalized terms from
    column 2. The path is relative, so it is resolved against the current
    working directory.
    """

    def __init__(self):
        super().__init__(
            path='dictionaries/disease_dict.csv',
            candidate_column=0,
            normalization_column=2,
        )
class DefaultDrugDict(EntityDictionary):
    """Preset dictionary that loads ``dictionaries/drug_dict.csv``.

    Surface forms are taken from column 0 and normalized terms from
    column 2. The path is relative, so it is resolved against the current
    working directory.
    """

    def __init__(self):
        super().__init__(
            path='dictionaries/drug_dict.csv',
            candidate_column=0,
            normalization_column=2,
        )
class EntityNormalizer:
    """Normalize an entity mention by fuzzy-matching it against a dictionary.

    Both the dictionary surface forms and the query are passed through
    ``mojimoji.han_to_zen`` before comparison so that half-width and
    full-width spellings of the same text compare equal.
    """

    def __init__(self, database: EntityDictionary, matching_method=fuzz.ratio, matching_threshold=0):
        """Prepare the candidate list used for matching.

        Args:
            database: Dictionary providing candidates and normalized terms.
            matching_method: Scorer handed to ``rapidfuzz.process.extractOne``.
            matching_threshold: A match is accepted only when its score is
                strictly greater than this value.
        """
        self.database = database
        self.matching_method = matching_method
        self.matching_threshold = matching_threshold

        # Keep the surface forms exactly as stored in the dictionary: the
        # lookup of the normalized term must use the original spelling, not
        # the width-converted one. Missing cells (NaN) are skipped because
        # they are not strings and cannot be width-converted.
        self._raw_candidates = [
            candidate
            for candidate in self.database.get_candidates_list()
            if not pd.isna(candidate)
        ]
        self.candidates = [mojimoji.han_to_zen(candidate) for candidate in self._raw_candidates]

    def normalize(self, term):
        """Return ``(normalized_term, score)`` for the best match of ``term``.

        ``normalized_term`` is the empty string when there are no
        candidates, when the best score does not exceed the threshold, or
        when the dictionary holds no normalized form for the match.
        ``score`` is whatever the configured scorer produced (``0`` when
        there were no candidates to score).
        """
        term = mojimoji.han_to_zen(term)
        best = process.extractOne(term, self.candidates, scorer=self.matching_method)
        if best is None:
            # extractOne yields None when there is nothing to compare against.
            return '', 0

        _, score, index = best
        if score <= self.matching_threshold:
            return '', score

        # self.candidates and self._raw_candidates are index-aligned, so the
        # returned index recovers the spelling the dictionary actually stores.
        normalized = self.database.get_normalized_term(self._raw_candidates[index])
        return ('' if pd.isna(normalized) else normalized), score