|
import mojimoji |
|
import pandas as pd |
|
from rapidfuzz import fuzz, process |
|
|
|
|
|
class EntityDictionary:
    """Tabular dictionary mapping candidate surface forms to normalized terms.

    The table is loaded from a CSV file with ``pandas.read_csv``. Both columns
    are addressed by integer position, since they are used with ``iloc``.
    """

    def __init__(self, path, candidate_column, normalization_column):
        """Load the dictionary file and remember which columns to use.

        Args:
            path: Location of the CSV file, handed to ``pandas.read_csv``.
            candidate_column: Positional index of the column holding the
                terms that inputs are matched against.
            normalization_column: Positional index of the column holding
                the normalized form of each candidate.

        Raises:
            ValueError: If any of the three arguments is ``None``.
        """
        if path is None:
            raise ValueError('Path to dictionary file is not specified.')
        if candidate_column is None:
            raise ValueError('Candidate column is not specified.')
        if normalization_column is None:
            raise ValueError('Normalization column is not specified.')

        self.df = pd.read_csv(path)
        self.candidate_column = candidate_column
        self.normalization_column = normalization_column

    def get_candidates_list(self):
        """Return every value of the candidate column, in row order."""
        return self.df.iloc[:, self.candidate_column].to_list()

    def get_normalization_list(self):
        """Return every value of the normalization column, in row order."""
        return self.df.iloc[:, self.normalization_column].to_list()

    def get_normalized_term(self, term):
        """Return the normalized form stored for ``term``.

        When ``term`` occurs in several rows, the value from the first of
        those rows is returned.

        Args:
            term: A value expected to be present in the candidate column.

        Returns:
            The normalization-column value of the first row whose candidate
            equals ``term`` (this may be a missing value such as NaN).

        Raises:
            ValueError: If no row has ``term`` as its candidate.
        """
        is_match = self.df.iloc[:, self.candidate_column] == term
        matches = self.df[is_match].iloc[:, self.normalization_column]
        if matches.empty:
            raise ValueError(
                'Term {!r} is not present in the dictionary.'.format(term)
            )
        # Series.item() only accepts exactly one element, so narrow the
        # selection to the first matching row before extracting the scalar.
        return matches.iloc[:1].item()
|
|
|
|
|
class DefaultDiseaseDict(EntityDictionary):
    """Entity dictionary preconfigured for ``dictionaries/disease_dict.csv``.

    Candidates are taken from column 0 and normalized forms from column 2.
    """

    def __init__(self):
        # The path is relative, so it is resolved against the current
        # working directory when the CSV is read.
        super().__init__(
            path='dictionaries/disease_dict.csv',
            candidate_column=0,
            normalization_column=2,
        )
|
|
|
|
|
class DefaultDrugDict(EntityDictionary):
    """Entity dictionary preconfigured for ``dictionaries/drug_dict.csv``.

    Candidates are taken from column 0 and normalized forms from column 2.
    """

    def __init__(self):
        # The path is relative, so it is resolved against the current
        # working directory when the CSV is read.
        super().__init__(
            path='dictionaries/drug_dict.csv',
            candidate_column=0,
            normalization_column=2,
        )
|
|
|
|
|
class EntityNormalizer:
    """Normalize free-text terms by fuzzy matching against a dictionary.

    Both the dictionary candidates and the incoming term are passed through
    ``mojimoji.han_to_zen`` before they are compared.
    """

    def __init__(self, database: EntityDictionary, matching_method=fuzz.ratio, matching_threshold=0):
        """Prepare the candidate list used for matching.

        Args:
            database: Dictionary providing the candidates and their
                normalized forms.
            matching_method: Scorer passed to ``process.extractOne``.
            matching_threshold: A match is accepted only when its score is
                strictly greater than this value.
        """
        self.database = database
        self.matching_method = matching_method
        self.matching_threshold = matching_threshold

        raw_candidates = self.database.get_candidates_list()
        self.candidates = [mojimoji.han_to_zen(x) for x in raw_candidates]

        # The database is keyed by the values exactly as stored in the file,
        # not by their converted form, so remember how to get back from a
        # converted candidate to the stored one. When several stored values
        # convert to the same string, the first one wins.
        self._raw_by_candidate = {}
        for converted, raw in zip(self.candidates, raw_candidates):
            self._raw_by_candidate.setdefault(converted, raw)

    def normalize(self, term):
        """Return the normalized form of ``term`` and the matching score.

        Args:
            term: The string to normalize.

        Returns:
            A ``(normalized, score)`` tuple. ``normalized`` is ``''`` when
            the best score does not exceed the threshold, when the dictionary
            stores a missing value for the matched candidate, or when there
            is nothing to match against (in which case ``score`` is ``0``).
        """
        term = mojimoji.han_to_zen(term)
        best = process.extractOne(term, self.candidates, scorer=self.matching_method)

        # extractOne yields None when no candidate could be selected,
        # e.g. for an empty candidate list.
        if best is None:
            return '', 0

        matched, score = best[0], best[1]
        if score <= self.matching_threshold:
            return '', score

        # Query the database with the stored spelling; the converted string
        # would not be found whenever the conversion changed the value.
        stored_term = self._raw_by_candidate.get(matched, matched)
        ret = self.database.get_normalized_term(stored_term)
        return ('' if pd.isna(ret) else ret), score
|
|