gabrielandrade2 committed
Commit c837b79
1 Parent(s): cd5ed10

Add normalization methods

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 model.safetensors filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
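For reference, the appended pattern is what `git lfs track "*.csv"` writes to `.gitattributes`; it routes the dictionary CSVs added in this commit through Git LFS instead of plain git storage:

```
git lfs track "*.csv"
```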
EntityNormalizer.py ADDED
@@ -0,0 +1,51 @@
+import mojimoji
+import pandas as pd
+from rapidfuzz import fuzz, process
+
+
+class EntityDictionary:
+
+    def __init__(self, path):
+        self.df = pd.read_csv(path)
+
+    def get_candidates_list(self):
+        return self.df.iloc[:, 0].to_list()
+
+    def get_normalization_list(self):
+        return self.df.iloc[:, 2].to_list()
+
+    def get_normalized_term(self, term):
+        return self.df[self.df.iloc[:, 0] == term].iloc[:, 2].item()
+
+
+class DiseaseDict(EntityDictionary):
+
+    def __init__(self):
+        super().__init__('dictionaries/disease_dict.csv')
+
+
+class DrugDict(EntityDictionary):
+
+    def __init__(self):
+        super().__init__('dictionaries/drug_dict.csv')
+
+
+class EntityNormalizer:
+
+    def __init__(self, database: EntityDictionary, matching_method=fuzz.ratio, matching_threshold=0):
+        self.database = database
+        self.matching_method = matching_method
+        self.matching_threshold = matching_threshold
+        self.candidates = [mojimoji.han_to_zen(x) for x in self.database.get_candidates_list()]
+
+    def normalize(self, term):
+        term = mojimoji.han_to_zen(term)
+        preferred_candidate = process.extractOne(term, self.candidates, scorer=self.matching_method)
+        score = preferred_candidate[1]
+
+        if score > self.matching_threshold:
+            ret = self.database.get_normalized_term(preferred_candidate[0])
+            return ('' if pd.isna(ret) else ret), score
+        else:
+            return '', score
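For context, a minimal usage sketch of the normalizer defined above (it assumes the LFS-tracked `dictionaries/disease_dict.csv` has been pulled; the sample term, threshold, and expected output are illustrative):

```python
from rapidfuzz import fuzz

from EntityNormalizer import DiseaseDict, EntityNormalizer

# Fuzzy-match a surface form against the disease dictionary and return
# its standard form (an ICD code) together with the match score.
normalizer = EntityNormalizer(DiseaseDict(), matching_method=fuzz.ratio, matching_threshold=50)
norm, score = normalizer.normalize('肥大型心筋症')
print(norm, score)  # e.g. "I422 100.0" for an exact dictionary hit
```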
README.md CHANGED
@@ -1,15 +1,15 @@
 ---
-language:
-- ja
+language:
+- ja
 license:
-- cc-by-4.0
+- cc-by-4.0
 tags:
-- NER
-- medical documents
+- NER
+- medical documents
 datasets:
-- MedTxt-CR-JA-training-v2.xml
+- MedTxt-CR-JA-training-v2.xml
 metrics:
-- NTCIR-16 Real-MedNLP subtask 1
+- NTCIR-16 Real-MedNLP subtask 1
 ---
 
 
@@ -18,18 +18,45 @@ This is a model for named entity recognition of Japanese medical documents.
 ### How to use
 
 Download the following five files and put them into the same folder.
+
 - id_to_tags.pkl
 - key_attr.pkl
 - NER_medNLP.py
 - predict.py
 - text.txt (the input file to be tagged; replace its contents with your own text)
 
-You can use this model by running predict.py.
+You can use this model by running `predict.py`.
 
 ```
 python3 predict.py
 ```
 
+#### Entity normalization
+
+This model supports entity normalization via dictionary matching. The dictionary is a list of medical terms or
+drugs and their standard forms.
+
+Two different dictionaries are used for drug and disease normalization, stored in the `dictionaries` folder as
+`drug_dict.csv` and `disease_dict.csv`, respectively.
+
+To enable normalization, add the `--normalize` flag to the `predict.py` command:
+
+```
+python3 predict.py --normalize
+```
+
+Normalization adds a `norm` attribute to the output XML tags. This attribute may be empty if no normalized form of
+the term is found.
+
+The provided disease normalization dictionary (`dictionaries/disease_dict.csv`) is based on the [Manbyo Dictionary](https://sociocom.naist.jp/manbyo-dic-en/) and normalizes diseases to their standard ICD codes.
+
+The default drug dictionary (`dictionaries/drug_dict.csv`) is based on the [Hyakuyaku Dictionary](https://sociocom.naist.jp/hyakuyaku-dic-en/).
+
+The dictionary is a CSV file with three columns: the first column is the surface-form term and the third column
+contains its standard form. The second column is not used.
+
+Users can freely change the dictionaries to fit their needs, as long as the format and filenames are kept.
+
 ### Input Example
 
 ```
@@ -40,10 +67,9 @@ python3 predict.py
 ### Output Example
 
 ```
-<d certainty="positive">肥大型心筋症、心房細動</d>に対して<m-key state="executed">WF</m-key>投与が開始となった。
-<timex3 type="med">治療経過中</timex3>に<d certainty="positive">非持続性心室頻拍</d>が認められたため<m-key state="executed">アミオダロン</m-key>が併用となった。
+<d certainty="positive" norm="I422">肥大型心筋症、心房細動</d>に対して<m-key state="executed" norm="ワルファリンカリウム">WF</m-key>投与が開始となった。
+<timex3 type="med">治療経過中</timex3>に<d certainty="positive" norm="I472">非持続性心室頻拍</d>が認められたため<m-key state="executed" norm="アミオダロン塩酸塩">アミオダロン</m-key>が併用となった。
 ```
 
 ### Publication
 
-Tomohiro Nishiyama, Aki Ando, Mihiro Nishidani, Shuntaro Yada, Shoko Wakamiya, Eiji Aramaki: NAISTSOC at the NTCIR-16 Real-MedNLP Task, In Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies (NTCIR-16), pp. 330-333, 2022
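To make the expected dictionary layout concrete, a hypothetical `disease_dict.csv` excerpt is shown below. The header names and the empty middle column are assumptions (the real files are LFS-tracked and not visible here); the ICD codes mirror the output example in the README, and `pd.read_csv` in `EntityNormalizer.py` consumes the first row as a header:

```
surface,unused,normalized
肥大型心筋症,,I422
非持続性心室頻拍,,I472
```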
 
dictionaries/disease_dict.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e104832f7bc912497936c11c7196f7a7949a5e69d9414f47a6a6a3bf7caec6b
+size 20832536
dictionaries/drug_dict.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab7b62baab746dc53ef6909840dfc2f8d2a80e591c930fbb0a035907082a4bec
+size 5442740
predict.py CHANGED
@@ -1,54 +1,49 @@
-
 # %%
+import argparse
+
 from tqdm import tqdm
 import unicodedata
 import re
 import pickle
 import torch
 import NER_medNLP as ner
-from bs4 import BeautifulSoup
-
-
-# import from_XML_to_json as XtC
-# import itertools
-# import random
-# import json
-# from torch.utils.data import DataLoader
-# from transformers import BertJapaneseTokenizer, BertForTokenClassification
-# import pytorch_lightning as pl
-# import pandas as pd
-# import numpy as np
-# import codecs
+
+from EntityNormalizer import EntityNormalizer, DiseaseDict, DrugDict
+
 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 
-#%% used as a global variable
+# %% used as a global variable
 dict_key = {}
 
-#%%
+
+# %%
 def to_xml(data):
     with open("key_attr.pkl", "rb") as tf:
         key_attr = pickle.load(tf)
 
     text = data['text']
     count = 0
     for i, entities in enumerate(data['entities_predicted']):
         if entities == "":
             return
         span = entities['span']
         type_id = id_to_tags[entities['type_id']].split('_')
         tag = type_id[0]
 
         if not type_id[1] == "":
             attr = ' ' + value_to_key(type_id[1], key_attr) + '=' + '"' + type_id[1] + '"'
         else:
             attr = ""
 
+        if 'norm' in entities:
+            attr = attr + ' norm="' + str(entities['norm']) + '"'
+
         add_tag = "<" + str(tag) + str(attr) + ">"
-        text = text[:span[0]+count] + add_tag + text[span[0]+count:]
+        text = text[:span[0] + count] + add_tag + text[span[0] + count:]
         count += len(add_tag)
 
         add_tag = "</" + str(tag) + ">"
-        text = text[:span[1]+count] + add_tag + text[span[1]+count:]
+        text = text[:span[1] + count] + add_tag + text[span[1] + count:]
         count += len(add_tag)
     return text
 
@@ -58,18 +53,18 @@ def predict_entities(modelpath, sentences_list, len_num_entity_type):
     #     checkpoint_path = modelpath + ".ckpt"
     # )
     # bert_tc = model.bert_tc.cuda()
 
     model = ner.BertForTokenClassification_pl(modelpath, num_labels=81, lr=1e-5)
     bert_tc = model.bert_tc.to(device)
 
     MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
     tokenizer = ner.NER_tokenizer_BIO.from_pretrained(
         MODEL_NAME,
-        num_entity_type = len_num_entity_type # Don't forget to update the number of entity types!
+        num_entity_type=len_num_entity_type  # Don't forget to update the number of entity types!
     )
 
-    #entities_list = [] # gold named entities would be accumulated here
-    entities_predicted_list = [] # accumulates the extracted named entities
+    # entities_list = []  # gold named entities would be accumulated here
+    entities_predicted_list = []  # accumulates the extracted named entities
 
     text_entities_set = []
     for dataset in sentences_list:
@@ -79,24 +74,25 @@ def predict_entities(modelpath, sentences_list, len_num_entity_type):
             encoding, spans = tokenizer.encode_plus_untagged(
                 text, return_tensors='pt'
             )
-            encoding = { k: v.to(device) for k, v in encoding.items() }
+            encoding = {k: v.to(device) for k, v in encoding.items()}
 
             with torch.no_grad():
                 output = bert_tc(**encoding)
                 scores = output.logits
                 scores = scores[0].cpu().numpy().tolist()
 
             # Convert the classification scores into named entities
             entities_predicted = tokenizer.convert_bert_output_to_entities(
                 text, scores, spans
             )
 
-            #entities_list.append(sample['entities'])
+            # entities_list.append(sample['entities'])
            entities_predicted_list.append(entities_predicted)
             text_entities.append({'text': text, 'entities_predicted': entities_predicted})
         text_entities_set.append(text_entities)
     return text_entities_set
 
+
 def combine_sentences(text_entities_set, insert: str):
     documents = []
     for text_entities in tqdm(text_entities_set):
@@ -106,25 +102,51 @@ def combine_sentences(text_entities_set, insert: str):
         documents.append('\n'.join(document))
     return documents
 
-def value_to_key(value, key_attr):# get the attribute name from an attribute value
+
+def value_to_key(value, key_attr):  # get the attribute name from an attribute value
     global dict_key
     if dict_key.get(value) != None:
         return dict_key[value]
     for k in key_attr.keys():
         for v in key_attr[k]:
             if value == v:
-                dict_key[v]=k
+                dict_key[v] = k
                 return k
 
+
 # %%
+def normalize_entities(text_entities_set):
+    disease_normalizer = EntityNormalizer(DiseaseDict(), matching_threshold=50)
+    drug_normalizer = EntityNormalizer(DrugDict(), matching_threshold=50)
+
+    for entry in text_entities_set:
+        for text_entities in entry:
+            entities = text_entities['entities_predicted']
+            for entity in entities:
+                tag = id_to_tags[entity['type_id']].split('_')[0]
+
+                normalizer = drug_normalizer if tag == 'm-key' \
+                    else disease_normalizer if tag == 'd' \
+                    else None
+
+                if normalizer is None:
+                    continue
+
+                normalization, score = normalizer.normalize(entity['name'])
+                entity['norm'] = str(normalization)
+
+
 if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Predict entities from text')
+    parser.add_argument('--normalize', action=argparse.BooleanOptionalAction, help='Enable entity normalization')
+    args = parser.parse_args()
+
     with open("id_to_tags.pkl", "rb") as tf:
         id_to_tags = pickle.load(tf)
     with open("key_attr.pkl", "rb") as tf:
         key_attr = pickle.load(tf)
     with open('text.txt') as f:
         articles_raw = f.read()
-
 
     article_norm = unicodedata.normalize('NFKC', articles_raw)
 
@@ -133,10 +155,11 @@ if __name__ == '__main__':
 
     text_entities_set = predict_entities("sociocom/RealMedNLP_CR_JA", [sentences_norm], len(id_to_tags))
 
-
     for i, texts_ent in enumerate(text_entities_set[0]):
         texts_ent['text'] = sentences_raw[i]
 
+    if args.normalize:
+        normalize_entities(text_entities_set)
 
     documents = combine_sentences(text_entities_set, '\n')
 
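Note that `argparse.BooleanOptionalAction`, used for the new `--normalize` flag, requires Python 3.9 or newer and automatically provides a complementary `--no-normalize` flag.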
requirements.txt ADDED
@@ -0,0 +1,45 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+async-timeout==4.0.2
+attrs==22.2.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+et-xmlfile==1.1.0
+filelock==3.11.0
+frozenlist==1.3.3
+fsspec==2023.4.0
+fugashi==1.2.1
+huggingface-hub==0.13.4
+idna==3.4
+ipadic==1.0.0
+Jinja2==3.1.2
+Levenshtein==0.20.9
+lightning-utilities==0.8.0
+MarkupSafe==2.1.2
+mojimoji==0.0.12
+mpmath==1.3.0
+multidict==6.0.4
+networkx==3.1
+numpy==1.24.2
+openpyxl==3.1.2
+packaging==23.0
+pandas==2.0.0
+python-dateutil==2.8.2
+pytorch-lightning==2.0.1.post0
+pytz==2023.3
+PyYAML==6.0
+rapidfuzz==2.15.1
+regex==2023.3.23
+requests==2.28.2
+six==1.16.0
+soupsieve==2.4
+sympy==1.11.1
+tokenizers==0.13.3
+torch==2.0.0
+torchmetrics==0.11.4
+tqdm==4.65.0
+transformers==4.27.4
+typing_extensions==4.5.0
+tzdata==2023.3
+urllib3==1.26.15
+yarl==1.8.2
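The pins above can be installed in one step (a fresh virtual environment is assumed):

```
pip install -r requirements.txt
```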