gabrielandrade2 committed
Commit 576d564 · Parent: 13dbcb6

Update model with additional negative examples, improve support scripts

Files changed (11):
  1. EntityNormalizer.py +17 -9
  2. NER_medNLP.py +77 -79
  3. README.md +86 -16
  4. config.json +13 -1
  5. id_to_tags.pkl +2 -2
  6. model.safetensors +0 -3
  7. predict.py +79 -42
  8. pytorch_model.bin +2 -2
  9. requirements.txt +30 -33
  10. tokenizer_config.json +1 -1
  11. utils.py +15 -0
EntityNormalizer.py CHANGED
```diff
@@ -5,29 +5,38 @@ from rapidfuzz import fuzz, process
 
 class EntityDictionary:
 
-    def __init__(self, path):
+    def __init__(self, path, candidate_column, normalization_column):
+        if path is None:
+            raise ValueError('Path to dictionary file is not specified.')
+        if candidate_column is None:
+            raise ValueError('Candidate column is not specified.')
+        if normalization_column is None:
+            raise ValueError('Normalization column is not specified.')
+
         self.df = pd.read_csv(path)
+        self.candidate_column = candidate_column
+        self.normalization_column = normalization_column
 
     def get_candidates_list(self):
-        return self.df.iloc[:, 0].to_list()
+        return self.df.iloc[:, self.candidate_column].to_list()
 
     def get_normalization_list(self):
-        return self.df.iloc[:, 2].to_list()
+        return self.df.iloc[:, self.normalization_column].to_list()
 
     def get_normalized_term(self, term):
-        return self.df[self.df.iloc[:, 0] == term].iloc[:, 2].item()
+        return self.df[self.df.iloc[:, self.candidate_column] == term].iloc[:, self.normalization_column].item()
 
 
-class DiseaseDict(EntityDictionary):
+class DefaultDiseaseDict(EntityDictionary):
 
     def __init__(self):
-        super().__init__('dictionaries/disease_dict.csv')
+        super().__init__('dictionaries/disease_dict.csv', 0, 2)
 
 
-class DrugDict(EntityDictionary):
+class DefaultDrugDict(EntityDictionary):
 
     def __init__(self):
-        super().__init__('dictionaries/drug_dict.csv')
+        super().__init__('dictionaries/drug_dict.csv', 0, 2)
 
 
 class EntityNormalizer:
@@ -48,4 +57,3 @@ class EntityNormalizer:
             return ('' if pd.isna(ret) else ret), score
         else:
             return '', score
-
```
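As a quick orientation for this change, a minimal usage sketch of the reworked dictionary interface. The file name `my_terms.csv` and its column layout are illustrative and not part of the commit; the bundled defaults keep using column 0 for surface forms and column 2 for normalized forms.

```python
from EntityNormalizer import DefaultDiseaseDict, EntityDictionary

# Default dictionary: the subclass now passes its column indices (0 and 2) explicitly.
disease_dict = DefaultDiseaseDict()

# Custom dictionary: the caller must state which columns hold surface and normalized forms.
custom_dict = EntityDictionary('my_terms.csv', candidate_column=0, normalization_column=1)

candidates = custom_dict.get_candidates_list()             # surface forms offered to the fuzzy matcher
standard = custom_dict.get_normalized_term(candidates[0])  # normalized form for an exact surface match
```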
NER_medNLP.py CHANGED
```diff
@@ -1,46 +1,47 @@
 # %%
 
 import itertools
-from tqdm import tqdm
+
 import numpy as np
-import torch
-from transformers import BertJapaneseTokenizer, BertForTokenClassification
 import pytorch_lightning as pl
+import torch
+from transformers import BertForTokenClassification, BertJapaneseTokenizer
 
-# from torch.utils.data import DataLoader
-# import from_XML_to_json as XtC
-# import random
-# import json
-# import unicodedata
-# import pandas as pd
 
 # %%
 # 8-16
 # PyTorch Lightningのモデル
 class BertForTokenClassification_pl(pl.LightningModule):
 
-    def __init__(self, model_name, num_labels, lr):
+    def __init__(self, num_labels, model='sociocom/MedNERN-CR-JA', lr=1e-4):
         super().__init__()
+        self.lr = lr
         self.save_hyperparameters()
         self.bert_tc = BertForTokenClassification.from_pretrained(
-            model_name,
-            num_labels=num_labels
+            model,
+            num_labels=num_labels,
+            ignore_mismatched_sizes=True
         )
 
+    @classmethod
+    def from_pretrained_bin(cls, model_path, num_labels):
+        model = cls(num_labels)
+        model.load_state_dict(torch.load(model_path))
+        return model
+
     def training_step(self, batch, batch_idx):
         output = self.bert_tc(**batch)
         loss = output.loss
         self.log('train_loss', loss)
         return loss
 
     def validation_step(self, batch, batch_idx):
         output = self.bert_tc(**batch)
         val_loss = output.loss
         self.log('val_loss', val_loss)
 
     def configure_optimizers(self):
-        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
+        return torch.optim.Adam(self.parameters(), lr=self.lr)
 
 
 # %%
@@ -58,58 +59,58 @@ class NER_tokenizer_BIO(BertJapaneseTokenizer):
         符号化とラベル列の作成を行う。
         """
         # 固有表現の前後でtextを分割し、それぞれのラベルをつけておく。
         splitted = []  # 分割後の文字列を追加していく
         position = 0
 
         for entity in entities:
             start = entity['span'][0]
             end = entity['span'][1]
             label = entity['type_id']
-            splitted.append({'text':text[position:start], 'label':0})
-            splitted.append({'text':text[start:end], 'label':label})
+            splitted.append({'text': text[position:start], 'label': 0})
+            splitted.append({'text': text[start:end], 'label': label})
             position = end
-        splitted.append({'text': text[position:], 'label':0})
-        splitted = [ s for s in splitted if s['text'] ]
+        splitted.append({'text': text[position:], 'label': 0})
+        splitted = [s for s in splitted if s['text']]
 
         # 分割されたそれぞれの文章をトークン化し、ラベルをつける。
         tokens = []  # トークンを追加していく
         labels = []  # ラベルを追加していく
         for s in splitted:
             tokens_splitted = self.tokenize(s['text'])
             label = s['label']
             if label > 0:  # 固有表現
                 # まずトークン全てにI-タグを付与
                 # 番号順O-tag:0, B-tag:1~タグの数,I-tag:タグの数〜
                 labels_splitted = \
-                    [ label + self.num_entity_type ] * len(tokens_splitted)
+                    [label + self.num_entity_type] * len(tokens_splitted)
                 # 先頭のトークンをB-タグにする
                 labels_splitted[0] = label
             else:  # それ以外
                 labels_splitted = [0] * len(tokens_splitted)
 
             tokens.extend(tokens_splitted)
             labels.extend(labels_splitted)
 
         # 符号化を行いBERTに入力できる形式にする。
         input_ids = self.convert_tokens_to_ids(tokens)
         encoding = self.prepare_for_model(
             input_ids,
             max_length=max_length,
             padding='max_length',
             truncation=True
         )
 
         # ラベルに特殊トークンを追加
         # max_lengthで切り取って,その前後に[CLS]と[SEP]を追加するためのラベルを入れる
-        labels = [0] + labels[:max_length-2] + [0]
+        labels = [0] + labels[:max_length - 2] + [0]
         # max_lengthに満たない場合は,満たない分を後ろ側に追加する
-        labels = labels + [0]*( max_length - len(labels) )
+        labels = labels + [0] * (max_length - len(labels))
         encoding['labels'] = labels
 
         return encoding
 
     def encode_plus_untagged(
             self, text, max_length=None, return_tensors=None
     ):
         """
         文章をトークン化し、それぞれのトークンの文章中の位置も特定しておく。
@@ -117,50 +118,50 @@ class NER_tokenizer_BIO(BertJapaneseTokenizer):
         """
         # 文章のトークン化を行い、
         # それぞれのトークンと文章中の文字列を対応づける。
         tokens = []  # トークンを追加していく。
         tokens_original = []  # トークンに対応する文章中の文字列を追加していく。
         words = self.word_tokenizer.tokenize(text)  # MeCabで単語に分割
         for word in words:
             # 単語をサブワードに分割
             tokens_word = self.subword_tokenizer.tokenize(word)
             tokens.extend(tokens_word)
             if tokens_word[0] == '[UNK]':  # 未知語への対応
                 tokens_original.append(word)
             else:
                 tokens_original.extend([
-                    token.replace('##','') for token in tokens_word
+                    token.replace('##', '') for token in tokens_word
                 ])
 
         # 各トークンの文章中での位置を調べる。(空白の位置を考慮する)
         position = 0
         spans = []  # トークンの位置を追加していく。
         for token in tokens_original:
             l = len(token)
             while 1:
-                if token != text[position:position+l]:
+                if token != text[position:position + l]:
                     position += 1
                 else:
-                    spans.append([position, position+l])
+                    spans.append([position, position + l])
                     position += l
                     break
 
         # 符号化を行いBERTに入力できる形式にする。
         input_ids = self.convert_tokens_to_ids(tokens)
         encoding = self.prepare_for_model(
             input_ids,
             max_length=max_length,
             padding='max_length' if max_length else False,
             truncation=True if max_length else False
         )
         sequence_length = len(encoding['input_ids'])
         # 特殊トークン[CLS]に対するダミーのspanを追加。
-        spans = [[-1, -1]] + spans[:sequence_length-2]
+        spans = [[-1, -1]] + spans[:sequence_length - 2]
         # 特殊トークン[SEP]、[PAD]に対するダミーのspanを追加。
-        spans = spans + [[-1, -1]] * ( sequence_length - len(spans) )
+        spans = spans + [[-1, -1]] * (sequence_length - len(spans))
 
         # 必要に応じてtorch.Tensorにする。
         if return_tensors == 'pt':
-            encoding = { k: torch.tensor([v]) for k, v in encoding.items() }
+            encoding = {k: torch.tensor([v]) for k, v in encoding.items()}
 
         return encoding, spans
 
@@ -169,28 +170,26 @@ class NER_tokenizer_BIO(BertJapaneseTokenizer):
         """
         Viterbiアルゴリズムで最適解を求める。
         """
-        m = 2*num_entity_type + 1
+        m = 2 * num_entity_type + 1
         penalty_matrix = np.zeros([m, m])
         for i in range(m):
-            for j in range(1+num_entity_type, m):
-                if not ( (i == j) or (i+num_entity_type == j) ):
-                    penalty_matrix[i,j] = penalty
-        path = [ [i] for i in range(m) ]
-        scores_path = scores_bert[0] - penalty_matrix[0,:]
+            for j in range(1 + num_entity_type, m):
+                if not ((i == j) or (i + num_entity_type == j)):
+                    penalty_matrix[i, j] = penalty
+        path = [[i] for i in range(m)]
+        scores_path = scores_bert[0] - penalty_matrix[0, :]
         scores_bert = scores_bert[1:]
 
-
-
         for scores in scores_bert:
-            assert len(scores) == 2*num_entity_type + 1
-            score_matrix = np.array(scores_path).reshape(-1,1) \
-                + np.array(scores).reshape(1,-1) \
-                - penalty_matrix
+            assert len(scores) == 2 * num_entity_type + 1
+            score_matrix = np.array(scores_path).reshape(-1, 1) \
+                + np.array(scores).reshape(1, -1) \
+                - penalty_matrix
             scores_path = score_matrix.max(axis=0)
             argmax = score_matrix.argmax(axis=0)
             path_new = []
             for i, idx in enumerate(argmax):
-                path_new.append( path[idx] + [i] )
+                path_new.append(path[idx] + [i])
             path = path_new
 
         labels_optimal = path[np.argmax(scores_path)]
@@ -203,26 +202,26 @@ class NER_tokenizer_BIO(BertJapaneseTokenizer):
         """
         assert len(spans) == len(scores)
         num_entity_type = self.num_entity_type
 
         # 特殊トークンに対応する部分を取り除く
-        scores = [score for score, span in zip(scores, spans) if span[0]!=-1]
-        spans = [span for span in spans if span[0]!=-1]
+        scores = [score for score, span in zip(scores, spans) if span[0] != -1]
+        spans = [span for span in spans if span[0] != -1]
 
         # Viterbiアルゴリズムでラベルの予測値を決める。
         labels = self.Viterbi(scores, num_entity_type)
 
         # 同じラベルが連続するトークンをまとめて、固有表現を抽出する。
         entities = []
         for label, group \
                 in itertools.groupby(enumerate(labels), key=lambda x: x[1]):
 
             group = list(group)
             start = spans[group[0][0]][0]
             end = spans[group[-1][0]][1]
 
             if label != 0:  # 固有表現であれば
                 if 1 <= label <= num_entity_type:
                     # ラベルが`B-`ならば、新しいentityを追加
                     entity = {
                         "name": text[start:end],
                         "span": [start, end],
@@ -231,8 +230,7 @@ class NER_tokenizer_BIO(BertJapaneseTokenizer):
                     entities.append(entity)
                 else:
                     # ラベルが`I-`ならば、直近のentityを更新
                     entity['span'][1] = end
                     entity['name'] = text[entity['span'][0]:entity['span'][1]]
-
-        return entities
 
+        return entities
```
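As a reading aid for the new loading path, a minimal sketch of how the reworked `BertForTokenClassification_pl` can be instantiated. The label count of 87 follows from `2 * num_entity_type + 1` with the 43 entity types introduced in this commit; treat the exact numbers and paths as illustrative.

```python
import NER_medNLP as ner

# Load fine-tuned weights from a local checkpoint (new classmethod in this commit).
model = ner.BertForTokenClassification_pl.from_pretrained_bin(
    model_path='pytorch_model.bin', num_labels=2 * 43 + 1)

# Or build a fresh instance; weights come from the hub default and the
# classification head is resized thanks to ignore_mismatched_sizes=True.
model = ner.BertForTokenClassification_pl(num_labels=2 * 43 + 1)

bert_tc = model.bert_tc.eval()  # the underlying BertForTokenClassification
```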
README.md CHANGED
````diff
@@ -12,26 +12,65 @@ metrics:
 - NTCIR-16 Real-MedNLP subtask 1
 ---
 
-
 This is a model for named entity recognition of Japanese medical documents.
 
-### How to use
+# Introduction
+
+This repository contains the base model and a support prediction script that runs the model and produces XML-tagged text output.
+
+The original model was trained on the [MedTxt-CR-JA](https://sociocom.naist.jp/medtxt/cr) dataset, so the provided prediction code outputs XML tags in the same format.
+
+The script also provides a normalization method for the output entities, which is not embedded in the model.
+
+If you want to re-train or update the model, we provide additional support scripts in [this GitHub repository](https://github.com/sociocom/MedNERN-CR-JA).
+Issues and suggestions can also be submitted there.
+
+### A note about loading the model using standard HuggingFace methods
+This model should also be loadable using standard HuggingFace `from_pretrained` methods. However, the model by itself only outputs labels in the format "LABEL_0", "LABEL_1", etc.
+
+The conversion of model outputs to the actual labels ("<m-key>", "<m-val>", "<timex3>", etc.) is not yet embedded into the model, so the extra `id_to_tags.pkl` file is necessary
+to make the conversion. It contains a mapping from the model output ids to the actual labels.
+
+This process can be done manually if needed, but the `predict.py` script already does it.
 
-Download the following five files and put into the same folder.
+We are currently working to better align the model with HuggingFace's standards.
 
-- id_to_tags.pkl
-- key_attr.pkl
-- NER_medNLP.py
-- predict.py
-- text.txt (This is an input file which should be predicted, which could be changed.)
+## How to use
 
-You can use this model by running `predict.py`.
+Clone the repository and install the requirements:
+
+```
+pip install -r requirements.txt
+```
+
+The code has been developed and tested with Python 3.9 on macOS 14.1 (M1 MacBook Pro).
+
+### Prediction
+
+The prediction script will output the results in the same XML format as the input file. It can be run with the following
+command:
 
 ```
 python3 predict.py
 ```
 
-#### Entity normalization
+The default parameters will take the model located in `pytorch_model.bin` and the input file `text.txt`.
+The resulting predictions will be output to the screen.
+
+To select a different model or input file, use the `-m` and `-i` parameters, respectively:
+
+```
+python3 predict.py -m <model_path> -i <your_input_file>.txt
+```
+
+The input file can be a single text file or a folder containing multiple `.txt` files, for batch processing. For example:
+
+```
+python3 predict.py -m <model_path> -i <your_input_folder>
+```
+
+### Entity normalization
 
 This model supports entity normalization via dictionary matching. The dictionary is a list of medical terms or
 drugs and their standard forms.
@@ -39,23 +78,42 @@ drugs and their standard forms.
 Two different dictionaries are used for drug and disease normalization, stored in the `dictionaries` folder as
 `drug_dict.csv` and `disease_dict.csv`, respectively.
 
-To enable normalization you can add the `--normalize` flag to the `predict.py` command.
+To enable normalization, add the `--normalize` flag to the `predict.py` command.
 
 ```
-python3 predict.py --normalize
+python3 predict.py -m <model_path> --normalize
 ```
 
 Normalization will add the `norm` attribute to the output XML tags. This attribute can be empty if a normalized form of
 the term is not found.
 
-The provided disease normalization dictionary (`dictionaries/disease_dict.csv`) is based on the [Manbyo Dictionary](https://sociocom.naist.jp/manbyo-dic-en/) and provides normalization to the standard ICD code for the diseases.
+The provided disease normalization dictionary (`dictionaries/disease_dict.csv`) is based on
+the [Manbyo Dictionary](https://sociocom.naist.jp/manbyo-dic-en/) and provides normalization to the standard ICD code
+for the diseases.
 
-The default drug dictionary (`dictionaries/drug_dict.csv`) is based on the [Hyakuyaku Dictionary](https://sociocom.naist.jp/hyakuyaku-dic-en/).
+The default drug dictionary (`dictionaries/drug_dict.csv`) is based on
+the [Hyakuyaku Dictionary](https://sociocom.naist.jp/hyakuyaku-dic-en/).
 
 The dictionary is a CSV file with three columns: the first column is the surface form term and the third column contains
 its standard form. The second column is not used.
 
-Users can freely change the dictionary to fit their needs, as long as the format and filename are kept.
+### Replacing the default dictionaries
+
+Users can freely change the dictionary to fit their needs by passing the path to a custom dictionary file.
+The dictionary file must have at least a column containing the list of surface forms and a column containing the list of
+normalized forms.
+
+The parameters `--drug-dict` and `--disease-dict` can be used to specify the path to the drug and disease dictionaries,
+respectively.
+When doing so, the respective parameters giving the column index of the surface form and the normalized form must also be
+provided.
+You don't need to replace both dictionaries at the same time; you can replace only one of them.
+
+E.g.:
+
+```
+python3 predict.py --normalize --drug-dict dictionaries/drug_dict.csv --drug-candidate-col 0 --drug-normalization-col 2 --disease-dict dictionaries/disease_dict.csv --disease-candidate-col 0 --disease-normalization-col 2
+```
 
 ### Input Example
 
@@ -71,5 +129,17 @@ Users can freely change the dictionary to fit their needs.
 <timex3 type="med">治療経過中</timex3>に<d certainty="positive" norm="I472">非持続性心室頻拍</d>が認められたため<m-key state="executed" norm="アミオダロン塩酸塩">アミオダロン</m-key>が併用となった。
 ```
 
-### Publication
+## Publication
+
+This model can be cited as:
 
+```
+@misc{social_computing_lab_2023,
+    author    = { {Social Computing Lab} },
+    title     = { MedNERN-CR-JA (Revision 13dbcb6) },
+    year      = 2023,
+    url       = { https://huggingface.co/sociocom/MedNERN-CR-JA },
+    doi       = { 10.57967/hf/0620 },
+    publisher = { Hugging Face }
+}
+```
````
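Since the README notes that the LABEL_n to tag conversion can be done manually, here is a rough sketch of that manual path. It is only an illustration: the example sentence is arbitrary, the indexing of `id_to_tags` by entity-type id mirrors what `predict.py` does, and the sketch skips the Viterbi smoothing and span merging performed by the bundled `NER_tokenizer_BIO`. The 87 output labels correspond to 2 × 43 entity types plus the O label added in this commit.

```python
import pickle

import torch
from transformers import AutoModelForTokenClassification, BertJapaneseTokenizer

# id_to_tags.pkl maps the model's entity-type ids to the tag names used by predict.py.
with open('id_to_tags.pkl', 'rb') as tf:
    id_to_tags = pickle.load(tf)
num_types = len(id_to_tags)  # 43 in this revision

model = AutoModelForTokenClassification.from_pretrained('sociocom/MedNERN-CR-JA')
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

text = '本日、アミオダロンの投与を開始した。'  # arbitrary example sentence
inputs = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    label_ids = model(**inputs).logits.argmax(dim=-1)[0].tolist()

# BIO scheme: 0 is O, 1..num_types are B- labels, num_types+1..2*num_types are I- labels.
for token, label in zip(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist()), label_ids):
    if label != 0:
        type_id = label if label <= num_types else label - num_types
        print(token, id_to_tags[type_id])
```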
config.json CHANGED
```diff
@@ -89,7 +89,13 @@
     "77": "LABEL_77",
     "78": "LABEL_78",
     "79": "LABEL_79",
-    "80": "LABEL_80"
+    "80": "LABEL_80",
+    "81": "LABEL_81",
+    "82": "LABEL_82",
+    "83": "LABEL_83",
+    "84": "LABEL_84",
+    "85": "LABEL_85",
+    "86": "LABEL_86"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -174,6 +180,12 @@
     "LABEL_79": 79,
     "LABEL_8": 8,
     "LABEL_80": 80,
+    "LABEL_81": 81,
+    "LABEL_82": 82,
+    "LABEL_83": 83,
+    "LABEL_84": 84,
+    "LABEL_85": 85,
+    "LABEL_86": 86,
     "LABEL_9": 9
   },
   "layer_norm_eps": 1e-12,
```
id_to_tags.pkl CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:57e7ea0bc4bdcaf4b19f7eec5c6edf2fce867cc9895cb20079b48881bc32ee5a
-size 620
+oid sha256:26cbbc0594cf7a1c4439a1010c5e2c55c1f0fb0a9664d93248b7b7d7de0cc434
+size 671
```
model.safetensors DELETED
```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:802f29afc4eae3cbf49f3957f9b2d27ae247e6e079ad6d947ccf181dff7c754c
-size 440383704
```
predict.py CHANGED
```diff
@@ -1,23 +1,24 @@
 # %%
 import argparse
-
-from tqdm import tqdm
-import unicodedata
-import re
+import os.path
 import pickle
+import unicodedata
+
 import torch
-import NER_medNLP as ner
+from tqdm import tqdm
 
-from EntityNormalizer import EntityNormalizer, DiseaseDict, DrugDict
+import NER_medNLP as ner
+import utils
+from EntityNormalizer import EntityNormalizer, EntityDictionary, DefaultDiseaseDict, DefaultDrugDict
 
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+device = torch.device("mps" if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')
 
 # %% global変数として使う
 dict_key = {}
 
 
 # %%
-def to_xml(data):
+def to_xml(data, id_to_tags):
     with open("key_attr.pkl", "rb") as tf:
         key_attr = pickle.load(tf)
 
@@ -27,7 +28,11 @@ def to_xml(data):
         if entities == "":
             return
         span = entities['span']
-        type_id = id_to_tags[entities['type_id']].split('_')
+        try:
+            type_id = id_to_tags[entities['type_id']].split('_')
+        except:
+            print("out of range type_id", entities)
+            continue
         tag = type_id[0]
 
         if not type_id[1] == "":
@@ -49,17 +54,11 @@ def to_xml(data):
 
 
 def predict_entities(modelpath, sentences_list, len_num_entity_type):
-    # model = ner.BertForTokenClassification_pl.load_from_checkpoint(
-    #     checkpoint_path = modelpath + ".ckpt"
-    # )
-    # bert_tc = model.bert_tc.cuda()
-
-    model = ner.BertForTokenClassification_pl(modelpath, num_labels=81, lr=1e-5)
+    model = ner.BertForTokenClassification_pl.from_pretrained_bin(model_path=modelpath, num_labels=2 * len_num_entity_type + 1)
    bert_tc = model.bert_tc.to(device)
 
-    MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
     tokenizer = ner.NER_tokenizer_BIO.from_pretrained(
-        MODEL_NAME,
+        'cl-tohoku/bert-base-japanese-whole-word-masking',
         num_entity_type=len_num_entity_type  # Entityの数を変え忘れないように!
     )
 
@@ -69,7 +68,7 @@ def predict_entities(modelpath, sentences_list, len_num_entity_type):
     text_entities_set = []
     for dataset in sentences_list:
         text_entities = []
-        for sample in tqdm(dataset):
+        for sample in tqdm(dataset, desc='Predict'):
             text = sample
             encoding, spans = tokenizer.encode_plus_untagged(
                 text, return_tensors='pt'
@@ -93,12 +92,12 @@ def predict_entities(modelpath, sentences_list, len_num_entity_type):
     return text_entities_set
 
 
-def combine_sentences(text_entities_set, insert: str):
+def combine_sentences(text_entities_set, id_to_tags, insert: str):
     documents = []
     for text_entities in tqdm(text_entities_set):
         document = []
         for t in text_entities:
-            document.append(to_xml(t))
+            document.append(to_xml(t, id_to_tags))
         documents.append('\n'.join(document))
     return documents
 
@@ -115,9 +114,19 @@ def value_to_key(value, key_attr):  # attributeから属性名を取得
 
 
 # %%
-def normalize_entities(text_entities_set):
-    disease_normalizer = EntityNormalizer(DiseaseDict(), matching_threshold=50)
-    drug_normalizer = EntityNormalizer(DrugDict(), matching_threshold=50)
+def normalize_entities(text_entities_set, id_to_tags, disease_dict=None, disease_candidate_col=None, disease_normalization_col=None, disease_matching_threshold=None, drug_dict=None,
+                       drug_candidate_col=None, drug_normalization_col=None, drug_matching_threshold=None):
+    if disease_dict:
+        disease_dict = EntityDictionary(disease_dict, disease_candidate_col, disease_normalization_col)
+    else:
+        disease_dict = DefaultDiseaseDict()
+    disease_normalizer = EntityNormalizer(disease_dict, matching_threshold=disease_matching_threshold)
+
+    if drug_dict:
+        drug_dict = EntityDictionary(drug_dict, drug_candidate_col, drug_normalization_col)
+    else:
+        drug_dict = DefaultDrugDict()
+    drug_normalizer = EntityNormalizer(drug_dict, matching_threshold=drug_matching_threshold)
 
     for entry in text_entities_set:
         for text_entities in entry:
@@ -136,31 +145,59 @@ def normalize_entities(text_entities_set):
             entity['norm'] = str(normalization)
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Predict entities from text')
-    parser.add_argument('--normalize', action=argparse.BooleanOptionalAction, help='Enable entity normalization')
-    args = parser.parse_args()
-
+def run(model, input, output=None, normalize=False, **kwargs):
     with open("id_to_tags.pkl", "rb") as tf:
         id_to_tags = pickle.load(tf)
-    with open("key_attr.pkl", "rb") as tf:
-        key_attr = pickle.load(tf)
-    with open('text.txt') as f:
-        articles_raw = f.read()
 
-    article_norm = unicodedata.normalize('NFKC', articles_raw)
+    if (os.path.isdir(input)):
+        files = [f for f in os.listdir(input) if os.path.isfile(os.path.join(input, f))]
+    else:
+        files = [input]
+
+    for file in tqdm(files, desc="Input file"):
+        with open(file) as f:
+            articles_raw = f.read()
+
+        article_norm = unicodedata.normalize('NFKC', articles_raw)
 
-    sentences_raw = [s for s in re.split(r'\n', articles_raw) if s != '']
-    sentences_norm = [s for s in re.split(r'\n', article_norm) if s != '']
+        sentences_raw = utils.split_sentences(articles_raw)
+        sentences_norm = utils.split_sentences(article_norm)
 
-    text_entities_set = predict_entities("sociocom/RealMedNLP_CR_JA", [sentences_norm], len(id_to_tags))
+        text_entities_set = predict_entities(model, [sentences_norm], len(id_to_tags))
 
-    for i, texts_ent in enumerate(text_entities_set[0]):
-        texts_ent['text'] = sentences_raw[i]
+        for i, texts_ent in enumerate(text_entities_set[0]):
+            texts_ent['text'] = sentences_raw[i]
 
-    if args.normalize:
-        normalize_entities(text_entities_set)
+        if normalize:
+            normalize_entities(text_entities_set, id_to_tags, **kwargs)
 
-    documents = combine_sentences(text_entities_set, '\n')
+        documents = combine_sentences(text_entities_set, id_to_tags, '\n')
 
-    print(documents[0])
+        print(documents[0])
+
+        if output:
+            with open(file.replace(input, output), 'w') as f:
+                f.write(documents[0])
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Predict entities from text')
+    parser.add_argument('-m', '--model', type=str, default='pytorch_model.bin', help='Path to model checkpoint')
+    parser.add_argument('-i', '--input', type=str, default='text.txt', help='Path to text file or directory')
+    parser.add_argument('-o', '--output', type=str, default=None, help='Path to output file or directory')
+    parser.add_argument('-n', '--normalize', action=argparse.BooleanOptionalAction, help='Enable entity normalization', default=False)
+
+    # Dictionary override arguments
+    parser.add_argument("--drug-dict", help="File path for overriding the default drug dictionary")
+    parser.add_argument("--drug-candidate-col", type=int, help="Column index for drug candidates in the CSV file (required if --drug-dict is specified)")
+    parser.add_argument("--drug-normalization-col", type=int, help="Column index for drug normalization in the CSV file (required if --drug-dict is specified)")
+    parser.add_argument('--disease-matching-threshold', type=int, default=50, help='Matching threshold for disease dictionary')
+
+    parser.add_argument("--disease-dict", help="File path for overriding the default disease dictionary")
+    parser.add_argument("--disease-candidate-col", type=int, help="Column index for disease candidates in the CSV file (required if --disease-dict is specified)")
+    parser.add_argument("--disease-normalization-col", type=int, help="Column index for disease normalization in the CSV file (required if --disease-dict is specified)")
+    parser.add_argument('--drug-matching-threshold', type=int, default=50, help='Matching threshold for drug dictionary')
+    args = parser.parse_args()
+
+    argument_dict = vars(args)
+    run(**argument_dict)
```
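A short sketch of how the reorganized entry point can be driven programmatically, mirroring the CLI defaults of this commit. The matching thresholds are passed explicitly because `run()` forwards them to `normalize_entities` via `**kwargs`; paths are illustrative.

```python
# Equivalent CLI call:
#   python3 predict.py -m pytorch_model.bin -i text.txt --normalize
from predict import run

run(model='pytorch_model.bin', input='text.txt', output=None, normalize=True,
    disease_matching_threshold=50, drug_matching_threshold=50)
```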
pytorch_model.bin CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ccce71084b8f6e81415e8f8e07cf27f59087aa2fda02c296959322ef8acb8a6a
-size 440439601
+oid sha256:a92d7a2fc876d379593b3425f14aa001f439e2c4a3ca882768fd8a7a35be363d
+size 440466875
```
requirements.txt CHANGED
```diff
@@ -1,45 +1,42 @@
-aiohttp==3.8.4
+aiohttp==3.8.6
 aiosignal==1.3.1
-async-timeout==4.0.2
-attrs==22.2.0
-certifi==2022.12.7
-charset-normalizer==3.1.0
-et-xmlfile==1.1.0
-filelock==3.11.0
-frozenlist==1.3.3
-fsspec==2023.4.0
-fugashi==1.2.1
-huggingface-hub==0.13.4
+async-timeout==4.0.3
+attrs==23.1.0
+certifi==2023.7.22
+charset-normalizer==3.3.2
+filelock==3.13.1
+frozenlist==1.4.0
+fsspec==2023.10.0
+fugashi==1.3.0
+huggingface-hub==0.17.3
 idna==3.4
 ipadic==1.0.0
 Jinja2==3.1.2
-Levenshtein==0.20.9
-lightning-utilities==0.8.0
-MarkupSafe==2.1.2
+lightning-utilities==0.9.0
+MarkupSafe==2.1.3
 mojimoji==0.0.12
 mpmath==1.3.0
 multidict==6.0.4
-networkx==3.1
-numpy==1.24.2
-openpyxl==3.1.2
-packaging==23.0
-pandas==2.0.0
+networkx==3.2.1
+numpy==1.26.2
+packaging==23.2
+pandas==2.1.3
 python-dateutil==2.8.2
-pytorch-lightning==2.0.1.post0
-pytz==2023.3
-PyYAML==6.0
-rapidfuzz==2.15.1
-regex==2023.3.23
-requests==2.28.2
+pytorch-lightning==2.1.1
+pytz==2023.3.post1
+PyYAML==6.0.1
+rapidfuzz==3.5.2
+regex==2023.10.3
+requests==2.31.0
+safetensors==0.4.0
 six==1.16.0
-soupsieve==2.4
-sympy==1.11.1
+sympy==1.12
 tokenizers==0.13.3
-torch==2.0.0
-torchmetrics==0.11.4
-tqdm==4.65.0
+torch==2.1.0
+torchmetrics==1.2.0
+tqdm==4.66.1
 transformers==4.27.4
-typing_extensions==4.5.0
+typing_extensions==4.8.0
 tzdata==2023.3
-urllib3==1.26.15
-yarl==1.8.2
+urllib3==2.1.0
+yarl==1.9.2
```
tokenizer_config.json CHANGED
```diff
@@ -20,5 +20,5 @@
     "NER_medNLP.NER_tokenizer_BIO",
     null
   ],
-  "num_entity_type": "40"
+  "num_entity_type": "43"
 }
```
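The bumped `num_entity_type` (40 to 43) is the value the prediction code must pass when instantiating the custom BIO tokenizer; with 43 types, the model's label space is 2 × 43 + 1 = 87, matching the new `config.json` entries. A sketch mirroring what `predict.py` does in this commit:

```python
import pickle

import NER_medNLP as ner

with open('id_to_tags.pkl', 'rb') as tf:
    id_to_tags = pickle.load(tf)

# 43 entity types after this commit; B/I/O labels therefore span 2 * 43 + 1 ids.
tokenizer = ner.NER_tokenizer_BIO.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    num_entity_type=len(id_to_tags)
)

encoding, spans = tokenizer.encode_plus_untagged('本日、投与を開始した。', return_tensors='pt')
```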
utils.py ADDED
```diff
@@ -0,0 +1,15 @@
+import re
+
+
+def split_sentences(text):
+    """Given a string, split it into sentences.
+
+    :param text: The string to be processed.
+    :return: The list of split sentences.
+    """
+    processed_text = re.split(
+        "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s\n*|(?<=[^A-zA-z0-90-9 ].)(?<=[。..??!!])(?![\.」])\n*", text)
+    # processed_text = re.split("(?<=[。??!!])", text)  # In case only a simple regex is necessary
+    processed_text = [x.strip() for x in processed_text]
+    processed_text = [x for x in processed_text if x != '']
+    return processed_text
```
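A quick illustration of what the new helper returns; the input string is arbitrary and the exact split depends on the regex above, but roughly one item per Japanese or English sentence is expected.

```python
from utils import split_sentences

text = "患者は続行を希望した。経過は良好である。\nFollow-up was scheduled."
print(split_sentences(text))
# expected: ['患者は続行を希望した。', '経過は良好である。', 'Follow-up was scheduled.']
```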