# NeMo/examples/nlp/entity_linking/data/umls_dataset_processing.py
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import pickle as pkl
import random
from argparse import ArgumentParser

import pandas as pd
from omegaconf import OmegaConf
from tqdm import tqdm

# Info on these headers can be found on the UMLS website:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/ (section 3.3.4, Table 1)
HEADERS = [
    'CUI',
    'LAT',
    'TS',
    'LUI',
    'STT',
    'SUI',
    'ISPREF',
    'AUI',
    'SAUI',
    'SCUI',
    'SDUI',
    'SAB',
    'TTY',
    'CODE',
    'STR',
    'SRL',
    'SUPPRESS',
    'CVF',
]
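
# For orientation, MRCONSO.RRF rows are pipe-delimited, one value per header above.
# The row below is a made-up illustration of the layout, not real UMLS data:
#
#   C0018681|ENG|P|L0018681|PF|S0046854|Y|A0066000||M0009824|D006261|MSH|MH|D006261|Headache|0|N|256|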


def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers):
    """
    Generates and saves UMLS self-alignment pretraining train and validation data. Takes the raw .RRF UMLS
    data file and creates different pair combinations for entities with the same CUI. Each row in the output
    will be formatted as 'CUI EntitySynonym1 EntitySynonym2' with each item in a row separated by tabs.
    Saves two .tsv output files, one for the train split and one for the validation split.
    Only data marked as English is added to the train and val splits.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        train_save_name (str): path to where training data will be saved
        val_save_name (str): path to where validation data will be saved
        max_pairs (int): max number of pairs for any one CUI added to the train
            or validation splits
        train_split (float): fraction (between 0 and 1) of pairs assigned to the train split
        headers (list): column labels within MRCONSO.RRF
    """
print("Loading training data file...")
df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
train_file = open(train_save_name, 'w')
val_file = open(val_save_name, 'w')
cui = df["CUI"].iloc[0]
names = []
random.seed(2021)
for idx in tqdm(range(len(df))):
# Address incorrectly formatted data
if type(df["STR"].iloc[idx]) != str or "|" in df["STR"].iloc[idx]:
continue
# Collect all english concept strings matching the current CUI
if df["CUI"].iloc[idx] == cui and df["LAT"].iloc[idx] == "ENG":
concept_string = df["STR"].iloc[idx]
names.append(concept_string)
else:
# Pair off concept synonyms to make training and val sets
pairs = list(itertools.combinations(names, 2))
if len(pairs) == 0:
# Not enough concepts gathered to make a pair
cui = df["CUI"].iloc[idx]
names = [df["STR"].iloc[idx]]
continue
# Removing leading C to convert label string to int
cui = int(cui[1:])
random.shuffle(pairs)
# Keep up to max pairs number pairs for any one concept
for pair in pairs[:max_pairs]:
# Want concepts in train and val splits to be randomly selected and mutually exclusive
add_to_train = random.random()
if add_to_train <= train_split:
train_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n')
else:
val_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n')
# Switch to next concept
cui = df["CUI"].iloc[idx]
names = [df["STR"].iloc[idx]]
train_file.close()
val_file.close()
print("Finished making training and validation data")


def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers):
    """
    Generates the data file needed to build a UMLS index along with a hash table mapping each
    CUI to one canonical concept string. Takes the raw .RRF data file and saves
    a .tsv index concept file as well as a .pkl file of CUI to concept string
    mappings. Only data marked as English is added to the index data file.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        data_savename (str): path to where .tsv index data will be saved
        id2string_savename (str): path to where the .pkl CUI to string mapping will
            be saved
        headers (list): column labels within MRCONSO.RRF
    """
print("Loading index data file...")
df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
id2string = {}
with open(data_savename, "w") as outfile:
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
# Address incorrectly formatted data
if type(row["STR"]) != str or "|" in row["STR"]:
continue
cui = row["CUI"]
sent = row["STR"]
# Removing leading C to convert label string to int
cui = int(cui[1:])
# Only keeping english concepts
if row["LAT"] == "ENG":
outfile.write(f'{cui}\t{sent}\n')
# Matching each cui to one canonical string represention
if cui not in id2string and ":" not in sent:
id2string[cui] = sent
outfile.close()
pkl.dump(id2string, open(id2string_savename, "wb"))
print("Finished saving index data and id to concept mapping")


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--index", action="store_true", help="Whether to process data for building an index")
    parser.add_argument("--project_dir", required=False, type=str, default=".")
    parser.add_argument("--cfg", required=False, type=str, default="conf/umls_medical_entity_linking_config.yaml")
    parser.add_argument(
        "--max_pairs", required=False, type=int, default=50, help="Max number of train pairs for a single concept"
    )
    parser.add_argument(
        "--train_split", required=False, type=float, default=0.99, help="Fraction (0-1) of data added to the train set"
    )
    args = parser.parse_args()
    cfg = OmegaConf.load(args.cfg)
    cfg.project_dir = args.project_dir

    if args.index:
        process_umls_index_dataset(cfg.index.raw_data, cfg.index.index_ds.data_file, cfg.index.id_to_string, HEADERS)
    else:
        process_umls_training_dataset(
            cfg.model.raw_data,
            cfg.model.train_ds.data_file,
            cfg.model.validation_ds.data_file,
            args.max_pairs,
            args.train_split,
            HEADERS,
        )
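

# Example invocations (run from the directory containing the default config; the
# --index flag switches between index data and training pair generation):
#
#   python umls_dataset_processing.py --max_pairs 50 --train_split 0.99
#   python umls_dataset_processing.py --index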