# NeMo/examples/nlp/entity_linking/data/umls_dataset_processing.py
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import pickle as pkl
import random
from argparse import ArgumentParser

import pandas as pd
from omegaconf import OmegaConf
from tqdm import tqdm

# Info on these headers can be found on the UMLS website:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/ (section 3.3.4, Table 1)
HEADERS = [
    'CUI',
    'LAT',
    'TS',
    'LUI',
    'STT',
    'SUI',
    'ISPREF',
    'AUI',
    'SAUI',
    'SCUI',
    'SDUI',
    'SAB',
    'TTY',
    'CODE',
    'STR',
    'SRL',
    'SUPPRESS',
    'CVF',
]
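
# For orientation, MRCONSO.RRF rows are pipe-delimited, one value per header above.
# The row below is a made-up illustration of the layout, not real UMLS data:
#
#   C0018681|ENG|P|L0018681|PF|S0046854|Y|A0066000||M0009824|D006261|MSH|MH|D006261|Headache|0|N|256|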


def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers):
    """
    Generates and saves UMLS self-alignment pretraining train and validation data. Takes the raw .RRF UMLS
    data file and creates different pair combinations for entities with the same CUI. Each row in the output
    will be formatted as 'CUI EntitySynonym1 EntitySynonym2' with each item in a row separated by tabs.
    Saves two .tsv output files, one for the train split and one for the validation split.
    Only data marked as English is added to the train and val splits.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        train_save_name (str): path to where training data will be saved
        val_save_name (str): path to where validation data will be saved
        max_pairs (int): max number of pairs for any one CUI added to the train
            or validation splits
        train_split (float): fraction (between 0 and 1) of pairs assigned to the train split
        headers (list): column labels within MRCONSO.RRF
    """
print("Loading training data file...")
df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
train_file = open(train_save_name, 'w')
val_file = open(val_save_name, 'w')
cui = df["CUI"].iloc[0]
names = []
random.seed(2021)
for idx in tqdm(range(len(df))):
# Address incorrectly formatted data
if type(df["STR"].iloc[idx]) != str or "|" in df["STR"].iloc[idx]:
continue
# Collect all english concept strings matching the current CUI
if df["CUI"].iloc[idx] == cui and df["LAT"].iloc[idx] == "ENG":
concept_string = df["STR"].iloc[idx]
names.append(concept_string)
else:
# Pair off concept synonyms to make training and val sets
pairs = list(itertools.combinations(names, 2))
if len(pairs) == 0:
# Not enough concepts gathered to make a pair
cui = df["CUI"].iloc[idx]
names = [df["STR"].iloc[idx]]
continue
# Removing leading C to convert label string to int
cui = int(cui[1:])
random.shuffle(pairs)
# Keep up to max pairs number pairs for any one concept
for pair in pairs[:max_pairs]:
# Want concepts in train and val splits to be randomly selected and mutually exclusive
add_to_train = random.random()
if add_to_train <= train_split:
train_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n')
else:
val_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n')
# Switch to next concept
cui = df["CUI"].iloc[idx]
names = [df["STR"].iloc[idx]]
train_file.close()
val_file.close()
print("Finished making training and validation data")


def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers):
    """
    Generates the data file needed to build a UMLS index along with a hash table mapping each
    CUI to one canonical concept string. Takes the raw .RRF data file and saves
    a .tsv index concept file as well as a .pkl file of CUI to concept string
    mappings. Only data marked as English is added to the index data file.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        data_savename (str): path to where .tsv index data will be saved
        id2string_savename (str): path to where the .pkl CUI to string mapping will
            be saved
        headers (list): column labels within MRCONSO.RRF
    """
print("Loading index data file...")
df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
id2string = {}
with open(data_savename, "w") as outfile:
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
# Address incorrectly formatted data
if type(row["STR"]) != str or "|" in row["STR"]:
continue
cui = row["CUI"]
sent = row["STR"]
# Removing leading C to convert label string to int
cui = int(cui[1:])
# Only keeping english concepts
if row["LAT"] == "ENG":
outfile.write(f'{cui}\t{sent}\n')
# Matching each cui to one canonical string represention
if cui not in id2string and ":" not in sent:
id2string[cui] = sent
outfile.close()
pkl.dump(id2string, open(id2string_savename, "wb"))
print("Finished saving index data and id to concept mapping")


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--index", action="store_true", help="Whether to process data for building an index")
    parser.add_argument("--project_dir", required=False, type=str, default=".")
    parser.add_argument("--cfg", required=False, type=str, default="conf/umls_medical_entity_linking_config.yaml")
    parser.add_argument(
        "--max_pairs", required=False, type=int, default=50, help="Max number of train pairs for a single concept"
    )
    parser.add_argument(
        "--train_split", required=False, type=float, default=0.99, help="Fraction (0-1) of data added to the train set"
    )
    args = parser.parse_args()
    cfg = OmegaConf.load(args.cfg)
    cfg.project_dir = args.project_dir

    if args.index:
        process_umls_index_dataset(cfg.index.raw_data, cfg.index.index_ds.data_file, cfg.index.id_to_string, HEADERS)
    else:
        process_umls_training_dataset(
            cfg.model.raw_data,
            cfg.model.train_ds.data_file,
            cfg.model.validation_ds.data_file,
            args.max_pairs,
            args.train_split,
            HEADERS,
        )
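

# Example invocations (run from the directory containing the default config; the
# --index flag switches between index data and training pair generation):
#
#   python umls_dataset_processing.py --max_pairs 50 --train_split 0.99
#   python umls_dataset_processing.py --index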