|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import itertools |
|
import pickle as pkl |
|
import random |
|
from argparse import ArgumentParser |
|
|
|
import pandas as pd |
|
from omegaconf import OmegaConf |
|
from tqdm import tqdm |
|
|
|
|
|
|
|
# Column labels for the pipe-delimited MRCONSO.RRF file, listed in file order.
# The processing functions below hand this list to pandas as the column names
# when the raw data file is loaded.
HEADERS = [
    'CUI',
    'LAT',
    'TS',
    'LUI',
    'STT',
    'SUI',
    'ISPREF',
    'AUI',
    'SAUI',
    'SCUI',
    'SDUI',
    'SAB',
    'TTY',
    'CODE',
    'STR',
    'SRL',
    'SUPPRESS',
    'CVF',
]
|
|
|
|
|
def _write_synonym_pairs(cui, names, train_file, val_file, max_pairs, train_split):
    """Write up to ``max_pairs`` shuffled synonym pairs for one CUI to the train/val files."""
    pairs = list(itertools.combinations(names, 2))
    if not pairs:
        # Fewer than two synonyms were collected, so there is nothing to pair off.
        return

    # Remove the leading "C" so the label is written as an int.
    label = int(cui[1:])
    random.shuffle(pairs)

    # Keep at most max_pairs pairs for any one concept.
    for first, second in pairs[:max_pairs]:
        # Each pair is drawn into exactly one split, so train and validation
        # rows are randomly selected and mutually exclusive.
        target = train_file if random.random() <= train_split else val_file
        target.write(f'{label}\t{first}\t{second}\n')


def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers):
    """
    Generates and saves UMLS self alignment pretraining train and validation data.

    Takes the raw .RRF UMLS data file and creates pair combinations of the
    English concept strings that share a CUI. Rows are grouped by consecutive
    runs of the same CUI, so the input is expected to be ordered by CUI. Each
    output row is formatted as 'CUI EntitySynonym1 EntitySynonym2' with the
    items separated by tabs; the CUI is written as an int with its leading
    "C" removed. Two .tsv files are saved, one for the train split and one for
    the validation split.

    Only strings marked as English (LAT == "ENG") are paired. Rows whose STR
    value is not a string, or still contains a "|" delimiter, are treated as
    incorrectly formatted and skipped. The random generator is seeded with a
    fixed value, so repeated runs on the same input give the same output.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        train_save_name (str): path to where training data will be saved
        val_save_name (str): path to where validation data will be saved
        max_pairs (int): max number of pairs for any one CUI added to the train
            or validation splits
        train_split (float): probability (between 0 and 1) that a pair is
            written to the train split rather than the validation split
        headers (list): column labels within MRCONSO.RRF
    """
    print("Loading training data file...")
    df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
    random.seed(2021)

    # The context manager closes both outputs even if processing fails midway.
    with open(train_save_name, 'w') as train_file, open(val_save_name, 'w') as val_file:
        current_cui = None
        names = []

        rows = zip(df["CUI"], df["LAT"], df["STR"])
        for cui, lat, concept_string in tqdm(rows, total=len(df)):
            # Address incorrectly formatted data
            if not isinstance(concept_string, str) or "|" in concept_string:
                continue

            if cui != current_cui:
                # A new concept starts: pair off the synonyms gathered so far.
                _write_synonym_pairs(current_cui, names, train_file, val_file, max_pairs, train_split)
                current_cui = cui
                names = []

            # Collect only English concept strings for the current CUI.
            if lat == "ENG":
                names.append(concept_string)

        # The final concept has no following row to trigger a flush, so write
        # its pairs explicitly.
        _write_synonym_pairs(current_cui, names, train_file, val_file, max_pairs, train_split)

    print("Finished making training and validation data")
|
|
|
|
|
def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers):
    """
    Generates the data file needed to build a UMLS index and a hash table mapping
    each CUI to one canonical concept string.

    Takes the raw .RRF data file and saves a .tsv index concept file, with one
    'CUI ConceptString' row per kept entry separated by a tab, as well as a
    .pkl file of CUI to concept string mappings. CUIs are stored as ints with
    their leading "C" removed. Only rows marked as English (LAT == "ENG") are
    used. Rows whose STR value is not a string, or still contains a "|"
    delimiter, are treated as incorrectly formatted and skipped.

    The canonical string for a CUI is the first kept English string that does
    not contain a ":"; a CUI with no such string gets no entry in the mapping.

    Arguments:
        data_path (str): path to MRCONSO.RRF UMLS data file
        data_savename (str): path to where .tsv index data will be saved
        id2string_savename (str): path to where .pkl cui to string mapping will
            be saved
        headers (list): column labels within MRCONSO.RRF
    """
    print("Loading index data file...")
    df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|')
    id2string = {}

    with open(data_savename, "w") as outfile:
        rows = zip(df["CUI"], df["LAT"], df["STR"])
        for raw_cui, lat, sent in tqdm(rows, total=df.shape[0]):
            # Address incorrectly formatted data
            if not isinstance(sent, str) or "|" in sent:
                continue

            # Only keeping English concepts
            if lat != "ENG":
                continue

            # Removing leading C to convert label string to int
            cui = int(raw_cui[1:])
            outfile.write(f'{cui}\t{sent}\n')

            # Matching each cui to one canonical string representation
            if cui not in id2string and ":" not in sent:
                id2string[cui] = sent

    # Use a context manager so the pickle file handle is flushed and closed.
    with open(id2string_savename, "wb") as id2string_file:
        pkl.dump(id2string, id2string_file)

    print("Finished saving index data and id to concept mapping")
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: builds either the index data (--index) or the
    # train/validation pair data (default) from the paths named in the config.
    parser = ArgumentParser()
    parser.add_argument("--index", action="store_true", help="Whether to process data for building an index")
    parser.add_argument("--project_dir", required=False, type=str, default=".")
    parser.add_argument("--cfg", required=False, type=str, default="conf/umls_medical_entity_linking_config.yaml")
    parser.add_argument(
        "--max_pairs", required=False, type=int, default=50, help="Max number of train pairs for a single concepts"
    )
    parser.add_argument(
        "--train_split", required=False, type=float, default=0.99, help="Precentage of data to add to train set"
    )

    args = parser.parse_args()
    cfg = OmegaConf.load(args.cfg)
    # Set project_dir on the loaded config from the command line before any
    # path entries are read from it.
    cfg.project_dir = args.project_dir

    if args.index:
        process_umls_index_dataset(cfg.index.raw_data, cfg.index.index_ds.data_file, cfg.index.id_to_string, HEADERS)
    else:
        process_umls_training_dataset(
            cfg.model.raw_data,
            cfg.model.train_ds.data_file,
            cfg.model.validation_ds.data_file,
            args.max_pairs,
            args.train_split,
            HEADERS,
        )
|
|