# Source: FusionGDA/src/utils/downstream_disgenet.py (ZhaohanM/FusionGDA, commit a1af661)
import json
import sys
import os
import torch
from utils.data_loader import GDA_Dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
sys.path.append("../")
class DisGeNETProcessor:
    """Load the DisGeNET train/validation/test splits from CSV files.

    Each split is reduced to the columns ``proteinSeq``, ``diseaseDes`` and
    ``score``; rows with a missing value in any of those columns are dropped.
    The ``get_*_examples`` methods then expose each split either as a
    ``GDA_Dataset`` (full split) or as a plain tuple of truncated arrays
    (debug mode).
    """

    # Columns kept from every split, in the order they are returned.
    _COLUMNS = ("proteinSeq", "diseaseDes", "score")
    # Row caps applied when a getter is called with ``test == 1``.
    _DEBUG_TRAIN_SIZE = 4096
    _DEBUG_EVAL_SIZE = 1024

    def __init__(
        self,
        input_csv_path,
        train_csv_path="data/downstream/GDA_Data/train.csv",
        valid_csv_path="data/downstream/GDA_Data/valid.csv",
    ):
        """Read the three CSV files and keep the required columns.

        Args:
            input_csv_path: Path of the CSV file used as the test split.
            train_csv_path: Path of the training CSV. The default is the
                location that used to be hard-coded here; a relative path is
                resolved against the current working directory.
            valid_csv_path: Path of the validation CSV, same conventions as
                ``train_csv_path``.

        Raises:
            KeyError: If a file lacks one of the required columns.
        """
        train_data = pd.read_csv(train_csv_path)
        valid_data = pd.read_csv(valid_csv_path)
        test_data = pd.read_csv(input_csv_path)

        self.name = "DisGeNET"

        # NOTE: an earlier revision of this file mentions that the alzheimer
        # and stomach datasets name the label column "Y" instead of "score".
        # Such files will raise KeyError here unless that column is renamed.
        wanted = list(self._COLUMNS)  # a list selects columns; a tuple would be one key
        self.train_dataset_df = train_data[wanted].dropna()
        self.val_dataset_df = valid_data[wanted].dropna()
        self.test_dataset_df = test_data[wanted].dropna()

    def _select(self, frame, test, debug_size):
        """Return the columns of *frame*, truncated according to *test*.

        Args:
            frame: DataFrame holding the ``_COLUMNS`` columns.
            test: ``1``/``True`` keeps the first ``debug_size`` rows, a value
                greater than 1 keeps the first ``test`` rows, anything else
                (``0``/``False``/negative) keeps every row.
            debug_size: Row cap used when ``test == 1``.

        Returns:
            A tuple ``(proteinSeq, diseaseDes, score)`` of arrays when the
            data is truncated, otherwise a ``GDA_Dataset`` built from that
            same tuple.
        """
        columns = tuple(frame[name].values for name in self._COLUMNS)
        if test == 1:  # also matches True, since True == 1
            limit = debug_size
        elif test > 1:
            limit = test
        else:
            return GDA_Dataset(columns)
        return tuple(column[:limit] for column in columns)

    def get_train_examples(self, test=False):
        """Return the training split.

        Args:
            test: Debug switch. ``1``/``True`` returns at most 4096 rows, a
                value greater than 1 returns at most ``test`` rows, and the
                default ``False`` returns the whole split.

        Returns:
            A tuple of three arrays (protein sequences, disease descriptions,
            scores) in debug mode, otherwise a ``GDA_Dataset``.
        """
        return self._select(self.train_dataset_df, test, self._DEBUG_TRAIN_SIZE)

    def get_val_examples(self, test=False):
        """Return the validation split.

        Args:
            test: Debug switch. ``1``/``True`` returns at most 1024 rows, a
                value greater than 1 returns at most ``test`` rows, and the
                default ``False`` returns the whole split.

        Returns:
            A tuple of three arrays (protein sequences, disease descriptions,
            scores) in debug mode, otherwise a ``GDA_Dataset``.
        """
        return self._select(self.val_dataset_df, test, self._DEBUG_EVAL_SIZE)

    def get_test_examples(self, test=False):
        """Return the test split.

        Args:
            test: Debug switch. ``1``/``True`` returns at most 1024 rows, a
                value greater than 1 returns at most ``test`` rows, and the
                default ``False`` returns the whole split.

        Returns:
            A tuple of three arrays (protein sequences, disease descriptions,
            scores) in debug mode, otherwise a ``GDA_Dataset``.
        """
        return self._select(self.test_dataset_df, test, self._DEBUG_EVAL_SIZE)