Spaces:
Sleeping
Sleeping
import json | |
import sys | |
import os | |
import torch | |
from utils.data_loader import GDA_Dataset | |
from sklearn.model_selection import train_test_split | |
from sklearn.model_selection import KFold | |
import numpy as np | |
import pandas as pd | |
sys.path.append("../") | |
class DisGeNETProcessor: | |
def __init__(self,input_csv_path): | |
train_data = pd.read_csv('data/downstream/GDA_Data/train.csv') | |
valid_data = pd.read_csv('data/downstream/GDA_Data/valid.csv') | |
test_data = pd.read_csv(input_csv_path) | |
# test_data = pd.read_csv('/nfs/dpa_pretrain/data/downstream/GDA_Data/test.csv') | |
# valid_data, test_data = train_test_split(valid_data, test_size=1/3, random_state=42) | |
# train_data = pd.read_csv('/nfs/dpa_pretrain/data/downstream/test/train.csv') | |
# valid_data = pd.read_csv('/nfs/dpa_pretrain/data/downstream/test/valid.csv') | |
# train_data = pd.read_csv('/nfs/dpa_pretrain/data/downstream/disgenet_finetune.csv') | |
# train_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=42) | |
# valid_data, test_data = train_test_split(valid_data, test_size=1/3, random_state=42) | |
# alzheimer and stomach dataset use [["proteinSeq", "diseaseDes", "Y"]].dropna() | |
self.name = "DisGeNET" | |
self.train_dataset_df = train_data[["proteinSeq", "diseaseDes", "score"]].dropna() | |
self.val_dataset_df = valid_data[["proteinSeq", "diseaseDes", "score"]].dropna() | |
self.test_dataset_df = test_data[["proteinSeq", "diseaseDes", "score"]].dropna() | |
# self.test_dataset_df = test_data[["proteinSeq", "diseaseDes", "Y"]].dropna() | |
def get_train_examples(self, test=False): | |
"""get training examples | |
Args: | |
test (bool, optional): test can be int or bool. If test>1, will take test as the number of test examples. Defaults to False. | |
Returns: | |
_type_: _description_ | |
""" | |
if test == 1: # Small testing set, to reduce the running time | |
return ( | |
self.train_dataset_df["proteinSeq"].values[:4096], | |
self.train_dataset_df["diseaseDes"].values[:4096], | |
self.train_dataset_df["score"].values[:4096], | |
) | |
elif test > 1: | |
return ( | |
self.train_dataset_df["proteinSeq"].values[:test], | |
self.train_dataset_df["diseaseDes"].values[:test], | |
self.train_dataset_df["score"].values[:test], | |
) | |
else: | |
return GDA_Dataset( ( | |
self.train_dataset_df["proteinSeq"].values, | |
self.train_dataset_df["diseaseDes"].values, | |
self.train_dataset_df["score"].values, | |
)) | |
def get_val_examples(self, test=False): | |
"""get validation examples | |
Args: | |
test (bool, optional): test can be int or bool. If test>1, will take test as the number of test examples. Defaults to False. | |
Returns: | |
_type_: _description_ | |
""" | |
if test == 1: # Small testing set, to reduce the running time | |
return ( | |
self.val_dataset_df["proteinSeq"].values[:1024], | |
self.val_dataset_df["diseaseDes"].values[:1024], | |
self.val_dataset_df["score"].values[:1024], | |
) | |
elif test > 1: | |
return ( | |
self.val_dataset_df["proteinSeq"].values[:test], | |
self.val_dataset_df["diseaseDes"].values[:test], | |
self.val_dataset_df["score"].values[:test], | |
) | |
else: | |
return GDA_Dataset(( | |
self.val_dataset_df["proteinSeq"].values, | |
self.val_dataset_df["diseaseDes"].values, | |
self.val_dataset_df["score"].values, | |
)) | |
# def get_test_examples(self, test=False): | |
# """get test examples | |
# Args: | |
# test (bool, optional): test can be int or bool. If test>1, will take test as the number of test examples. Defaults to False. | |
# Returns: | |
# _type_: _description_ | |
# """ | |
# if test == 1: # Small testing set, to reduce the running time | |
# return ( | |
# self.test_dataset_df["proteinSeq"].values[:1024], | |
# self.test_dataset_df["diseaseDes"].values[:1024], | |
# self.test_dataset_df["Y"].values[:1024], | |
# ) | |
# elif test > 1: | |
# return ( | |
# self.test_dataset_df["proteinSeq"].values[:test], | |
# self.test_dataset_df["diseaseDes"].values[:test], | |
# self.test_dataset_df["Y"].values[:test], | |
# ) | |
# else: | |
# return GDA_Dataset( ( | |
# self.test_dataset_df["proteinSeq"].values, | |
# self.test_dataset_df["diseaseDes"].values, | |
# self.test_dataset_df["Y"].values, | |
# )) | |
def get_test_examples(self, test=False): | |
"""get test examples | |
Args: | |
test (bool, optional): test can be int or bool. If test>1, will take test as the number of test examples. Defaults to False. | |
Returns: | |
_type_: _description_ | |
""" | |
if test == 1: # Small testing set, to reduce the running time | |
return ( | |
self.test_dataset_df["proteinSeq"].values[:1024], | |
self.test_dataset_df["diseaseDes"].values[:1024], | |
self.test_dataset_df["score"].values[:1024], | |
) | |
elif test > 1: | |
return ( | |
self.test_dataset_df["proteinSeq"].values[:test], | |
self.test_dataset_df["diseaseDes"].values[:test], | |
self.test_dataset_df["score"].values[:test], | |
) | |
else: | |
return GDA_Dataset( ( | |
self.test_dataset_df["proteinSeq"].values, | |
self.test_dataset_df["diseaseDes"].values, | |
self.test_dataset_df["score"].values, | |
)) |