Spaces:
Sleeping
Sleeping
File size: 1,701 Bytes
42b7ac6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
from abc import ABC, abstractmethod
from datasets import load_dataset, DatasetDict
from tasks.utils.evaluation import TextEvaluationRequest
class DataLoader(ABC):
    """Abstract interface for objects that provide a train and a test dataset.

    Concrete subclasses must implement both accessors; the class cannot be
    instantiated until they do.
    """

    @abstractmethod
    def get_train_dataset(self):
        """Return the training split."""

    @abstractmethod
    def get_test_dataset(self):
        """Return the test split."""
class TextDataLoader(DataLoader):
    """Load a labelled text dataset and expose train/test splits with integer labels.

    The dataset named by the request is loaded with ``load_dataset``, its
    string ``label`` column is converted to integer ids through
    ``label_mapping``, and its ``"train"`` split is divided into train and
    test parts with ``train_test_split``.
    """

    # Maximum number of rows kept per split when ``light=True``.
    LIGHT_TRAIN_SIZE = 10
    LIGHT_TEST_SIZE = 2

    def __init__(self, request: "TextEvaluationRequest | None" = None, light: bool = False):
        """Load, relabel and split the dataset.

        Args:
            request: Supplies ``dataset_name``, ``test_size`` and ``test_seed``.
                When omitted, a fresh ``TextEvaluationRequest()`` is created
                for this call, so no request instance is shared between
                loaders or built at import time.
            light: When true, keep only a small shuffled subset of each split
                (at most ``LIGHT_TRAIN_SIZE`` train rows and
                ``LIGHT_TEST_SIZE`` test rows) for quick testing.

        Raises:
            KeyError: If a row carries a label that is not a key of
                ``label_mapping``.
        """
        if request is None:
            request = TextEvaluationRequest()

        self.label_mapping = {
            "0_not_relevant": 0,
            "1_not_happening": 1,
            "2_not_human": 2,
            "3_not_bad": 3,
            "4_solutions_harmful_unnecessary": 4,
            "5_science_unreliable": 5,
            "6_proponents_biased": 6,
            "7_fossil_fuels_needed": 7,
        }

        # Load the dataset, and convert string labels to integers
        dataset = load_dataset(request.dataset_name)
        dataset = dataset.map(lambda x: {"label": self.label_mapping[x["label"]]})

        # Only the "train" split of the loaded dataset is used; the test part
        # is carved out of it with the size and seed taken from the request.
        self.dataset = dataset["train"].train_test_split(
            test_size=request.test_size,
            seed=request.test_seed,
        )

        # Create a smaller version of the dataset for quick testing
        if light:
            self.dataset = DatasetDict({
                "train": self._light_subset(self.dataset["train"], self.LIGHT_TRAIN_SIZE),
                "test": self._light_subset(self.dataset["test"], self.LIGHT_TEST_SIZE),
            })

    @staticmethod
    def _light_subset(split, max_rows):
        """Return up to ``max_rows`` rows of ``split`` after a seed-42 shuffle."""
        # Clamp to the split length so a split smaller than the cap does not
        # make ``select`` fail on out-of-range indices.
        row_count = min(max_rows, len(split))
        return split.shuffle(seed=42).select(range(row_count))

    def get_train_dataset(self):
        """Return the training split."""
        return self.dataset["train"]

    def get_test_dataset(self):
        """Return the test split."""
        return self.dataset["test"]

    def get_label_to_id_mapping(self):
        """Return the label-name -> integer-id dict (the instance's own dict, not a copy)."""
        return self.label_mapping

    def get_id_to_label_mapping(self):
        """Return a new dict mapping each integer id back to its label name."""
        return {v: k for k, v in self.label_mapping.items()}
|