""" Module for storing the Dataset class which will compartmentalize things like the train-test split and shuffling (if needed.) Feel free to extend the class if you want to implement something specific to your method like dataset shuffling and batching for DL methods. """ from typing import Tuple, Optional from scipy.sparse import hstack import pandas as pd from datasets import ( load_dataset, Dataset, DatasetDict, ClassLabel, Features, Value ) from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer from torch.utils.data import DataLoader from transformers import AutoTokenizer, DataCollatorWithPadding from .utils import NegClassRandomSampler class JobDataset: """ Wrapper around the AEGEAN dataset """ def __init__(self, batch_size: int = 16, train_test_split: Tuple[float, float, float] = (0.7, 0.1, 0.2)): _dataset = load_dataset("victor/real-or-fake-fake-jobposting-prediction") self._dataset: pd.DataFrame = _dataset['train'].to_pandas() self._dataset['fraudulent'] = self._dataset['fraudulent'].astype(int) self._size: int = len(self._dataset) self._batch_size = batch_size self.clean_dataset() self.add_features() self.set_train_test_split(*train_test_split) def clean_dataset(self): """ Clean up the dataset. - Fills None strings - Converts label to an int """ self._dataset[["title", "description"]] = \ self._dataset[["title", "description"]].fillna("") def add_features(self): """ Computes additional features on its own data. - Concatenates the strings """ self._dataset["full_text"] = \ self._dataset[["title", "description"]].agg(' '.join, axis=1) def set_train_test_split(self, train_fr: float, eval_fr: float, test_fr: float, seed: int = 42): """ Sets the train-test split. A seed is used for consistency. 
""" eval_fr = eval_fr / (train_fr + eval_fr) _train_df, self._test_df = \ train_test_split(self._dataset, test_size=test_fr, random_state=seed) self._train_df, self._eval_df = \ train_test_split(_train_df, test_size=eval_fr, random_state=seed) # Functions for getting the training, eval, and test dataset # The format of the dataset will depend on the model, so I'll leave this unimplemented for now def get_training_set(self): raise NotImplementedError def get_validation_set(self): raise NotImplementedError def get_test_set(self): raise NotImplementedError class SVMJobDataset(JobDataset): def __init__(self, vectorizer_params: Optional[dict] = None): super().__init__() if vectorizer_params is None: vectorizer_params = { 'lowercase': True, 'stop_words': 'english', 'max_features': 1_000 } self._title_vectorizer = TfidfVectorizer(**vectorizer_params) self._description_vectorizer = TfidfVectorizer(**vectorizer_params) def vectorize(self): self._train_set = hstack([ self._title_vectorizer.fit_transform(self._train_df["title"]), self._description_vectorizer.fit_transform(self._train_df["description"]) ]) self._eval_set = hstack([ self._title_vectorizer.transform(self._eval_df["title"]), self._description_vectorizer.transform(self._eval_df["description"]) ]) self._test_set = hstack([ self._title_vectorizer.transform(self._test_df["title"]), self._description_vectorizer.transform(self._test_df["description"]) ]) def get_training_set(self): return self._train_set, self._train_df["fraudulent"] def get_validation_set(self): return self._eval_set, self._eval_df["fraudulent"] def get_test_set(self): return self._test_set, self._test_df["fraudulent"] class HuggingFaceJobDataset(JobDataset): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._hf_dataset = None self._tokenized_dataset = None self._tokenizer = None self._data_collator = None self.set_hf_dataset_dict() self.set_tokenized_hf_dataset() def set_hf_dataset_dict(self, recompute: bool = False): if (self._hf_dataset is not None) and (not recompute): print("HF dataset already exists, recompute not set to True, returning") return hf_dataset = DatasetDict() # Set the splits features = Features({ "full_text": Value("string"), "fraudulent": ClassLabel(num_classes=2, names=[0,1]), "__index_level_0__": Value("uint32") }) columns = ["full_text", "fraudulent"] hf_dataset['train'] = Dataset.from_pandas(self._train_df[columns], features=features) hf_dataset['validation'] = Dataset.from_pandas(self._eval_df[columns], features=features) hf_dataset['test'] = Dataset.from_pandas(self._test_df[columns], features=features) # Set proper names hf_dataset = hf_dataset \ .rename_column("full_text", "text") \ .rename_column("fraudulent", "labels") # Remove the index hf_dataset = hf_dataset.remove_columns("__index_level_0__") self._sampler_ratio: float = None self._hf_dataset = hf_dataset def set_tokenized_hf_dataset(self, recompute: bool = False): if (self._data_collator is not None) and (self._tokenized_dataset is not None) and (not recompute): print("Tokenized dataset already exists, recompute not set to True, returning") return self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") hf_dataset = self._hf_dataset tokenized_dataset = DatasetDict() tokenized_dataset["train"] = hf_dataset["train"].map(self._preprocess_function, batched=True) tokenized_dataset["validation"] = hf_dataset["validation"].map(self._preprocess_function, batched=True) tokenized_dataset["test"] = hf_dataset["test"].map(self._preprocess_function, 

class HuggingFaceJobDataset(JobDataset):
    """
    Tokenized view of the dataset for HuggingFace transformer models.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._hf_dataset = None
        self._tokenized_dataset = None
        self._tokenizer = None
        self._data_collator = None
        self._sampler_ratio: Optional[float] = None

        self.set_hf_dataset_dict()
        self.set_tokenized_hf_dataset()

    def set_hf_dataset_dict(self, recompute: bool = False):
        if (self._hf_dataset is not None) and (not recompute):
            print("HF dataset already exists, recompute not set to True, returning")
            return

        hf_dataset = DatasetDict()

        # Set the splits
        features = Features({
            "full_text": Value("string"),
            # 0 = legitimate posting, 1 = fraudulent posting
            "fraudulent": ClassLabel(num_classes=2, names=["0", "1"]),
            "__index_level_0__": Value("uint32")
        })
        columns = ["full_text", "fraudulent"]
        hf_dataset['train'] = Dataset.from_pandas(self._train_df[columns], features=features)
        hf_dataset['validation'] = Dataset.from_pandas(self._eval_df[columns], features=features)
        hf_dataset['test'] = Dataset.from_pandas(self._test_df[columns], features=features)

        # Set the column names expected by HF models
        hf_dataset = hf_dataset \
            .rename_column("full_text", "text") \
            .rename_column("fraudulent", "labels")

        # Remove the pandas index carried over by from_pandas
        hf_dataset = hf_dataset.remove_columns("__index_level_0__")

        self._hf_dataset = hf_dataset

    def set_tokenized_hf_dataset(self, recompute: bool = False):
        if (self._data_collator is not None) and (self._tokenized_dataset is not None) and (not recompute):
            print("Tokenized dataset already exists, recompute not set to True, returning")
            return

        self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

        hf_dataset = self._hf_dataset
        tokenized_dataset = DatasetDict()
        tokenized_dataset["train"] = hf_dataset["train"].map(self._preprocess_function, batched=True)
        tokenized_dataset["validation"] = hf_dataset["validation"].map(self._preprocess_function, batched=True)
        tokenized_dataset["test"] = hf_dataset["test"].map(self._preprocess_function, batched=True)

        self._data_collator = DataCollatorWithPadding(tokenizer=self._tokenizer)
        self._tokenized_dataset = tokenized_dataset

    def set_random_sampler_ratio(self, neg_class_ratio: float = 0.2):
        """ For randomly subsampling the negative class """
        self._sampler_ratio = neg_class_ratio

    def _get_set(self, dataset_name, dataloader, subsample):
        _ds = self._tokenized_dataset[dataset_name]
        if subsample:
            # Cap the subsample at 512 examples for quick experiments
            sample_size = min(512, len(_ds))
            _ds = _ds.shuffle(seed=42).select(range(sample_size))
        if dataloader:
            # Drop the raw text column so the collator only sees fields it
            # can convert to tensors
            _dst = _ds.remove_columns("text")
            if self._sampler_ratio is None:
                _ds = DataLoader(
                    _dst,
                    shuffle=True,
                    batch_size=self._batch_size,
                    collate_fn=self._data_collator
                )
            else:
                _ds = DataLoader(
                    _dst,
                    batch_size=self._batch_size,
                    collate_fn=self._data_collator,
                    sampler=NegClassRandomSampler(_dst, self._sampler_ratio)
                )
        return _ds

    def get_training_set(self, dataloader=True, subsample=False):
        return self._get_set("train", dataloader, subsample)

    def get_validation_set(self, dataloader=True, subsample=False):
        return self._get_set("validation", dataloader, subsample)

    def get_test_set(self, dataloader=True, subsample=False):
        return self._get_set("test", dataloader, subsample)

    def get_preprocessors(self):
        return {
            "tokenizer": self._tokenizer,
            "data_collator": self._data_collator
        }

    def _preprocess_function(self, examples):
        # Truncate only; DataCollatorWithPadding pads dynamically per batch,
        # so padding every example to the model maximum here would be wasted work.
        return self._tokenizer(examples["text"], truncation=True)
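

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch: it assumes network access to
    # download the dataset and the distilbert-base-uncased tokenizer, and must
    # be run as a module, e.g. `python -m <package>.dataset`, because of the
    # relative import of NegClassRandomSampler above).
    hf_data = HuggingFaceJobDataset(batch_size=8)
    hf_data.set_random_sampler_ratio(0.2)  # subsample the negative class
    train_loader = hf_data.get_training_set(dataloader=True, subsample=True)
    batch = next(iter(train_loader))
    # Expect input_ids, attention_mask, and labels tensors in each batch
    print({k: tuple(v.shape) for k, v in batch.items()})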