|
""" |
|
Module for storing the Dataset class which will compartmentalize things like the |
|
train-test split and shuffling (if needed.) |
|
|
|
Feel free to extend the class if you want to implement something specific to |
|
your method like dataset shuffling and batching for DL methods. |
|
""" |
|
|
|
from typing import Tuple, Optional |
|
|
|
from scipy.sparse import hstack |
|
import pandas as pd |
|
from datasets import ( |
|
load_dataset, |
|
Dataset, |
|
DatasetDict, |
|
ClassLabel, |
|
Features, |
|
Value |
|
) |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
from torch.utils.data import DataLoader |
|
from transformers import AutoTokenizer, DataCollatorWithPadding |
|
|
|
from .utils import NegClassRandomSampler |
|
|
|
|
|
class JobDataset:
    """
    Wrapper around the AEGEAN real-or-fake job-posting dataset.

    Loads the data from the HuggingFace hub, cleans it, derives extra
    features, and splits it into train/eval/test frames. Subclasses
    implement the ``get_*_set`` accessors.
    """

    def __init__(self,
                 batch_size: int = 16,
                 train_test_split: Tuple[float, float, float] = (0.7, 0.1, 0.2)):
        """
        :param batch_size: batch size stored for subclasses that batch.
        :param train_test_split: (train, eval, test) fractions.
        """
        raw = load_dataset("victor/real-or-fake-fake-jobposting-prediction")
        self._dataset: pd.DataFrame = raw['train'].to_pandas()
        # Ensure the label column is integer-typed.
        self._dataset['fraudulent'] = self._dataset['fraudulent'].astype(int)
        self._size: int = len(self._dataset)
        self._batch_size = batch_size
        self.clean_dataset()
        self.add_features()
        self.set_train_test_split(*train_test_split)

    def clean_dataset(self):
        """
        Clean up the dataset in place.
        - Replaces missing title/description values with empty strings.
        """
        text_cols = ["title", "description"]
        self._dataset[text_cols] = self._dataset[text_cols].fillna("")

    def add_features(self):
        """
        Derive additional columns on the dataset.
        - "full_text": title and description joined by a single space.
        """
        self._dataset["full_text"] = [
            " ".join(pair)
            for pair in zip(self._dataset["title"], self._dataset["description"])
        ]

    def set_train_test_split(self,
                             train_fr: float,
                             eval_fr: float,
                             test_fr: float,
                             seed: int = 42):
        """
        Split the dataset into train/eval/test frames.

        Done in two passes: the test portion is carved off first, then the
        remainder is divided between train and eval. The same seed is used
        for both passes so the split is reproducible.
        """
        # Rescale the eval fraction relative to the remaining train+eval mass.
        eval_within_rest = eval_fr / (train_fr + eval_fr)
        rest, self._test_df = train_test_split(
            self._dataset, test_size=test_fr, random_state=seed)
        self._train_df, self._eval_df = train_test_split(
            rest, test_size=eval_within_rest, random_state=seed)

    def get_training_set(self):
        """Return the training split; implemented by subclasses."""
        raise NotImplementedError

    def get_validation_set(self):
        """Return the validation split; implemented by subclasses."""
        raise NotImplementedError

    def get_test_set(self):
        """Return the test split; implemented by subclasses."""
        raise NotImplementedError
|
|
|
|
|
class SVMJobDataset(JobDataset):
    """
    JobDataset variant producing TF-IDF feature matrices for SVM-style models.

    Call ``vectorize()`` after construction; the ``get_*_set`` accessors then
    return ``(sparse_features, labels)`` pairs.
    """

    def __init__(self, vectorizer_params: Optional[dict] = None, *args, **kwargs):
        """
        :param vectorizer_params: keyword arguments for TfidfVectorizer;
            defaults to lowercased English text capped at 1000 features.
        :param args: forwarded to JobDataset.__init__ (previously the parent
            was always constructed with its defaults, unlike the HF variant).
        :param kwargs: forwarded to JobDataset.__init__.
        """
        super().__init__(*args, **kwargs)
        if vectorizer_params is None:
            vectorizer_params = {
                'lowercase': True,
                'stop_words': 'english',
                'max_features': 1_000
            }
        # Separate vectorizers so title and description get independent vocabularies.
        self._title_vectorizer = TfidfVectorizer(**vectorizer_params)
        self._description_vectorizer = TfidfVectorizer(**vectorizer_params)

    def vectorize(self):
        """
        Build the sparse feature matrices for all three splits.

        Vectorizers are fit on the training split only (eval/test are merely
        transformed) to avoid leaking their statistics; title and description
        features are horizontally stacked.
        """
        self._train_set = hstack([
            self._title_vectorizer.fit_transform(self._train_df["title"]),
            self._description_vectorizer.fit_transform(self._train_df["description"])
        ])
        self._eval_set = hstack([
            self._title_vectorizer.transform(self._eval_df["title"]),
            self._description_vectorizer.transform(self._eval_df["description"])
        ])
        self._test_set = hstack([
            self._title_vectorizer.transform(self._test_df["title"]),
            self._description_vectorizer.transform(self._test_df["description"])
        ])

    def get_training_set(self):
        """Return (train features, train labels); requires vectorize() first."""
        return self._train_set, self._train_df["fraudulent"]

    def get_validation_set(self):
        """Return (eval features, eval labels); requires vectorize() first."""
        return self._eval_set, self._eval_df["fraudulent"]

    def get_test_set(self):
        """Return (test features, test labels); requires vectorize() first."""
        return self._test_set, self._test_df["fraudulent"]
|
|
|
|
|
class HuggingFaceJobDataset(JobDataset):
    """
    JobDataset variant exposing HuggingFace ``DatasetDict`` / PyTorch
    ``DataLoader`` views for transformer fine-tuning.
    """

    def __init__(self, *args, **kwargs):
        """
        Eagerly builds the HF dataset dict and its tokenized counterpart.

        :param args: forwarded to JobDataset.__init__.
        :param kwargs: forwarded to JobDataset.__init__.
        """
        super().__init__(*args, **kwargs)
        self._hf_dataset = None
        self._tokenized_dataset = None
        self._tokenizer = None
        self._data_collator = None
        # Optional negative-class subsampling ratio; None means no subsampling.
        # Initialized here rather than in set_hf_dataset_dict, so rebuilding the
        # HF dataset (recompute=True) no longer silently resets a ratio that was
        # configured via set_random_sampler_ratio().
        self._sampler_ratio: Optional[float] = None

        self.set_hf_dataset_dict()
        self.set_tokenized_hf_dataset()

    def set_hf_dataset_dict(self, recompute: bool = False):
        """
        Convert the train/eval/test pandas frames into a ``DatasetDict`` with
        "text" and "labels" columns.

        :param recompute: rebuild even if a dataset dict already exists.
        """
        if (self._hf_dataset is not None) and (not recompute):
            print("HF dataset already exists, recompute not set to True, returning")
            return

        hf_dataset = DatasetDict()

        # NOTE(review): ClassLabel names are conventionally strings; integer
        # names appear to work for encoding here but may break str2int-style
        # lookups — confirm before relying on label names.
        features = Features({
            "full_text": Value("string"),
            "fraudulent": ClassLabel(num_classes=2, names=[0, 1]),
            # Dataset.from_pandas carries the frame's index along under this key.
            "__index_level_0__": Value("uint32")
        })
        columns = ["full_text", "fraudulent"]
        hf_dataset['train'] = Dataset.from_pandas(self._train_df[columns], features=features)
        hf_dataset['validation'] = Dataset.from_pandas(self._eval_df[columns], features=features)
        hf_dataset['test'] = Dataset.from_pandas(self._test_df[columns], features=features)

        # Column names expected downstream by transformers models.
        hf_dataset = hf_dataset \
            .rename_column("full_text", "text") \
            .rename_column("fraudulent", "labels")

        # The pandas index is not a model input; drop it.
        hf_dataset = hf_dataset.remove_columns("__index_level_0__")

        self._hf_dataset = hf_dataset

    def set_tokenized_hf_dataset(self, recompute: bool = False):
        """
        Tokenize all three splits with the distilbert tokenizer and build the
        dynamic-padding data collator.

        :param recompute: re-tokenize even if a tokenized dataset exists.
        """
        if (self._data_collator is not None) and (self._tokenized_dataset is not None) and (not recompute):
            print("Tokenized dataset already exists, recompute not set to True, returning")
            return

        self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        hf_dataset = self._hf_dataset

        tokenized_dataset = DatasetDict()
        for split in ("train", "validation", "test"):
            tokenized_dataset[split] = hf_dataset[split].map(self._preprocess_function, batched=True)

        self._data_collator = DataCollatorWithPadding(tokenizer=self._tokenizer)
        self._tokenized_dataset = tokenized_dataset

    def set_random_sampler_ratio(self, neg_class_ratio: float = 0.2):
        """
        Configure random subsampling of the negative class.

        :param neg_class_ratio: fraction of the negative class to keep when
            building DataLoaders (consumed by NegClassRandomSampler).
        """
        self._sampler_ratio = neg_class_ratio

    def _get_set(self, dataset_name: str, dataloader: bool, subsample: bool):
        """
        Return one tokenized split, optionally subsampled and/or wrapped in a
        DataLoader.

        :param dataset_name: "train", "validation" or "test".
        :param dataloader: wrap the split in a torch DataLoader.
        :param subsample: deterministically shuffle and keep at most 512 rows
            (useful for quick experiments).
        """
        _ds = self._tokenized_dataset[dataset_name]
        if subsample:
            sample_size = min(512, len(_ds))
            _ds = _ds.shuffle(seed=42).select(list(range(sample_size)))
        if dataloader:
            # Raw text cannot be collated into tensors; drop it.
            _dst = _ds.remove_columns("text")
            if self._sampler_ratio is None:
                _ds = DataLoader(
                    _dst,
                    shuffle=True,
                    batch_size=self._batch_size,
                    collate_fn=self._data_collator
                )
            else:
                # A custom sampler and shuffle=True are mutually exclusive in
                # DataLoader, so shuffle is omitted on this path.
                _ds = DataLoader(
                    _dst,
                    batch_size=self._batch_size,
                    collate_fn=self._data_collator,
                    sampler=NegClassRandomSampler(_dst, self._sampler_ratio)
                )
        return _ds

    def get_training_set(self, dataloader=True, subsample=False):
        """Return the tokenized training split (see _get_set)."""
        return self._get_set("train", dataloader, subsample)

    def get_validation_set(self, dataloader=True, subsample=False):
        """Return the tokenized validation split (see _get_set)."""
        return self._get_set("validation", dataloader, subsample)

    def get_test_set(self, dataloader=True, subsample=False):
        """Return the tokenized test split (see _get_set)."""
        return self._get_set("test", dataloader, subsample)

    def get_preprocessors(self):
        """Return the tokenizer and data collator used for this dataset."""
        return {
            "tokenizer": self._tokenizer,
            "data_collator": self._data_collator
        }

    def _preprocess_function(self, examples):
        """Tokenize a batch of examples, padding/truncating to max length."""
        return self._tokenizer(examples["text"], padding="max_length", truncation=True)