"""
Module for storing the Dataset class which will compartmentalize things like the
train-test split and shuffling (if needed.)
Feel free to extend the class if you want to implement something specific to
your method like dataset shuffling and batching for DL methods.
"""
from typing import Tuple, Optional
from scipy.sparse import hstack
import pandas as pd
from datasets import (
load_dataset,
Dataset,
DatasetDict,
ClassLabel,
Features,
Value
)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
from .utils import NegClassRandomSampler
class JobDataset:
"""
Wrapper around the AEGEAN dataset
"""
    def __init__(self,
                 batch_size: int = 16,
                 split_fractions: Tuple[float, float, float] = (0.7, 0.1, 0.2)):
        _dataset = load_dataset("victor/real-or-fake-fake-jobposting-prediction")
        self._dataset: pd.DataFrame = _dataset['train'].to_pandas()
        # Ensure the label is an integer (0 = real, 1 = fraudulent)
        self._dataset['fraudulent'] = self._dataset['fraudulent'].astype(int)
        self._size: int = len(self._dataset)
        self._batch_size = batch_size
        self.clean_dataset()
        self.add_features()
        self.set_train_test_split(*split_fractions)
def clean_dataset(self):
"""
Clean up the dataset.
- Fills None strings
- Converts label to an int
"""
self._dataset[["title", "description"]] = \
self._dataset[["title", "description"]].fillna("")
def add_features(self):
"""
Computes additional features on its own data.
- Concatenates the strings
"""
self._dataset["full_text"] = \
self._dataset[["title", "description"]].agg(' '.join, axis=1)
def set_train_test_split(self,
train_fr: float,
eval_fr: float,
test_fr: float,
seed: int = 42):
"""
Sets the train-test split. A seed is used for consistency.
"""
eval_fr = eval_fr / (train_fr + eval_fr)
_train_df, self._test_df = \
train_test_split(self._dataset, test_size=test_fr, random_state=seed)
self._train_df, self._eval_df = \
train_test_split(_train_df, test_size=eval_fr, random_state=seed)
    # Accessors for the training, eval, and test datasets.
    # The format of the dataset will depend on the model, so these are left
    # unimplemented in the base class.
def get_training_set(self):
raise NotImplementedError
def get_validation_set(self):
raise NotImplementedError
def get_test_set(self):
raise NotImplementedError
class SVMJobDataset(JobDataset):
    """
    JobDataset variant exposing TF-IDF features for classical models (SVMs).
    Call vectorize() before requesting any of the splits.
    """
def __init__(self, vectorizer_params: Optional[dict] = None):
super().__init__()
if vectorizer_params is None:
vectorizer_params = {
'lowercase': True,
'stop_words': 'english',
'max_features': 1_000
}
self._title_vectorizer = TfidfVectorizer(**vectorizer_params)
self._description_vectorizer = TfidfVectorizer(**vectorizer_params)
    def vectorize(self):
        """
        Fits the TF-IDF vectorizers on the training split and applies the
        fitted vocabularies to the eval and test splits (no test-set leakage).
        """
self._train_set = hstack([
self._title_vectorizer.fit_transform(self._train_df["title"]),
self._description_vectorizer.fit_transform(self._train_df["description"])
])
self._eval_set = hstack([
self._title_vectorizer.transform(self._eval_df["title"]),
self._description_vectorizer.transform(self._eval_df["description"])
])
self._test_set = hstack([
self._title_vectorizer.transform(self._test_df["title"]),
self._description_vectorizer.transform(self._test_df["description"])
])
def get_training_set(self):
return self._train_set, self._train_df["fraudulent"]
def get_validation_set(self):
return self._eval_set, self._eval_df["fraudulent"]
def get_test_set(self):
return self._test_set, self._test_df["fraudulent"]
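# A minimal end-to-end sketch for the SVM path (LinearSVC is one possible
# downstream classifier; it is not part of this module):
#     from sklearn.svm import LinearSVC
#     ds = SVMJobDataset()
#     ds.vectorize()
#     X_train, y_train = ds.get_training_set()
#     clf = LinearSVC().fit(X_train, y_train)
#     X_test, y_test = ds.get_test_set()
#     print(clf.score(X_test, y_test))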
class HuggingFaceJobDataset(JobDataset):
    """
    JobDataset variant exposing tokenized Hugging Face splits and PyTorch
    DataLoaders for DL models.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._hf_dataset = None
        self._tokenized_dataset = None
        self._tokenizer = None
        self._data_collator = None
        self._sampler_ratio: Optional[float] = None
        self.set_hf_dataset_dict()
        self.set_tokenized_hf_dataset()
def set_hf_dataset_dict(self, recompute: bool = False):
if (self._hf_dataset is not None) and (not recompute):
print("HF dataset already exists, recompute not set to True, returning")
return
hf_dataset = DatasetDict()
# Set the splits
        features = Features({
            "full_text": Value("string"),
            # ClassLabel names must be strings
            "fraudulent": ClassLabel(names=["0", "1"]),
            "__index_level_0__": Value("uint32")
        })
columns = ["full_text", "fraudulent"]
hf_dataset['train'] = Dataset.from_pandas(self._train_df[columns], features=features)
hf_dataset['validation'] = Dataset.from_pandas(self._eval_df[columns], features=features)
hf_dataset['test'] = Dataset.from_pandas(self._test_df[columns], features=features)
        # Rename columns to the names HF models expect
        hf_dataset = hf_dataset \
            .rename_column("full_text", "text") \
            .rename_column("fraudulent", "labels")
        # Drop the pandas index column added by from_pandas
        hf_dataset = hf_dataset.remove_columns("__index_level_0__")
        self._hf_dataset = hf_dataset
def set_tokenized_hf_dataset(self, recompute: bool = False):
if (self._data_collator is not None) and (self._tokenized_dataset is not None) and (not recompute):
print("Tokenized dataset already exists, recompute not set to True, returning")
return
        self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        # Mapping over the DatasetDict tokenizes every split in one call
        tokenized_dataset = self._hf_dataset.map(self._preprocess_function, batched=True)
        # The collator pads each batch dynamically at load time
        self._data_collator = DataCollatorWithPadding(tokenizer=self._tokenizer)
        self._tokenized_dataset = tokenized_dataset
def set_random_sampler_ratio(self, neg_class_ratio: float = 0.2):
"""
For randomly subsampling the negative class
"""
self._sampler_ratio = neg_class_ratio
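    # Example (a sketch; the exact subsampling semantics are defined by
    # utils.NegClassRandomSampler, which is not shown here):
    #     ds.set_random_sampler_ratio(0.2)
    #     train_loader = ds.get_training_set(dataloader=True)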
    def _get_set(self, dataset_name, dataloader, subsample):
        """
        Returns one tokenized split, optionally subsampled to at most 512
        examples and/or wrapped in a DataLoader.
        """
        _ds = self._tokenized_dataset[dataset_name]
        if subsample:
            sample_size = min(512, len(_ds))
            _ds = _ds.shuffle(seed=42).select(range(sample_size))
        if dataloader:
            # The raw text column cannot be collated into tensors
            _dst = _ds.remove_columns("text")
            if self._sampler_ratio is None:
                _ds = DataLoader(
                    _dst,
                    shuffle=True,
                    batch_size=self._batch_size,
                    collate_fn=self._data_collator
                )
            else:
                # DataLoader forbids passing both shuffle=True and a sampler
                _ds = DataLoader(
                    _dst,
                    batch_size=self._batch_size,
                    collate_fn=self._data_collator,
                    sampler=NegClassRandomSampler(_dst, self._sampler_ratio)
                )
        return _ds
def get_training_set(self, dataloader=True, subsample=False):
return self._get_set("train", dataloader, subsample)
def get_validation_set(self, dataloader=True, subsample=False):
return self._get_set("validation", dataloader, subsample)
def get_test_set(self, dataloader=True, subsample=False):
return self._get_set("test", dataloader, subsample)
def get_preprocessors(self):
return {
"tokenizer": self._tokenizer,
"data_collator": self._data_collator
}
    def _preprocess_function(self, examples):
        # Truncate to the model maximum; padding is left to the collator,
        # which pads each batch dynamically
        return self._tokenizer(examples["text"], truncation=True)
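# A minimal training-loop sketch for the DL path (the model choice below is an
# assumption, not part of this module):
#     from transformers import AutoModelForSequenceClassification
#     ds = HuggingFaceJobDataset(batch_size=16)
#     train_loader = ds.get_training_set(dataloader=True)
#     model = AutoModelForSequenceClassification.from_pretrained(
#         "distilbert-base-uncased", num_labels=2)
#     for batch in train_loader:
#         loss = model(**batch).loss  # labels are included in the batch
#         ...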