bvishnu123 committed
Commit
1212df0
1 Parent(s): e7e226b
fake_job_detector/__init__.py ADDED
File without changes
fake_job_detector/cli.py ADDED
@@ -0,0 +1,43 @@
+ import argparse
+
+
+ # Define the models we can choose from
+ class ModelEnum:
+     SVM = "svm"
+     DISTILBERT = "distilbert"
+
+     @classmethod
+     def choices(cls):
+         return [cls.SVM, cls.DISTILBERT]
+
+ # Define the CLI parser
+ parser = argparse.ArgumentParser(description="CLI for predicting if a job is fake based on the title and description")
+ subparsers = parser.add_subparsers(title="subcommands", dest="subcommand")
+
+ # Prediction sub-command
+ predict_parser = subparsers.add_parser("predict", help="Make predictions using a trained model")
+ predict_parser.add_argument("-m", "--model", choices=ModelEnum.choices(), required=True, help="Model to choose between SVM baseline and fine-tuned DistilBERT")
+ predict_parser.add_argument("-f", "--file", required=True, help="Path to trained model folder")
+ predict_parser.add_argument("--title", required=True, help="Job title to classify")
+ predict_parser.add_argument("--description", required=True, help="Job description to classify")
+
+ # Parse the arguments
+ args = parser.parse_args()
+
+ if args.subcommand == "predict":
+     print(f"""===\n\nPredicting with {args.model} using
+     title '{args.title[:50]}{'...' if len(args.title) > 50 else ''}' and
+     description '{args.description[:50]}{'...' if len(args.description) > 50 else ''}'""")
+
+     if args.model == ModelEnum.SVM:
+         print("Loading SVM model...")
+         from fake_job_detector.models import BaselineSVMModel
+         model = BaselineSVMModel()
+         model.load_model(args.file)
+
+     elif args.model == ModelEnum.DISTILBERT:
+         print("Loading DistilBERT model for CPU inference...")
+         from fake_job_detector.models import DistilBERTBaseModel
+         model = DistilBERTBaseModel(pretrained_model=args.file, cpu=True)
+
+     print(f"===\n\nJob is {'fake' if model(args.title, args.description) else 'real'}")
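For context, a minimal programmatic equivalent of the predict sub-command above. This is a sketch only: the checkpoint folder path is a placeholder (this commit does not ship a trained model), and it assumes the package is importable.

    # Sketch: the checkpoint path below is a placeholder, not something created by this commit.
    from fake_job_detector.models import DistilBERTBaseModel

    model = DistilBERTBaseModel(pretrained_model="./models/DistilBERTBase/base/checkpoint", cpu=True)
    is_fake = model("Data entry clerk", "Earn money from home, no experience required")
    print("fake" if is_fake else "real")

The CLI route performs the same flow, e.g. python -m fake_job_detector.cli predict -m distilbert -f <checkpoint-folder> --title "..." --description "...".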
fake_job_detector/dataset.py ADDED
@@ -0,0 +1,227 @@
+ """
+ Module for storing the Dataset class which will compartmentalize things like the
+ train-test split and shuffling (if needed).
+
+ Feel free to extend the class if you want to implement something specific to
+ your method like dataset shuffling and batching for DL methods.
+ """
+
+ from typing import Tuple, Optional
+
+ from scipy.sparse import hstack
+ import pandas as pd
+ from datasets import (
+     load_dataset,
+     Dataset,
+     DatasetDict,
+     ClassLabel,
+     Features,
+     Value
+ )
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ from torch.utils.data import DataLoader
+ from transformers import AutoTokenizer, DataCollatorWithPadding
+
+ from .utils import NegClassRandomSampler
+
+
+ class JobDataset:
+     """
+     Wrapper around the AEGEAN dataset
+     """
+     def __init__(self,
+                  batch_size: int = 16,
+                  train_test_split: Tuple[float, float, float] = (0.7, 0.1, 0.2)):
+         _dataset = load_dataset("victor/real-or-fake-fake-jobposting-prediction")
+         self._dataset: pd.DataFrame = _dataset['train'].to_pandas()
+         self._dataset['fraudulent'] = self._dataset['fraudulent'].astype(int)
+         self._size: int = len(self._dataset)
+         self._batch_size = batch_size
+         self.clean_dataset()
+         self.add_features()
+         self.set_train_test_split(*train_test_split)
+
+     def clean_dataset(self):
+         """
+         Clean up the dataset.
+         - Fills None strings
+         - Converts label to an int
+         """
+         self._dataset[["title", "description"]] = \
+             self._dataset[["title", "description"]].fillna("")
+
+     def add_features(self):
+         """
+         Computes additional features on its own data.
+         - Concatenates the strings
+         """
+         self._dataset["full_text"] = \
+             self._dataset[["title", "description"]].agg(' '.join, axis=1)
+
+     def set_train_test_split(self,
+                              train_fr: float,
+                              eval_fr: float,
+                              test_fr: float,
+                              seed: int = 42):
+         """
+         Sets the train-test split. A seed is used for consistency.
+         """
+         eval_fr = eval_fr / (train_fr + eval_fr)
+         _train_df, self._test_df = \
+             train_test_split(self._dataset, test_size=test_fr, random_state=seed)
+         self._train_df, self._eval_df = \
+             train_test_split(_train_df, test_size=eval_fr, random_state=seed)
+
+     # Functions for getting the training, eval, and test dataset
+     # The format of the dataset will depend on the model, so I'll leave this unimplemented for now
+
+     def get_training_set(self):
+         raise NotImplementedError
+
+     def get_validation_set(self):
+         raise NotImplementedError
+
+     def get_test_set(self):
+         raise NotImplementedError
+
+
+ class SVMJobDataset(JobDataset):
+     def __init__(self, vectorizer_params: Optional[dict] = None):
+         super().__init__()
+         if vectorizer_params is None:
+             vectorizer_params = {
+                 'lowercase': True,
+                 'stop_words': 'english',
+                 'max_features': 1_000
+             }
+         self._title_vectorizer = TfidfVectorizer(**vectorizer_params)
+         self._description_vectorizer = TfidfVectorizer(**vectorizer_params)
+
+     def vectorize(self):
+         self._train_set = hstack([
+             self._title_vectorizer.fit_transform(self._train_df["title"]),
+             self._description_vectorizer.fit_transform(self._train_df["description"])
+         ])
+         self._eval_set = hstack([
+             self._title_vectorizer.transform(self._eval_df["title"]),
+             self._description_vectorizer.transform(self._eval_df["description"])
+         ])
+         self._test_set = hstack([
+             self._title_vectorizer.transform(self._test_df["title"]),
+             self._description_vectorizer.transform(self._test_df["description"])
+         ])
+
+     def get_training_set(self):
+         return self._train_set, self._train_df["fraudulent"]
+
+     def get_validation_set(self):
+         return self._eval_set, self._eval_df["fraudulent"]
+
+     def get_test_set(self):
+         return self._test_set, self._test_df["fraudulent"]
+
+
+ class HuggingFaceJobDataset(JobDataset):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._hf_dataset = None
+         self._tokenized_dataset = None
+         self._tokenizer = None
+         self._data_collator = None
+
+         self.set_hf_dataset_dict()
+         self.set_tokenized_hf_dataset()
+
+     def set_hf_dataset_dict(self, recompute: bool = False):
+         if (self._hf_dataset is not None) and (not recompute):
+             print("HF dataset already exists, recompute not set to True, returning")
+             return
+
+         hf_dataset = DatasetDict()
+
+         # Set the splits
+         features = Features({
+             "full_text": Value("string"),
+             "fraudulent": ClassLabel(num_classes=2, names=[0, 1]),
+             "__index_level_0__": Value("uint32")
+         })
+         columns = ["full_text", "fraudulent"]
+         hf_dataset['train'] = Dataset.from_pandas(self._train_df[columns], features=features)
+         hf_dataset['validation'] = Dataset.from_pandas(self._eval_df[columns], features=features)
+         hf_dataset['test'] = Dataset.from_pandas(self._test_df[columns], features=features)
+
+         # Set proper names
+         hf_dataset = hf_dataset \
+             .rename_column("full_text", "text") \
+             .rename_column("fraudulent", "labels")
+
+         # Remove the index
+         hf_dataset = hf_dataset.remove_columns("__index_level_0__")
+
+         self._sampler_ratio: Optional[float] = None
+         self._hf_dataset = hf_dataset
+
+     def set_tokenized_hf_dataset(self, recompute: bool = False):
+         if (self._data_collator is not None) and (self._tokenized_dataset is not None) and (not recompute):
+             print("Tokenized dataset already exists, recompute not set to True, returning")
+             return
+
+         self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+         hf_dataset = self._hf_dataset
+
+         tokenized_dataset = DatasetDict()
+         tokenized_dataset["train"] = hf_dataset["train"].map(self._preprocess_function, batched=True)
+         tokenized_dataset["validation"] = hf_dataset["validation"].map(self._preprocess_function, batched=True)
+         tokenized_dataset["test"] = hf_dataset["test"].map(self._preprocess_function, batched=True)
+
+         self._data_collator = DataCollatorWithPadding(tokenizer=self._tokenizer)
+         self._tokenized_dataset = tokenized_dataset
+
+     def set_random_sampler_ratio(self, neg_class_ratio: float = 0.2):
+         """
+         For randomly subsampling the negative class
+         """
+         self._sampler_ratio = neg_class_ratio
+
+     def _get_set(self, dataset_name, dataloader, subsample):
+         _ds = self._tokenized_dataset[dataset_name]
+         if subsample:
+             sample_size = min(512, len(self._tokenized_dataset[dataset_name]))
+             _ds = self._tokenized_dataset[dataset_name].shuffle(seed=42).select(list(range(sample_size)))
+         if dataloader:
+             _dst = _ds.remove_columns("text")
+             if self._sampler_ratio is None:
+                 _ds = DataLoader(
+                     _dst,
+                     shuffle=True,
+                     batch_size=self._batch_size,
+                     collate_fn=self._data_collator
+                 )
+             else:
+                 _ds = DataLoader(
+                     _dst,
+                     batch_size=self._batch_size,
+                     collate_fn=self._data_collator,
+                     sampler=NegClassRandomSampler(_dst, self._sampler_ratio)
+                 )
+         return _ds
+
+     def get_training_set(self, dataloader=True, subsample=False):
+         return self._get_set("train", dataloader, subsample)
+
+     def get_validation_set(self, dataloader=True, subsample=False):
+         return self._get_set("validation", dataloader, subsample)
+
+     def get_test_set(self, dataloader=True, subsample=False):
+         return self._get_set("test", dataloader, subsample)
+
+     def get_preprocessors(self):
+         return {
+             "tokenizer": self._tokenizer,
+             "data_collator": self._data_collator
+         }
+
+     def _preprocess_function(self, examples):
+         return self._tokenizer(examples["text"], padding="max_length", truncation=True)
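To make the intended call sequence of the two wrappers concrete, a short usage sketch follows; both constructors download the AEGEAN dataset from the Hugging Face Hub, so network access is assumed.

    from fake_job_detector.dataset import SVMJobDataset, HuggingFaceJobDataset

    # TF-IDF features for the SVM baseline: fit on the train split, transform eval/test.
    svm_data = SVMJobDataset()
    svm_data.vectorize()
    X_train, y_train = svm_data.get_training_set()

    # Tokenized DataLoaders for DistilBERT; subsample=True caps each split at 512 rows.
    hf_data = HuggingFaceJobDataset(batch_size=16)
    train_loader = hf_data.get_training_set(dataloader=True, subsample=True)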
fake_job_detector/models.py ADDED
@@ -0,0 +1,263 @@
+ """
+ Module for storing the Model class, which can be used for wrapping sklearn or
+ PyTorch models. This is more so that evaluation can be abstracted.
+ """
+
+ import pickle
+ import os
+ from abc import ABC, abstractmethod
+ from typing import Optional
+
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ # Sklearn
+ from scipy.sparse import hstack
+ from sklearn.svm import SVC
+
+ # Torch
+ import torch
+ from torch.optim import AdamW
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+     get_scheduler
+ )
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+
+ from .dataset import JobDataset, SVMJobDataset, HuggingFaceJobDataset
+ from .utils import FocalLoss, compute_metrics
+
+
+ class Model(ABC):
+
+     # Saving and loading
+     @abstractmethod
+     def save_model(self, path: str, *args):
+         """Save the model into a serialized format (e.g. pickle, tensors)"""
+         pass
+
+     @abstractmethod
+     def load_model(self, path: str, *args):
+         """Loads the model from the serialized format"""
+         pass
+
+     # Training, inference, evaluation
+     @abstractmethod
+     def fit(self, dataset: JobDataset):
+         """Given the dataset class, train the underlying model"""
+         pass
+
+     @abstractmethod
+     def evaluate(self, dataset: JobDataset):
+         """Given the dataset class, output the evaluation metrics"""
+         pass
+
+     @abstractmethod
+     def __call__(self, *args, **kwargs):
+         """Given model inputs, predict the test set labels"""
+         pass
+
+
+ class DistilBERTBaseModel(Model):
+     def __init__(self,
+                  pretrained_model="distilbert-base-uncased",
+                  num_labels=2,
+                  freeze=False,
+                  class_frequencies: Optional[torch.Tensor] = None,
+                  cpu=False):
+         self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         if cpu:
+             self._device = torch.device("cpu")
+         print("Torch device: ", repr(self._device))
+
+         self._model = AutoModelForSequenceClassification.from_pretrained(
+             pretrained_model, num_labels=num_labels
+         ).to(self._device)
+         self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+         # Initially we trained with the earlier layers frozen to try to speed up
+         # training; however, we eventually undid this once we had established
+         # the training loop.
+         if freeze:
+             self.freeze_layers()
+
+         # If class frequencies were provided, use them to construct the focal
+         # loss formulation
+         self._loss = None
+         if class_frequencies is not None:
+             print(f"Loading alpha-balanced focal loss with weights {str(class_frequencies)}")
+             self._loss = FocalLoss(
+                 class_frequencies=class_frequencies
+             )
+
+         # Set defaults; calling this after the fact to re-set parameters is
+         # simple enough
+         self.set_training_args()
+
+     def freeze_layers(self, layer_prefixes: Optional[set] = None):
+         """
+         Freezes certain layers by prefixes in order to focus training on only
+         certain layers.
+         """
+
+         if layer_prefixes is None:
+             layer_prefixes = set([
+                 "distilbert.embeddings",
+                 "distilbert.transformer.layer.0",
+                 "distilbert.transformer.layer.1",
+                 "distilbert.transformer.layer.2",
+                 "distilbert.transformer.layer.3",
+             ])
+
+         for name, params in self._model.named_parameters():
+             if any(name.startswith(prefix) for prefix in layer_prefixes):
+                 params.requires_grad = False
+
+     def set_training_args(self, **training_args):
+         training_args.setdefault("output_dir", "../models/DistilBERTBase")
+         training_args.setdefault("learning_rate", 2e-5)
+         training_args.setdefault("per_device_train_batch_size", 16)
+         training_args.setdefault("per_device_eval_batch_size", 16)
+         training_args.setdefault("num_train_epochs", 3)
+         training_args.setdefault("weight_decay", 0.01)
+         training_args.setdefault("save_strategy", "epoch")
+         training_args.setdefault("evaluation_strategy", "epoch")
+         training_args.setdefault("logging_strategy", "epoch")
+
+         self._train_args = training_args
+
+     def save_model(self, path, checkpoint_name: str = "checkpoint"):
+         path = os.path.join(path, checkpoint_name)
+         self._model.save_pretrained(path)
+
+     def load_model(self, path):
+         self._model = AutoModelForSequenceClassification \
+             .from_pretrained(path) \
+             .to(self._device)
+
+     def fit(self,
+             dataset: HuggingFaceJobDataset,
+             subsample: bool = False,
+             plot_loss: bool = False,
+             eval_loss: bool = False):
+
+         # Set up optimizer and LR scheduler
+         train_dataloader = dataset.get_training_set(dataloader=True, subsample=subsample)
+         eval_dataloader = dataset.get_validation_set(dataloader=True, subsample=subsample)
+         num_epochs = self._train_args["num_train_epochs"]
+         num_batches = len(train_dataloader)
+         num_training_steps = num_epochs * num_batches
+         optimizer = AdamW(self._model.parameters(), lr=self._train_args["learning_rate"])
+         lr_scheduler = get_scheduler(
+             name="linear",
+             optimizer=optimizer,
+             num_warmup_steps=0,
+             num_training_steps=num_training_steps
+         )
+         progress_bar = tqdm(range(num_training_steps))
+         losses = []
+         eval_losses = []
+
+         # Training loop
+         self._model.train()
+         for epoch in range(num_epochs):
+             epoch_loss = 0.0
+             for batch in train_dataloader:
+                 batch = {k: v.to(self._device) for k, v in batch.items()}
+                 outputs = self._model(**batch)
+                 if self._loss is None:
+                     loss = outputs.loss
+                 else:
+                     logits = outputs.logits
+                     labels = batch["labels"]
+                     scores = torch.softmax(logits, dim=-1)[:len(labels), 1]
+                     loss = self._loss(scores, labels)
+                 loss.backward()
+
+                 optimizer.step()
+                 lr_scheduler.step()
+                 optimizer.zero_grad()
+                 progress_bar.update(1)
+
+                 epoch_loss += loss.item()
+                 losses.append(loss.item())
+             avg_loss = epoch_loss / num_batches
+             print(f"Epoch {epoch+1} avg_loss: {avg_loss:.5f}")
+
+             if eval_loss:
+                 eval_epoch_loss = 0.0
+                 num_eval_batches = len(eval_dataloader)
+                 for batch in eval_dataloader:
+                     batch = {k: v.to(self._device) for k, v in batch.items()}
+                     with torch.no_grad():
+                         outputs = self._model(**batch)
+                     loss = outputs.loss
+                     eval_epoch_loss += loss.item()
+                     eval_losses.append(loss.item())
+                 avg_loss = eval_epoch_loss / num_eval_batches
+                 print(f" eval avg_loss: {avg_loss:.5f}")
+
+         # Plot the loss if requested
+         # Note that this is a moving average of the per-batch loss, which is
+         # different from the usual per-epoch loss, as we only fine-tune for a
+         # small number of epochs
+         if plot_loss:
+             kernel = np.ones(8) / 8
+             losses = np.convolve(np.array(losses), kernel, mode='valid')
+             fig, ax = plt.subplots(figsize=(10, 5))
+             ax.plot(losses, label='Training Loss (MA-8)')
+             if eval_losses:
+                 ax2 = ax.twiny()
+                 eval_losses = np.convolve(np.array(eval_losses), kernel, mode='valid')
+                 ax2.plot(eval_losses, color='orange', label='Eval Loss (MA-8)')
+                 ax2.legend()
+             ax.set_xlabel('Batch')
+             ax.set_ylabel('Average Loss')
+             ax.set_title('Loss over Batches')
+             ax.legend()
+             fig.show()
+
+     def evaluate(self, dataset: DataLoader, get_raw_results: bool = False, plot_pr_curve: bool = True):
+         self._model.eval()
+         targs_list = []
+         score_list = []
+         preds_list = []
+
+         for batch in tqdm(dataset):
+             batch = {k: v.to(self._device) for k, v in batch.items()}
+             with torch.no_grad():
+                 outputs = self._model(**batch)
+
+             logits = outputs.logits
+             labels = batch["labels"]
+             scores = torch.softmax(logits, dim=-1)[:len(labels), 1]
+             predictions = torch.argmax(logits, dim=-1)
+
+             targs_list.append(labels)
+             score_list.append(scores)
+             preds_list.append(predictions)
+
+         targs = torch.concat(targs_list).cpu()
+         scores = torch.concat(score_list).cpu()
+         preds = torch.concat(preds_list).cpu()
+
+         if get_raw_results:
+             return targs, scores, preds
+         else:
+             return compute_metrics(targs, scores, preds, plot_pr_curve)
+
+     def __call__(self, title: str, description: str) -> bool:
+         inputs = self._tokenizer(
+             title + " " + description,
+             return_tensors="pt",
+             truncation=True,
+             padding=True
+         ).to(self._device)
+
+         with torch.inference_mode():
+             outputs = self._model(**inputs)
+         predictions = torch.argmax(outputs.logits, dim=-1).tolist()[0]
+         return bool(predictions)
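A minimal fine-tuning and evaluation sketch using the classes above; the hyperparameter values are illustrative rather than ones prescribed by this commit.

    from fake_job_detector.dataset import HuggingFaceJobDataset
    from fake_job_detector.models import DistilBERTBaseModel

    dataset = HuggingFaceJobDataset()
    model = DistilBERTBaseModel()  # pass class_frequencies=... to switch to the focal loss
    model.set_training_args(num_train_epochs=1, per_device_train_batch_size=16)
    model.fit(dataset, subsample=True)  # quick run on the 512-row subsamples
    metrics = model.evaluate(dataset.get_validation_set(dataloader=True, subsample=True))
    print(metrics)  # dict with precision, recall, f1, pr_auc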
fake_job_detector/train.py ADDED
@@ -0,0 +1,56 @@
+ import argparse
+ from pathlib import Path
+
+ from .dataset import JobDataset, HuggingFaceJobDataset
+ from .utils import compute_metrics
+ from .models import Model, DistilBERTBaseModel
+
+
+ def train_model_from_cli(args):
+     model_name = args.model
+     model_dir = args.model_dir
+     experiment = args.experiment
+
+     if model_name == "distilbert":
+         model_title = "DistilBERTBase"
+         model_path = Path(model_dir, model_title, experiment)
+         model_path.mkdir(parents=True, exist_ok=True)
+         subsample = args.subsample
+
+         training_args = {
+             "learning_rate": args.learning_rate,
+             "per_device_train_batch_size": args.batch_size,
+             "per_device_eval_batch_size": args.batch_size,
+             "num_train_epochs": args.epochs,
+             "weight_decay": args.weight_decay,
+             "save_steps": args.save_steps,
+         }
+
+         dataset = HuggingFaceJobDataset()
+
+         model = DistilBERTBaseModel()
+         model.set_training_args(**training_args)
+         model.fit(dataset, subsample=subsample)
+         print(model.evaluate(dataset.get_test_set(dataloader=True, subsample=subsample)))
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='Trains the fake job detector model.')
+     parser.add_argument("model", type=str, choices=["distilbert"], help="Which model to train.")
+     parser.add_argument("--model_dir", type=str, default="./models", help="Where to store the models after training.")
+     parser.add_argument("--experiment", type=str, default="base", help="Name of the experiment.")
+
+     distilbert_group = parser.add_argument_group("DistilBERT training arguments")
+     distilbert_group.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate of the model.")
+     distilbert_group.add_argument("--batch_size", type=int, default=16, help="Batch size when training or evaluating the model.")
+     distilbert_group.add_argument("--epochs", type=int, default=3, help="Number of epochs to train the model.")
+     distilbert_group.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay applied during training.")
+     distilbert_group.add_argument("--save_steps", type=int, default=5, help="Number of training steps between checkpoints.")
+     distilbert_group.add_argument("--subsample", action="store_true", help="Whether or not to train on only a subsample.")
+
+     args = parser.parse_args()
+     train_model_from_cli(args)
+
+
+ if __name__ == "__main__":
+     main()
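The entry point above can also be driven without the shell, for example from a notebook. The Namespace below simply mirrors the CLI arguments with a smaller epoch count for a smoke test; the values are illustrative.

    from argparse import Namespace
    from fake_job_detector.train import train_model_from_cli

    # Equivalent to: python -m fake_job_detector.train distilbert --epochs 1 --subsample
    args = Namespace(model="distilbert", model_dir="./models", experiment="smoke-test",
                     learning_rate=2e-5, batch_size=16, epochs=1, weight_decay=0.01,
                     save_steps=5, subsample=True)
    train_model_from_cli(args)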
fake_job_detector/utils.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Module for defining utilities for training such as the negative class sampler
+ and focal loss function.
+ """
+
+ import numpy as np
+ from sklearn.metrics import (
+     precision_recall_fscore_support,
+     precision_recall_curve,
+     auc,
+     PrecisionRecallDisplay
+ )
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import Sampler
+
+
+ def compute_metrics(targs, scores, preds, plot_pr_curve: bool = True):
+     precision, recall, f1, _ = precision_recall_fscore_support(targs, preds, average="binary")
+     prs, rcs, _ = precision_recall_curve(targs, scores)
+
+     if plot_pr_curve:
+         display = PrecisionRecallDisplay.from_predictions(
+             targs, scores, plot_chance_level=True
+         )
+         display.ax_.set_title("Precision-Recall curve of subsample")
+         display.figure_.show()
+
+     try:
+         # Area under the precision-recall curve: recall on the x-axis, precision on the y-axis
+         pr_auc = auc(rcs, prs)
+     except ValueError:
+         print("Warning: curve is non-monotonic, returning None")
+         pr_auc = None
+
+     return {
+         'precision': precision,
+         'recall': recall,
+         'f1': f1,
+         'pr_auc': pr_auc
+     }
+
+
+ class FocalLoss(nn.Module):
+     def __init__(self, class_frequencies: torch.Tensor, gamma: int = 2):
+         super(FocalLoss, self).__init__()
+         self.alpha = (1 / class_frequencies).to(
+             torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         )
+         self.alpha = (self.alpha / self.alpha.sum())
+         self.gamma = gamma
+
+     def forward(self, inputs, targets):
+         alpha_targets = self.alpha[targets]
+         if inputs.data.type() != targets.data.type():
+             targets = targets.type_as(inputs.data)
+         if self.alpha.type() != inputs.data.type():
+             self.alpha = self.alpha.type_as(inputs.data)
+         ce_loss = F.cross_entropy(inputs, targets, reduction='none')
+         pt = torch.exp(-ce_loss)
+         loss = (alpha_targets * (1 - pt) ** self.gamma * ce_loss).mean()
+         return loss
+
+
+ class NegClassRandomSampler(Sampler):
+     """
+     Dataloader Sampler that subsamples the negative class after each epoch.
+     The idea is that we want to keep the positive samples but select a random
+     subset of negative samples each epoch for a fresh set.
+
+     With the current settings, the sampling is done without replacement, and we
+     end up with a roughly 20% class imbalance, which should hopefully be more
+     manageable.
+     """
+
+     def __init__(self, data_source, neg_class_ratio: float = 0.2, seed: int = 42):
+         self._random_gen = np.random.default_rng(seed)
+         self.data_source = data_source
+         self._neg_class_ratio = neg_class_ratio
+
+         # Get indices of the positive and negative cases
+         self._pos_indices = np.argwhere(np.array(data_source['labels']) == 1).flatten()
+         self._neg_indices = np.argwhere(np.array(data_source['labels']) == 0).flatten()
+         self._neg_num_samples = int(len(self._neg_indices) * neg_class_ratio)
+         self._pos_num_samples = len(self._pos_indices)
+
+     @property
+     def num_samples(self):
+         return self._pos_num_samples + self._neg_num_samples
+
+     def __iter__(self):
+         """
+         Each time an iteration of this is requested, the resampling is done.
+         """
+         _neg_samples = self._random_gen.choice(self._neg_indices, self._neg_num_samples, replace=False)
+         _samples = np.concatenate((_neg_samples, self._pos_indices), axis=0)
+         self._random_gen.shuffle(_samples)
+         if len(_samples) != len(self):
+             raise ValueError(f"Length of output samples ({len(_samples)}) does not match expected ({len(self)})")
+         return iter(_samples.tolist())
+
+     def __len__(self):
+         return self.num_samples
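To make the sampler's behaviour concrete, a tiny self-contained check; the toy label distribution is invented purely for illustration.

    from datasets import Dataset
    from fake_job_detector.utils import NegClassRandomSampler

    toy = Dataset.from_dict({"labels": [0] * 90 + [1] * 10})
    sampler = NegClassRandomSampler(toy, neg_class_ratio=0.2)
    print(len(sampler))             # 18 resampled negatives + all 10 positives = 28
    print(list(iter(sampler))[:5])  # a fresh shuffled subset of indices on every __iter__ call

Passed to a DataLoader (as HuggingFaceJobDataset._get_set does when a sampler ratio is set), this redraws the negative subset each epoch while keeping every positive example.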