bvishnu123 committed
Commit: 1212df0
1 Parent(s): e7e226b

setup
- fake_job_detector/__init__.py +0 -0
- fake_job_detector/cli.py +43 -0
- fake_job_detector/dataset.py +227 -0
- fake_job_detector/models.py +263 -0
- fake_job_detector/train.py +56 -0
- fake_job_detector/utils.py +104 -0
fake_job_detector/__init__.py
ADDED
(empty file)
fake_job_detector/cli.py
ADDED
@@ -0,0 +1,43 @@
+import argparse
+
+
+# Define the models we can choose from
+class ModelEnum:
+    SVM = "svm"
+    DISTILBERT = "distilbert"
+
+    @classmethod
+    def choices(cls):
+        return [cls.SVM, cls.DISTILBERT]
+
+# Define the CLI parser
+parser = argparse.ArgumentParser(description="CLI for predicting if a job is fake based on the title and description")
+subparsers = parser.add_subparsers(title="subcommands", dest="subcommand")
+
+# Prediction sub-command
+predict_parser = subparsers.add_parser("predict", help="Make predictions using a trained model")
+predict_parser.add_argument("-m", "--model", choices=ModelEnum.choices(), required=True, help="Model to choose between SVM baseline and fine-tuned DistilBERT")
+predict_parser.add_argument("-f", "--file", required=True, help="Path to trained model folder")
+predict_parser.add_argument("--title", required=True, help="Job title to classify")
+predict_parser.add_argument("--description", required=True, help="Job description to classify")
+
+# Parse the arguments
+args = parser.parse_args()
+
+if args.subcommand == "predict":
+    print(f"""===\n\nPredicting with {args.model} using
+          title '{args.title[:50]}{'...' if len(args.title) > 50 else ''}' and
+          description '{args.description[:50]}{'...' if len(args.description) > 50 else ''}'""")
+
+    if args.model == ModelEnum.SVM:
+        print("Loading SVM model...")
+        from fake_job_detector.models import BaselineSVMModel
+        model = BaselineSVMModel()
+        model.load_model(args.file)
+
+    elif args.model == ModelEnum.DISTILBERT:
+        print("Loading DistilBERT model for CPU inference...")
+        from fake_job_detector.models import DistilBERTBaseModel
+        model = DistilBERTBaseModel(pretrained_model=args.file, cpu=True)
+
+    print(f"===\n\nJob is {'fake' if model(args.title, args.description) else 'real'}")
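A minimal invocation sketch for this CLI (the checkpoint path below is hypothetical; the argument parsing runs at module level, so the module can be executed directly):

python -m fake_job_detector.cli predict --model distilbert --file models/DistilBERTBase/base/checkpoint --title "Data entry clerk" --description "Work from home, no experience needed"

Note that the SVM branch imports BaselineSVMModel, which does not appear in the models.py added by this commit.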
fake_job_detector/dataset.py
ADDED
@@ -0,0 +1,227 @@
+"""
+Module for storing the Dataset class which will compartmentalize things like the
+train-test split and shuffling (if needed.)
+
+Feel free to extend the class if you want to implement something specific to
+your method like dataset shuffling and batching for DL methods.
+"""
+
+from typing import Tuple, Optional
+
+from scipy.sparse import hstack
+import pandas as pd
+from datasets import (
+    load_dataset,
+    Dataset,
+    DatasetDict,
+    ClassLabel,
+    Features,
+    Value
+)
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer, DataCollatorWithPadding
+
+from .utils import NegClassRandomSampler
+
+
+class JobDataset:
+    """
+    Wrapper around the AEGEAN dataset
+    """
+    def __init__(self,
+                 batch_size: int = 16,
+                 train_test_split: Tuple[float, float, float] = (0.7, 0.1, 0.2)):
+        _dataset = load_dataset("victor/real-or-fake-fake-jobposting-prediction")
+        self._dataset: pd.DataFrame = _dataset['train'].to_pandas()
+        self._dataset['fraudulent'] = self._dataset['fraudulent'].astype(int)
+        self._size: int = len(self._dataset)
+        self._batch_size = batch_size
+        self.clean_dataset()
+        self.add_features()
+        self.set_train_test_split(*train_test_split)
+
+    def clean_dataset(self):
+        """
+        Clean up the dataset.
+        - Fills None strings
+        - Converts label to an int
+        """
+        self._dataset[["title", "description"]] = \
+            self._dataset[["title", "description"]].fillna("")
+
+    def add_features(self):
+        """
+        Computes additional features on its own data.
+        - Concatenates the strings
+        """
+        self._dataset["full_text"] = \
+            self._dataset[["title", "description"]].agg(' '.join, axis=1)
+
+    def set_train_test_split(self,
+                             train_fr: float,
+                             eval_fr: float,
+                             test_fr: float,
+                             seed: int = 42):
+        """
+        Sets the train-test split. A seed is used for consistency.
+        """
+        eval_fr = eval_fr / (train_fr + eval_fr)
+        _train_df, self._test_df = \
+            train_test_split(self._dataset, test_size=test_fr, random_state=seed)
+        self._train_df, self._eval_df = \
+            train_test_split(_train_df, test_size=eval_fr, random_state=seed)
+
+    # Functions for getting the training, eval, and test dataset
+    # The format of the dataset will depend on the model, so I'll leave this unimplemented for now
+
+    def get_training_set(self):
+        raise NotImplementedError
+
+    def get_validation_set(self):
+        raise NotImplementedError
+
+    def get_test_set(self):
+        raise NotImplementedError
+
+
+class SVMJobDataset(JobDataset):
+    def __init__(self, vectorizer_params: Optional[dict] = None):
+        super().__init__()
+        if vectorizer_params is None:
+            vectorizer_params = {
+                'lowercase': True,
+                'stop_words': 'english',
+                'max_features': 1_000
+            }
+        self._title_vectorizer = TfidfVectorizer(**vectorizer_params)
+        self._description_vectorizer = TfidfVectorizer(**vectorizer_params)
+
+    def vectorize(self):
+        self._train_set = hstack([
+            self._title_vectorizer.fit_transform(self._train_df["title"]),
+            self._description_vectorizer.fit_transform(self._train_df["description"])
+        ])
+        self._eval_set = hstack([
+            self._title_vectorizer.transform(self._eval_df["title"]),
+            self._description_vectorizer.transform(self._eval_df["description"])
+        ])
+        self._test_set = hstack([
+            self._title_vectorizer.transform(self._test_df["title"]),
+            self._description_vectorizer.transform(self._test_df["description"])
+        ])
+
+    def get_training_set(self):
+        return self._train_set, self._train_df["fraudulent"]
+
+    def get_validation_set(self):
+        return self._eval_set, self._eval_df["fraudulent"]
+
+    def get_test_set(self):
+        return self._test_set, self._test_df["fraudulent"]
+
+
+class HuggingFaceJobDataset(JobDataset):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._hf_dataset = None
+        self._tokenized_dataset = None
+        self._tokenizer = None
+        self._data_collator = None
+
+        self.set_hf_dataset_dict()
+        self.set_tokenized_hf_dataset()
+
+    def set_hf_dataset_dict(self, recompute: bool = False):
+        if (self._hf_dataset is not None) and (not recompute):
+            print("HF dataset already exists, recompute not set to True, returning")
+            return
+
+        hf_dataset = DatasetDict()
+
+        # Set the splits
+        features = Features({
+            "full_text": Value("string"),
+            "fraudulent": ClassLabel(num_classes=2, names=[0,1]),
+            "__index_level_0__": Value("uint32")
+        })
+        columns = ["full_text", "fraudulent"]
+        hf_dataset['train'] = Dataset.from_pandas(self._train_df[columns], features=features)
+        hf_dataset['validation'] = Dataset.from_pandas(self._eval_df[columns], features=features)
+        hf_dataset['test'] = Dataset.from_pandas(self._test_df[columns], features=features)
+
+        # Set proper names
+        hf_dataset = hf_dataset \
+            .rename_column("full_text", "text") \
+            .rename_column("fraudulent", "labels")
+
+        # Remove the index
+        hf_dataset = hf_dataset.remove_columns("__index_level_0__")
+
+        self._sampler_ratio: float = None
+        self._hf_dataset = hf_dataset
+
+    def set_tokenized_hf_dataset(self, recompute: bool = False):
+        if (self._data_collator is not None) and (self._tokenized_dataset is not None) and (not recompute):
+            print("Tokenized dataset already exists, recompute not set to True, returning")
+            return
+
+        self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+        hf_dataset = self._hf_dataset
+
+        tokenized_dataset = DatasetDict()
+        tokenized_dataset["train"] = hf_dataset["train"].map(self._preprocess_function, batched=True)
+        tokenized_dataset["validation"] = hf_dataset["validation"].map(self._preprocess_function, batched=True)
+        tokenized_dataset["test"] = hf_dataset["test"].map(self._preprocess_function, batched=True)
+
+        self._data_collator = DataCollatorWithPadding(tokenizer=self._tokenizer)
+        self._tokenized_dataset = tokenized_dataset
+
+    def set_random_sampler_ratio(self, neg_class_ratio: float = 0.2):
+        """
+        For randomly subsampling the negative class
+        """
+        self._sampler_ratio = neg_class_ratio
+
+    def _get_set(self, dataset_name, dataloader, subsample):
+        _ds = self._tokenized_dataset[dataset_name]
+        if subsample:
+            sample_size = min(512, len(self._tokenized_dataset[dataset_name]))
+            _ds = self._tokenized_dataset[dataset_name].shuffle(seed=42).select(list(range(sample_size)))
+        if dataloader:
+            _dst = _ds.remove_columns("text")
+            if self._sampler_ratio is None:
+                _ds = DataLoader(
+                    _dst,
+                    shuffle=True,
+                    batch_size=self._batch_size,
+                    collate_fn=self._data_collator
+                )
+            else:
+                _ds = DataLoader(
+                    _dst,
+                    batch_size=self._batch_size,
+                    collate_fn=self._data_collator,
+                    sampler=NegClassRandomSampler(_dst, self._sampler_ratio)
+                )
+        return _ds
+
+    def get_training_set(self, dataloader=True, subsample=False):
+        return self._get_set("train", dataloader, subsample)
+
+    def get_validation_set(self, dataloader=True, subsample=False):
+        return self._get_set("validation", dataloader, subsample)
+
+    def get_test_set(self, dataloader=True, subsample=False):
+        return self._get_set("test", dataloader, subsample)
+
+    def get_preprocessors(self):
+        return {
+            "tokenizer": self._tokenizer,
+            "data_collator": self._data_collator
+        }
+
+    def _preprocess_function(self, examples):
+        return self._tokenizer(examples["text"], padding="max_length", truncation=True)
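A minimal usage sketch of the two dataset wrappers added above (constructing either class downloads the Hugging Face dataset, and HuggingFaceJobDataset also tokenizes it; SVMJobDataset.vectorize() must be called before the TF-IDF splits exist):

from fake_job_detector.dataset import SVMJobDataset, HuggingFaceJobDataset

# TF-IDF features for the SVM baseline
svm_data = SVMJobDataset()
svm_data.vectorize()                   # fits the title/description vectorizers on the train split
X_train, y_train = svm_data.get_training_set()

# Tokenized DataLoaders for DistilBERT
hf_data = HuggingFaceJobDataset(batch_size=16)
hf_data.set_random_sampler_ratio(0.2)  # optional: resample the negative class each epoch
train_loader = hf_data.get_training_set(dataloader=True)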
fake_job_detector/models.py
ADDED
@@ -0,0 +1,263 @@
+"""
+Module for storing the Model class, which can be used for wrapping sklearn or
+PyTorch models. This is more so that evaluation can be abstracted.
+"""
+
+import pickle
+import os
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Sklearn
+from scipy.sparse import hstack
+from sklearn.svm import SVC
+
+# Torch
+import torch
+from torch.optim import AdamW
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    get_scheduler
+)
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+from .dataset import JobDataset, SVMJobDataset, HuggingFaceJobDataset
+from .utils import FocalLoss, compute_metrics
+
+
+class Model(ABC):
+
+    # Saving and loading
+    @abstractmethod
+    def save_model(self, path: str, *args):
+        """Save the model into a serialized format (e.g. pickle, tensors)"""
+        pass
+
+    @abstractmethod
+    def load_model(self, path: str, *args):
+        """Loads the model from the serialized format"""
+        pass
+
+    # Training, inference, evaluation
+    @abstractmethod
+    def fit(self, dataset: JobDataset):
+        """Given the dataset class, train the underlying model"""
+        pass
+
+    @abstractmethod
+    def evaluate(self, dataset: JobDataset):
+        """Given the dataset class, output the evaluation metrics"""
+        pass
+
+    @abstractmethod
+    def __call__(self, *args, **kwargs):
+        """Given model inputs, predict the test set labels"""
+        pass
+
+
+
+class DistilBERTBaseModel(Model):
+    def __init__(self,
+                 pretrained_model="distilbert-base-uncased",
+                 num_labels=2,
+                 freeze=False,
+                 class_frequencies: Optional[torch.Tensor] = None,
+                 cpu=False):
+        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if cpu:
+            self._device = torch.device("cpu")
+        print("Torch device: ", repr(self._device))
+
+        self._model = AutoModelForSequenceClassification.from_pretrained(
+            pretrained_model, num_labels=num_labels
+        ).to(self._device)
+        self._tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+        # Initially we trained it with the earlier layers frozen to try and
+        # speed up training, however we eventually undid this once we've
+        # established the training loop.
+        if freeze:
+            self.freeze_layers()
+
+        # If class frequencies were provided, use them to construct the focal
+        # loss formulation
+        self._loss = None
+        if class_frequencies is not None:
+            print(f"Loading a-balanced focal loss with weights {str(class_frequencies)}")
+            self._loss = FocalLoss(
+                class_frequencies=class_frequencies
+            )
+
+        # Set defaults, calling this after the fact to re-set parameters is
+        # simple enough
+        self.set_training_args()
+
+    def freeze_layers(self, layer_prefixes: Optional[set] = None):
+        """
+        Freezes certain layers by prefixes in order to focus training on only
+        certain layers.
+        """
+
+        if layer_prefixes is None:
+            layer_prefixes = set([
+                "distilbert.embeddings",
+                "distilbert.transformer.layer.0",
+                "distilbert.transformer.layer.1",
+                "distilbert.transformer.layer.2",
+                "distilbert.transformer.layer.3",
+            ])
+
+        for name, params in self._model.named_parameters():
+            if any(prefix for prefix in layer_prefixes if name.startswith(prefix)):
+                params.requires_grad = False
+
+    def set_training_args(self, **training_args):
+        training_args.setdefault("output_dir", "../models/DistilBERTBase")
+        training_args.setdefault("learning_rate", 2e-5)
+        training_args.setdefault("per_device_train_batch_size", 16)
+        training_args.setdefault("per_device_eval_batch_size", 16)
+        training_args.setdefault("num_train_epochs", 3)
+        training_args.setdefault("weight_decay", 0.01)
+        training_args.setdefault("save_strategy", "epoch")
+        training_args.setdefault("evaluation_strategy", "epoch")
+        training_args.setdefault("logging_strategy", "epoch")
+
+        self._train_args = training_args
+
+    def save_model(self, path, checkpoint_name: str = "checkpoint"):
+        path = os.path.join(path, checkpoint_name)
+        self._model.save_pretrained(path)
+
+    def load_model(self, path):
+        self._model = AutoModelForSequenceClassification \
+            .from_pretrained(path) \
+            .to(self._device)
+
+    def fit(self,
+            dataset: HuggingFaceJobDataset,
+            subsample: bool = False,
+            plot_loss: bool = False,
+            eval_loss: bool = False):
+
+        # Set up optimizer and LR scheduler
+        train_dataloader = dataset.get_training_set(dataloader=True, subsample=subsample)
+        eval_dataloader = dataset.get_validation_set(dataloader=True, subsample=subsample)
+        num_epochs = self._train_args["num_train_epochs"]
+        num_batches = len(train_dataloader)
+        num_training_steps = num_epochs * num_batches
+        optimizer = AdamW(self._model.parameters(), lr=5e-5)
+        lr_scheduler = get_scheduler(
+            name="linear",
+            optimizer=optimizer,
+            num_warmup_steps=0,
+            num_training_steps=num_training_steps
+        )
+        progress_bar = tqdm(range(num_training_steps))
+        losses = []
+        eval_losses = []
+
+        # Training loop
+        self._model.train()
+        for epoch in range(num_epochs):
+            epoch_loss = 0.0
+            for batch in train_dataloader:
+                batch = {k: v.to(self._device) for k, v in batch.items()}
+                outputs = self._model(**batch)
+                if self._loss is None:
+                    loss = outputs.loss
+                else:
+                    logits = outputs.logits
+                    labels = batch["labels"]
+                    scores = torch.softmax(logits, dim=-1)[:len(labels), 1]
+                    loss = self._loss(scores, labels)
+                loss.backward()
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+
+                epoch_loss += loss.item()
+                losses.append(loss.item())
+            avg_loss = epoch_loss / num_batches
+            print(f"Epoch {epoch+1} avg_loss: {avg_loss:.5f}")
+
+            if eval_loss:
+                eval_epoch_loss = 0.0
+                num_eval_batches = len(eval_dataloader)
+                for batch in eval_dataloader:
+                    batch = {k: v.to(self._device) for k, v in batch.items()}
+                    with torch.no_grad():
+                        outputs = self._model(**batch)
+                    loss = outputs.loss
+                    eval_epoch_loss += loss.item()
+                    eval_losses.append(loss.item())
+                avg_loss = eval_epoch_loss / num_eval_batches
+                print(f" eval avg_loss: {avg_loss:.5f}")
+
+        # Plot the loss if requested
+        # Note that this is a moving average of the per-batch loss, which is
+        # different from the usual per-epoch loss, as we only fine-tune for a
+        # small number of epochs
+        if plot_loss:
+            kernel = np.ones(8) / 8
+            losses = np.convolve(np.array(losses), kernel, mode='valid')
+            fig, ax = plt.subplots(figsize=(10, 5))
+            ax.plot(losses, label='Training Loss (MA-8)')
+            if eval_losses:
+                ax2 = ax.twiny()
+                eval_losses = np.convolve(np.array(eval_losses), kernel, mode='valid')
+                ax2.plot(eval_losses, color='orange', label='Eval Loss (MA-8)')
+                ax2.legend()
+            ax.set_xlabel('Batch')
+            ax.set_ylabel('Average Loss')
+            ax.set_title('Loss over Batches')
+            ax.legend()
+            fig.show()
+
+    def evaluate(self, dataset: DataLoader, get_raw_results: bool = False, plot_pr_curve: bool = True):
+        self._model.eval()
+        targs_list = []
+        score_list = []
+        preds_list = []
+
+        for batch in tqdm(dataset):
+            batch = {k: v.to(self._device) for k, v in batch.items()}
+            with torch.no_grad():
+                outputs = self._model(**batch)
+
+            logits = outputs.logits
+            labels = batch["labels"]
+            scores = torch.softmax(logits, dim=-1)[:len(labels), 1]
+            predictions = torch.argmax(logits, dim=-1)
+
+            targs_list.append(labels)
+            score_list.append(scores)
+            preds_list.append(predictions)
+
+        targs = torch.concat(targs_list).cpu()
+        scores = torch.concat(score_list).cpu()
+        preds = torch.concat(preds_list).cpu()
+
+        if get_raw_results:
+            return targs, scores, preds
+        else:
+            return compute_metrics(targs, scores, preds, plot_pr_curve)
+
+    def __call__(self, title: str, description: str) -> bool:
+        inputs = self._tokenizer(
+            title + " " + description,
+            return_tensors="pt",
+            truncation=True,
+            padding=True
+        ).to(self._device)
+
+        with torch.inference_mode():
+            outputs = self._model(**inputs)
+        predictions = torch.argmax(outputs.logits, dim=-1).tolist()[0]
+        return bool(predictions)
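A rough sketch of how the DistilBERT wrapper above is meant to be driven (this downloads distilbert-base-uncased and the dataset; the output directory is hypothetical):

from fake_job_detector.dataset import HuggingFaceJobDataset
from fake_job_detector.models import DistilBERTBaseModel

dataset = HuggingFaceJobDataset()

model = DistilBERTBaseModel()                  # defaults to distilbert-base-uncased
model.set_training_args(num_train_epochs=1)    # remaining arguments keep their setdefault values
model.fit(dataset, subsample=True, eval_loss=True)

# evaluate() takes a DataLoader, so pass the tokenized validation split explicitly
metrics = model.evaluate(dataset.get_validation_set(dataloader=True), plot_pr_curve=False)
print(metrics)

model.save_model("models/DistilBERTBase/base")  # hypothetical output directory

# Single prediction: True means the posting is classified as fraudulent
print(model("Payroll assistant", "Urgent hire, send your bank details to get started."))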
fake_job_detector/train.py
ADDED
@@ -0,0 +1,56 @@
+import argparse
+from pathlib import Path
+
+from .dataset import JobDataset, HuggingFaceJobDataset
+from .utils import compute_metrics
+from .models import Model, DistilBERTBaseModel
+
+
+def train_model_from_cli(args):
+    model_name = args.model
+    model_dir = args.model_dir
+    experiment = args.experiment
+
+    if model_name == "distilbert":
+        model_title = "DistilBERTBase"
+        model_path = Path(model_dir, model_title, experiment)
+        model_path.mkdir(parents=True, exist_ok=True)
+        subsample = args.subsample
+
+        training_args = {
+            "learning_rate": args.learning_rate,
+            "per_device_train_batch_size": args.batch_size,
+            "per_device_eval_batch_size": args.batch_size,
+            "num_train_epochs": args.epochs,
+            "weight_decay": args.weight_decay,
+            "save_steps": args.save_steps,
+        }
+
+        dataset = HuggingFaceJobDataset()
+
+        model = DistilBERTBaseModel()
+        model.set_training_args(**training_args)
+        model.fit(dataset, subsample=subsample)
+        print(model.evaluate(subsample=subsample))
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Trains the fake job detector model.')
+    parser.add_argument("model", type=str, choices=["distilbert"], help="Which model to train.")
+    parser.add_argument("--model_dir", type=str, default="./models", help="Where to store the models after training.")
+    parser.add_argument("--experiment", type=str, default="base", help="Name of experiment.")
+
+    distilbert_group = parser.add_argument_group("DistilBERT training arguments")
+    distilbert_group.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate of model.")
+    distilbert_group.add_argument("--batch_size", type=int, default=16, help="Batch size when training or evaluating the model.")
+    distilbert_group.add_argument("--epochs", type=int, default=3, help="Number of epochs to train the model.")
+    distilbert_group.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay induced.")
+    distilbert_group.add_argument("--save_steps", type=int, default=5, help="Number of training steps in between checkpoints.")
+    distilbert_group.add_argument("--subsample", type=bool, default=False, help="Whether or not to use only a subsample.")
+
+    args = parser.parse_args()
+    train_model_from_cli(args)
+
+
+if __name__ == "__main__":
+    main()
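A sketch of running this training entry point (flags are the ones defined in main(); paths are whatever the default --model_dir resolves to):

python -m fake_job_detector.train distilbert --experiment base --epochs 3 --batch_size 16

Note that train_model_from_cli calls model.evaluate(subsample=subsample), while DistilBERTBaseModel.evaluate expects a DataLoader, so in practice the validation DataLoader would presumably be passed there (see the sketch after models.py above).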
fake_job_detector/utils.py
ADDED
@@ -0,0 +1,104 @@
+"""
+Module for defining utilities for training such as the negative class sampler
+and focal loss function.
+"""
+
+import numpy as np
+from sklearn.metrics import (
+    precision_recall_fscore_support,
+    precision_recall_curve,
+    auc,
+    PrecisionRecallDisplay
+)
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import Sampler
+
+
+def compute_metrics(targs, scores, preds, plot_pr_curve: bool = True):
+    precision, recall, f1, _ = precision_recall_fscore_support(targs, preds, average="binary")
+    prs, rcs, _ = precision_recall_curve(targs, scores)
+
+    if plot_pr_curve:
+        display = PrecisionRecallDisplay.from_predictions(
+            targs, scores, plot_chance_level=True
+        )
+        display.ax_.set_title("Precision-Recall curve of subsample")
+        display.figure_.show()
+
+    try:
+        pr_auc = auc(prs, rcs)
+    except ValueError:
+        print("Warning: curve is non-monotonic, returning None")
+        pr_auc = None
+
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+        'pr_auc': pr_auc
+    }
+
+
+class FocalLoss(nn.Module):
+    def __init__(self, class_frequencies: torch.Tensor, gamma: int = 2):
+        super(FocalLoss, self).__init__()
+        self.alpha = (1 / class_frequencies).to(
+            torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        )
+        self.alpha = (self.alpha / self.alpha.sum())
+        self.gamma = gamma
+
+    def forward(self, inputs, targets):
+        alpha_targets = self.alpha[targets]
+        if inputs.data.type() != targets.data.type():
+            targets = targets.type_as(inputs.data)
+        if self.alpha.type() != inputs.data.type():
+            self.alpha = self.alpha.type_as(inputs.data)
+        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
+        pt = torch.exp(-ce_loss)
+        loss = (alpha_targets * (1 - pt) ** self.gamma * ce_loss).mean()
+        return loss
+
+
+class NegClassRandomSampler(Sampler):
+    """
+    Dataloader Sampler that subsamples the negative class after each epoch.
+    The idea is that we want to keep the positive samples but select a random
+    subset of negative samples each epoch for a fresh set.
+
+    With the current settings, the sampling is done without replacement, and we
+    end up with a roughly 20% data imbalance, which should hopefully be more
+    manageable.
+    """
+
+    def __init__(self, data_source, neg_class_ratio: float = 0.2, seed: int = 42):
+        self._random_gen = np.random.default_rng(seed)
+        self.data_source = data_source
+        self._neg_class_ratio = neg_class_ratio
+
+        # Get indices of the positive and negative cases
+        self._pos_indices = np.argwhere(np.array(data_source['labels']) == 1).flatten()
+        self._neg_indices = np.argwhere(np.array(data_source['labels']) == 0).flatten()
+        self._neg_num_samples = int(len(self._neg_indices) * neg_class_ratio)
+        self._pos_num_samples = len(self._pos_indices)
+
+    @property
+    def num_samples(self):
+        return self._pos_num_samples + self._neg_num_samples
+
+    def __iter__(self):
+        """
+        Each time an iteration of this is requested, the resampling is done.
+        """
+        _neg_samples = self._random_gen.choice(self._neg_indices, self._neg_num_samples, replace=False)
+        _samples = np.concatenate((_neg_samples, self._pos_indices), axis=0)
+        self._random_gen.shuffle(_samples)
+        if (len(_samples) != len(self)):
+            raise ValueError("Length of output samples (%d) does not match expected (%d)", len(_samples), len(self))
+        return iter(_samples.tolist())
+
+    def __len__(self):
+        return self.num_samples
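A small sketch exercising the two utilities above on toy data (values are illustrative only; pr_auc may come back None when the precision array is not monotonic, which the try/except handles):

import numpy as np
from fake_job_detector.utils import NegClassRandomSampler, compute_metrics

# Toy targets, scores, and hard predictions, just to show the metric dict shape
targs  = np.array([0, 0, 1, 1, 0, 1])
scores = np.array([0.1, 0.4, 0.8, 0.3, 0.2, 0.9])
preds  = (scores > 0.5).astype(int)
print(compute_metrics(targs, scores, preds, plot_pr_curve=False))

# Toy "dataset" exposing a labels column, the way the tokenized HF dataset does
toy = {"labels": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]}
sampler = NegClassRandomSampler(toy, neg_class_ratio=0.5)
print(len(sampler))          # 2 positives + int(8 * 0.5) = 4 resampled negatives = 6
print(list(iter(sampler)))   # fresh negative subsample on every iteration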