Eappelson committed on
Commit 037b0cf
1 Parent(s): c09fb66

Upload final_classifier.py

Files changed (1)
  1. final_classifier.py +245 -0
final_classifier.py ADDED
@@ -0,0 +1,245 @@
+ # -*- coding: utf-8 -*-
+ """final_classifier.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1i2uCPCvqnax-vpQBo43Ri8ivTe0HnqKK
+
+ # Installing Packages
+ """
+
+ # Only install once and then reset the runtime
+ !pip install accelerate
+ !pip install optuna
+
+ """# Loading Libraries"""
+
+ # Loading packages
+ import numpy as np
+ import pandas as pd
+ import torch
+ import optuna
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from torch.utils.data import Dataset
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     Trainer,
+     TrainingArguments,
+     DataCollatorWithPadding,
+ )
+ from google.colab import drive
+
+ """# Importing and Cleaning Data"""
+
+ # Read the data
+ drive.mount('/content/drive')
+
+ bias = pd.read_csv('/content/drive/MyDrive/hackathon/misdirection.csv')
+
+ # Keep only the well-formatted columns (conversation_id through unique_id)
+ clean_bias = bias.loc[:, 'conversation_id':'unique_id']
+
+ # Filter to just accepted vs. rejected submissions
+ clean_bias = clean_bias[clean_bias['submission_grade'].isin(['accepted', 'rejected'])]
+
+ # Remove all rows with NA in the user column (these do not help)
+ clean_bias = clean_bias.dropna(subset=['user'])
+
+ # Group by unique_id and join each conversation's user prompts into a single paragraph
+ grouped = clean_bias.groupby('unique_id')['user'].apply(lambda x: ' '.join(x)).reset_index()
+
+ # The predictor variable: the joined prompt paragraphs
+ X = grouped["user"].astype(str).tolist()
+
+ # The predicted variable: each conversation's final grade, mapped to a binary label
+ y = clean_bias.groupby('unique_id')['submission_grade'].apply(lambda x: x.iloc[-1]).map({'rejected': 'non-violation', 'accepted': 'violation'}).tolist()
+
+ # Split the data so that y is stratified
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1, stratify=y)
+
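+ # Optional sanity check (illustrative): confirm that stratification preserved
+ # the class balance in both splits.
+ print(pd.Series(y_train).value_counts(normalize=True))
+ print(pd.Series(y_test).value_counts(normalize=True))
+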
+ """# Tokenizing Data"""
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+ model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+
+ # Tokenize the data
+ train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=256)
+ test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=256)
+
+ # Creating a custom dataset; labels may arrive as strings or as 0/1 integers,
+ # so __getitem__ maps strings and passes integers through unchanged
+ class CustomDataset(Dataset):
+     def __init__(self, encodings, labels):
+         self.encodings = encodings
+         self.labels = labels
+
+     def __getitem__(self, idx):
+         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+         label = self.labels[idx]
+         if isinstance(label, str):
+             label = 0 if label == 'non-violation' else 1
+         item['labels'] = torch.tensor(label, dtype=torch.long)
+         return item
+
+     def __len__(self):
+         return len(self.labels)
+
+ # Create the dataset objects
+ train_dataset = CustomDataset(train_encodings, [0 if label == 'non-violation' else 1 for label in y_train])
+ test_dataset = CustomDataset(test_encodings, [0 if label == 'non-violation' else 1 for label in y_test])
+
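+ # Optional sanity check (illustrative): inspect one encoded example to confirm
+ # tensor shapes and the 0/1 label mapping before training.
+ sample = train_dataset[0]
+ print(sample['input_ids'].shape, sample['labels'])
+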
+ """# Creating Model"""
+
+ # Defining the metrics
+ def compute_metrics(pred):
+     labels = pred.label_ids
+     preds = pred.predictions.argmax(-1)
+     accuracy = accuracy_score(labels, preds)
+     precision = precision_score(labels, preds, average='weighted')
+     recall = recall_score(labels, preds, average='weighted')
+     f1 = f1_score(labels, preds, average='weighted')
+
+     return {
+         "accuracy": accuracy,
+         "precision": precision,
+         "recall": recall,
+         "f1": f1
+     }
+
+ # Objective function for Optuna
+ def objective(trial):
+     # Hyperparameters to tune (dropout helps prevent overfitting)
+     dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
+     training_args = TrainingArguments(
+         output_dir="./misdirection_classification",
+         learning_rate=trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
+         per_device_train_batch_size=trial.suggest_categorical('batch_size', [8, 16, 32]),
+         gradient_accumulation_steps=2,
+         num_train_epochs=trial.suggest_int('num_train_epochs', 3, 10),
+         weight_decay=trial.suggest_float('weight_decay', 1e-4, 1e-1, log=True),
+         save_strategy="epoch",
+         evaluation_strategy="epoch",
+         logging_dir="./logs",
+         logging_steps=10,
+         load_best_model_at_end=True,
+         metric_for_best_model="f1",
+         push_to_hub=False,
+     )
+
+     # Tokenizing the data for this trial
+     train_encodings_fold = tokenizer(X_train, truncation=True, padding=True, max_length=256)
+     val_encodings_fold = tokenizer(X_test, truncation=True, padding=True, max_length=256)
+
+     # Creating dataset objects (string labels are mapped to ints inside CustomDataset)
+     train_dataset_fold = CustomDataset(train_encodings_fold, y_train)
+     val_dataset_fold = CustomDataset(val_encodings_fold, y_test)
+
+     # Initializing a fresh model for this trial
+     model_fold = model_init(dropout_rate)
+
+     # Defining the trainer
+     trainer = Trainer(
+         model=model_fold,
+         args=training_args,
+         train_dataset=train_dataset_fold,
+         eval_dataset=val_dataset_fold,
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics,
+     )
+
+     # Training the model
+     trainer.train()
+
+     eval_result = trainer.evaluate(eval_dataset=val_dataset_fold)
+
+     # A composite score averaging accuracy, precision, recall, and f1 was also
+     # tried, but optimizing f1 alone yielded the best results in the end
+     return eval_result['eval_f1']
+
+ # Model initialization function
+ def model_init(dropout_rate):
+     model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+     # Set the dropout module DistilBERT applies before its classification head
+     # (assigning to model.classifier.dropout would attach an unused attribute
+     # to the Linear layer and leave the tuned rate with no effect)
+     model.dropout = torch.nn.Dropout(p=dropout_rate)
+     return model
+
+ # Run Optuna optimization
+ study = optuna.create_study(direction='maximize')
+ study.optimize(objective, n_trials=15)
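+
+ # Optional (illustrative): report the best trial before fitting the final
+ # model; best_value is the f1 returned by the winning trial.
+ print("Best trial f1:", study.best_value)
+ print("Best hyperparameters:", study.best_params)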
+
+ """# Final Model"""
+
+ # Retrieve the best parameters from the Optuna study
+ best_params = study.best_params
+
+ # Re-initialize the model with the tuned dropout rate so the best
+ # dropout_rate actually carries over to the final model
+ model = model_init(best_params['dropout_rate'])
+
+ # Define training arguments using the best parameters
+ training_args = TrainingArguments(
+     output_dir="predicting_misdirection",
+     learning_rate=best_params['learning_rate'],
+     per_device_train_batch_size=best_params['batch_size'],
+     gradient_accumulation_steps=2,
+     num_train_epochs=best_params['num_train_epochs'],
+     weight_decay=best_params['weight_decay'],
+     save_strategy="epoch",
+     evaluation_strategy="epoch",
+     logging_dir="logs",
+     logging_steps=10,
+     load_best_model_at_end=True,
+     metric_for_best_model="f1",
+     push_to_hub=False,
+ )
+
+ # Define a data collator
+ data_collator = DataCollatorWithPadding(tokenizer)
+
+ # Initialize the trainer with the specified arguments
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=test_dataset,
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+     compute_metrics=compute_metrics,
+ )
+
+ """# Training Final Model"""
+
+ # Training the final model with the tuned hyperparameters
+ trainer.train()
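+
+ # Optional (illustrative): persist the fine-tuned model and tokenizer so the
+ # classifier can be reloaded without retraining; the save path is an assumption.
+ trainer.save_model("predicting_misdirection/final")
+ tokenizer.save_pretrained("predicting_misdirection/final")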
+
+ """# Evaluating Final Model"""
+
+ # Getting evaluation results
+ eval_result = trainer.evaluate(eval_dataset=test_dataset)
+ for key, value in eval_result.items():
+     print(f"{key}: {value}")
+
+ # Getting the confusion matrix
+ predictions = trainer.predict(test_dataset)
+ predicted_labels = np.argmax(predictions.predictions, axis=1)
+
+ true_labels = [item['labels'].item() for item in test_dataset]
+
+ cm = confusion_matrix(true_labels, predicted_labels)
+
+ plt.figure(figsize=(10, 7))
+ sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+ plt.xlabel('Predicted Labels')
+ plt.ylabel('True Labels')
+ plt.title('Confusion Matrix')
+ plt.show()
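+
+ # A minimal inference sketch (assumes the training above has completed; the
+ # helper name and example prompt are hypothetical). The two logits are turned
+ # into class probabilities with a softmax.
+ def predict_violation(text):
+     model.eval()
+     inputs = tokenizer(text, truncation=True, max_length=256, return_tensors="pt").to(model.device)
+     with torch.no_grad():
+         logits = model(**inputs).logits
+     probs = torch.softmax(logits, dim=-1).squeeze()
+     return {"non-violation": probs[0].item(), "violation": probs[1].item()}
+
+ print(predict_violation("Example user prompt to classify."))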