Akhil0-o committed
Commit
0971934
1 Parent(s): 0a1f7de

Upload RoBERTa_model.ipynb

Files changed (1)
  1. RoBERTa_model.ipynb +235 -0
RoBERTa_model.ipynb ADDED
@@ -0,0 +1,235 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the required libraries\n",
+ "import torch\n",
+ "from torch.utils.data import Dataset\n",
+ "import pandas as pd\n",
+ "from sklearn.metrics import classification_report\n",
+ "from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments\n",
+ "from transformers import TrainerCallback\n",
+ "import os\n",
+ "\n",
+ "# Create a directory to save the model\n",
+ "os.makedirs(\"./best_model\", exist_ok=True)\n",
+ "\n",
+ "# Callback that saves the model whenever the evaluation F1 score improves\n",
+ "class SaveBestModelCallback(TrainerCallback):\n",
+ "    # Track the best F1 score seen so far\n",
+ "    def __init__(self):\n",
+ "        self.best_f1_score = 0\n",
+ "    # Check the metrics after each evaluation\n",
+ "    def on_evaluate(self, args, state, control, metrics, **kwargs):\n",
+ "        # Use the metrics passed in; calling trainer.evaluate() here would re-trigger this callback\n",
+ "        f1_score = metrics[\"eval_f1\"]\n",
+ "        # Save the model if the current F1 score is higher than the best F1 score so far\n",
+ "        if f1_score > self.best_f1_score:\n",
+ "            self.best_f1_score = f1_score\n",
+ "            model.save_pretrained(\"./best_model\")\n",
+ "            tokenizer.save_pretrained(\"./best_model\")\n",
+ "            # Report the new best F1 score\n",
+ "            print(f\"New best model saved with F1 score: {f1_score}\")\n",
+ "\n",
+ "# Load and preprocess the data\n",
+ "train_data = pd.read_csv(\"train_links.csv\", encoding='utf-8', encoding_errors='ignore')\n",
+ "test_data = pd.read_csv(\"test_links.csv\", encoding='utf-8', encoding_errors='ignore')\n",
+ "\n",
+ "# Keep only the first 16171 test rows\n",
+ "test_data = test_data[:16171]\n",
+ "\n",
+ "# Keep only the email text and the label, and make the labels integers\n",
+ "train_data = train_data[['email', 'label']]\n",
+ "test_data = test_data[['email', 'label']]\n",
+ "\n",
+ "#print(len(train_data))\n",
+ "#print(train_data[train_data['label'].isnull()])\n",
+ "\n",
+ "train_data['label'] = train_data['label'].astype(int)\n",
+ "test_data['label'] = test_data['label'].astype(int)\n",
+ "\n",
+ "# Coerce every email to a string and every label to an int\n",
+ "train_email_list = [str(x) for x in train_data[\"email\"].tolist()]\n",
+ "train_label_list = [int(x) for x in train_data[\"label\"].tolist()]\n",
+ "test_email_list = [str(x) for x in test_data[\"email\"].tolist()]\n",
+ "test_label_list = [int(x) for x in test_data[\"label\"].tolist()]\n",
+ "\n",
+ "# Sanity check: count any labels that are still not integers\n",
+ "count = 0\n",
+ "for i in train_label_list + test_label_list:\n",
+ "    if not isinstance(i, int):\n",
+ "        count += 1\n",
+ "#print(count)\n",
+ "\n",
+ "# Load the RoBERTa tokenizer\n",
+ "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n",
+ "\n",
+ "# Tokenize the emails and build label tensors\n",
+ "def preprocess(df):\n",
+ "    inputs = tokenizer(df[\"email\"].tolist(), return_tensors=\"pt\", padding=True, truncation=True, max_length=512)\n",
+ "    labels = torch.tensor(df[\"label\"].tolist())\n",
+ "    return inputs, labels\n",
+ "\n",
+ "train_inputs, train_labels = preprocess(train_data)\n",
+ "test_inputs, test_labels = preprocess(test_data)\n",
+ "\n",
+ "# Custom dataset class\n",
+ "class CustomDataset(Dataset):\n",
+ "    def __init__(self, inputs, labels):\n",
+ "        self.inputs = inputs\n",
+ "        self.labels = labels\n",
+ "\n",
+ "    def __len__(self):\n",
+ "        return len(self.labels)\n",
+ "\n",
+ "    def __getitem__(self, idx):\n",
+ "        item = {key: val[idx] for key, val in self.inputs.items()}\n",
+ "        item[\"labels\"] = self.labels[idx]\n",
+ "        return item\n",
+ "\n",
+ "# Prepare the RoBERTa model for training\n",
+ "model = RobertaForSequenceClassification.from_pretrained(\"roberta-base\", num_labels=2)\n",
+ "\n",
+ "# Define the Trainer and TrainingArguments\n",
+ "training_args = TrainingArguments(\n",
+ "    output_dir=\"./results\",\n",
+ "    num_train_epochs=1,\n",
+ "    per_device_train_batch_size=8,\n",
+ "    per_device_eval_batch_size=16,\n",
+ "    logging_dir=\"./logs\",\n",
+ "    logging_steps=100,\n",
+ "    save_steps=1000,\n",
+ "    evaluation_strategy=\"epoch\",\n",
+ "    learning_rate=2e-5,\n",
+ "    weight_decay=0.01,\n",
+ ")\n",
+ "\n",
+ "# Compute the weighted-average F1 score from the predictions\n",
+ "def compute_metrics(pred):\n",
+ "    labels = pred.label_ids\n",
+ "    preds = pred.predictions.argmax(-1)\n",
+ "    metrics = classification_report(labels, preds, output_dict=True)[\"weighted avg\"]\n",
+ "    return {\"f1\": metrics[\"f1-score\"]}\n",
+ "\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Trainer(\n",
+ "    model=model,\n",
+ "    args=training_args,\n",
+ "    train_dataset=CustomDataset(train_inputs, train_labels),\n",
+ "    eval_dataset=CustomDataset(test_inputs, test_labels),\n",
+ "    compute_metrics=compute_metrics,\n",
+ ")\n",
+ "\n",
+ "#trainer.add_callback(SaveBestModelCallback())\n",
+ "trainer.train()\n",
+ "\n",
+ "# Evaluate the model\n",
+ "eval_results = trainer.evaluate()\n",
+ "\n",
+ "# Print the results\n",
+ "print(\"Evaluation results:\", eval_results)\n",
+ "\n",
+ "\n",
+ "# Save the final model and tokenizer (save_pretrained writes a directory of weights and config, not an .h5 file)\n",
+ "model.save_pretrained('./best_model')\n",
+ "tokenizer.save_pretrained(\"./best_model\")\n",
+ "\n",
+ "\"\"\"\n",
+ "best_model = RobertaForSequenceClassification.from_pretrained(\"./best_model\")\n",
+ "best_tokenizer = RobertaTokenizer.from_pretrained(\"./best_model\")\n",
+ "To use the saved model in a Google Chrome extension, you would need a server-side solution or a cloud-based API to connect the extension to the trained model; a minimal sketch follows in the next cell.\n",
+ "\"\"\""
+ ]
+ },
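+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The note above mentions that a Chrome extension would need a server-side solution or cloud API to reach the trained model. Below is a minimal sketch of one way to do that, assuming Flask as the web framework; the `/predict` route and the `text` JSON field are illustrative names, not part of the original project.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical serving sketch: expose the saved model over HTTP so a browser\n",
+ "# extension can POST text to it. Flask, the /predict route, and the 'text'\n",
+ "# field are assumptions for illustration, not part of the original notebook.\n",
+ "from flask import Flask, request, jsonify\n",
+ "import torch\n",
+ "from transformers import RobertaTokenizer, RobertaForSequenceClassification\n",
+ "\n",
+ "app = Flask(__name__)\n",
+ "served_model = RobertaForSequenceClassification.from_pretrained(\"./best_model\")\n",
+ "served_tokenizer = RobertaTokenizer.from_pretrained(\"./best_model\")\n",
+ "served_model.eval()\n",
+ "\n",
+ "@app.route(\"/predict\", methods=[\"POST\"])\n",
+ "def predict():\n",
+ "    # Expect a JSON body like {\"text\": \"...\"}\n",
+ "    text = request.get_json()[\"text\"]\n",
+ "    inputs = served_tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512)\n",
+ "    with torch.no_grad():\n",
+ "        logits = served_model(**inputs).logits\n",
+ "    # Return the predicted class index (0 or 1)\n",
+ "    return jsonify({\"label\": int(logits.argmax(dim=-1))})\n",
+ "\n",
+ "# Run with: app.run(port=5000); the extension would then POST to\n",
+ "# http://localhost:5000/predict\n"
+ ]
+ },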
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reload the saved model and tokenizer\n",
+ "model = RobertaForSequenceClassification.from_pretrained(\"./best_model\")\n",
+ "tokenizer = RobertaTokenizer.from_pretrained(\"./best_model\")"
+ ]
+ },
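+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optionally, the reloaded model can be sanity-checked against the full test set. This is a minimal sketch that reuses `test_email_list` and `test_label_list` from the first cell; the batching loop and the batch size of 32 are illustrative choices, not from the original notebook.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical check: score the reloaded model on the held-out test emails.\n",
+ "# Reuses test_email_list and test_label_list from the first cell; the\n",
+ "# batch size of 32 is an arbitrary illustrative choice.\n",
+ "import torch\n",
+ "from sklearn.metrics import classification_report\n",
+ "\n",
+ "model.eval()\n",
+ "all_preds = []\n",
+ "for start in range(0, len(test_email_list), 32):\n",
+ "    batch = test_email_list[start:start + 32]\n",
+ "    enc = tokenizer(batch, return_tensors=\"pt\", padding=True, truncation=True, max_length=512)\n",
+ "    with torch.no_grad():\n",
+ "        logits = model(**enc).logits\n",
+ "    all_preds.extend(logits.argmax(dim=-1).tolist())\n",
+ "\n",
+ "# Overall precision/recall/F1 on the test set\n",
+ "print(classification_report(test_label_list, all_preds))"
+ ]
+ },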
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run the model on a single sample link and print the predicted class\n",
+ "inputs = tokenizer(\"www.tiem.utk.edu/~gross/bioed/bealsmodules/spider.html\", return_tensors=\"pt\")\n",
+ "outputs = model(**inputs)\n",
+ "predictions = torch.argmax(outputs.logits, dim=-1)\n",
+ "\n",
+ "print(predictions)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }