andyqin18 commited on
Commit
9f7b182
1 Parent(s): c0f871b

Finished MS3

Browse files
milestone3/comp/sample_submission.csv DELETED
The diff for this file is too large to render. See raw diff
 
milestone3/comp/test_comment.csv DELETED
Binary file (60.4 MB)
 
milestone3/comp/test_labels.csv DELETED
The diff for this file is too large to render. See raw diff
 
milestone3/finetune_notebook.ipynb CHANGED
@@ -1,117 +1,1236 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "80baea1a",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "# 1 Prepate dataset\n",
11
- "# 2 Load pretrained Tokenizer, call it with dataset -> encoding\n",
12
- "# 3 Build PyTorch Dataset with encodings\n",
13
- "# 4 Load pretrained model\n",
14
- "# 5 a) Load Trainer and train it\n",
15
- "# b) or use native Pytorch training pipeline\n",
16
- "from pathlib import Path\n",
17
- "from sklearn.model_selection import train_test_split\n",
18
- "import torch\n",
19
- "from torch.utils.data import Dataset\n",
20
- "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification\n",
21
- "from transformers import Trainer, TrainingArguments\n",
22
- "\n",
23
- "model_name = \"distilbert-base-uncased\"\n",
24
- "\n",
25
- "def read_imdb_split(split_dir): # helper function to get text and label\n",
26
- " split_dir = Path(split_dir)\n",
27
- " texts = []\n",
28
- " labels = []\n",
29
- " for label_dir in [\"pos\", \"neg\"]:\n",
30
- " thres = 0\n",
31
- " for text_file in (split_dir/label_dir).iterdir():\n",
32
- " if thres < 100:\n",
33
- " f = open(text_file, encoding='utf8')\n",
34
- " texts.append(f.read())\n",
35
- " labels.append(0 if label_dir == \"neg\" else 1)\n",
36
- " thres += 1\n",
37
- "\n",
38
- " return texts, labels\n",
39
- "\n",
40
- "train_texts, train_labels = read_imdb_split(\"aclImdb/train\")\n",
41
- "test_texts, test_labels = read_imdb_split(\"aclImdb/test\")\n",
42
- "\n",
43
- "train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)\n",
44
- "\n",
45
- "\n",
46
- "class IMDBDataset(Dataset):\n",
47
- " def __init__(self, encodings, labels):\n",
48
- " self.encodings = encodings\n",
49
- " self.labels = labels\n",
50
- "\n",
51
- " def __getitem__(self, idx):\n",
52
- " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
53
- " item[\"labels\"] = torch.tensor(self.labels[idx])\n",
54
- " return item\n",
55
- " \n",
56
- " def __len__(self):\n",
57
- " return len(self.labels)\n",
58
- " \n",
59
- "tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)\n",
60
- "\n",
61
- "train_encodings = tokenizer(train_texts, truncation=True, padding=True)\n",
62
- "val_encodings = tokenizer(val_texts, truncation=True, padding=True)\n",
63
- "test_encodings = tokenizer(test_texts, truncation=True, padding=True)\n",
64
- "\n",
65
- "train_dataset = IMDBDataset(train_encodings, train_labels)\n",
66
- "val_dataset = IMDBDataset(val_encodings, val_labels)\n",
67
- "test_dataset = IMDBDataset(test_encodings, test_labels)\n",
68
- "\n",
69
- "training_args = TrainingArguments(\n",
70
- " output_dir='./results',\n",
71
- " num_train_epochs=2,\n",
72
- " per_device_train_batch_size=16,\n",
73
- " per_device_eval_batch_size=64,\n",
74
- " warmup_steps=500,\n",
75
- " learning_rate=5e-5,\n",
76
- " weight_decay=0.01,\n",
77
- " logging_dir='./logs',\n",
78
- " logging_steps=10\n",
79
- ")\n",
80
- "\n",
81
- "model = DistilBertForSequenceClassification.from_pretrained(model_name)\n",
82
- "trainer = Trainer(\n",
83
- " model=model,\n",
84
- " args=training_args,\n",
85
- " train_dataset=train_dataset,\n",
86
- " eval_dataset=val_dataset\n",
87
- ")\n",
88
- "\n",
89
- "trainer.train() \n",
90
- "\n",
91
- "\n",
92
- "\n"
93
- ]
94
- }
95
- ],
96
- "metadata": {
97
- "kernelspec": {
98
- "display_name": "Python 3 (ipykernel)",
99
- "language": "python",
100
- "name": "python3"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  },
102
- "language_info": {
103
- "codemirror_mode": {
104
- "name": "ipython",
105
- "version": 3
106
- },
107
- "file_extension": ".py",
108
- "mimetype": "text/x-python",
109
- "name": "python",
110
- "nbconvert_exporter": "python",
111
- "pygments_lexer": "ipython3",
112
- "version": "3.10.6"
113
- }
114
- },
115
- "nbformat": 4,
116
- "nbformat_minor": 5
117
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ },
15
+ "accelerator": "GPU",
16
+ "gpuClass": "standard",
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "5777416c505a42619da32a0cb9707d82": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "VBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "VBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "VBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_40b2c027600349f6b026ed63ce056a99",
35
+ "IPY_MODEL_c0b5b216c5de43d39a5653c6159727c1",
36
+ "IPY_MODEL_dc816285a79147888b8558524148d042"
37
+ ],
38
+ "layout": "IPY_MODEL_98572cb68e274cb99844efa8a661f668"
39
+ }
40
+ },
41
+ "9e6e8abf4d324a7b8535172edbd7c954": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_1ed2e107cb794dc7923089751ac41dc3",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_29e71af41da9418db9aaf3a08e3e7d21",
59
+ "value": "<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svg\nalt='Hugging Face'> <br> Copy a token from <a\nhref=\"https://huggingface.co/settings/tokens\" target=\"_blank\">your Hugging Face\ntokens page</a> and paste it below. <br> Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. </center>"
60
+ }
61
+ },
62
+ "1315624eb80a4517b79d814770cbe189": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "PasswordModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "PasswordModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "PasswordView",
75
+ "continuous_update": true,
76
+ "description": "Token:",
77
+ "description_tooltip": null,
78
+ "disabled": false,
79
+ "layout": "IPY_MODEL_1f7be06c638148d5b11ecee631598e26",
80
+ "placeholder": "​",
81
+ "style": "IPY_MODEL_ee9944452aee42d1bd04de0986249c08",
82
+ "value": ""
83
+ }
84
+ },
85
+ "7ec85b07ed1b4fccbab20a3b3183b173": {
86
+ "model_module": "@jupyter-widgets/controls",
87
+ "model_name": "CheckboxModel",
88
+ "model_module_version": "1.5.0",
89
+ "state": {
90
+ "_dom_classes": [],
91
+ "_model_module": "@jupyter-widgets/controls",
92
+ "_model_module_version": "1.5.0",
93
+ "_model_name": "CheckboxModel",
94
+ "_view_count": null,
95
+ "_view_module": "@jupyter-widgets/controls",
96
+ "_view_module_version": "1.5.0",
97
+ "_view_name": "CheckboxView",
98
+ "description": "Add token as git credential?",
99
+ "description_tooltip": null,
100
+ "disabled": false,
101
+ "indent": true,
102
+ "layout": "IPY_MODEL_95a9fb94616a4c07b1c2e167a876c3fc",
103
+ "style": "IPY_MODEL_435f1dfcc2264876a2ef31a634abc13a",
104
+ "value": false
105
+ }
106
+ },
107
+ "00aad7e6e5404b2f8f43182d660698ae": {
108
+ "model_module": "@jupyter-widgets/controls",
109
+ "model_name": "ButtonModel",
110
+ "model_module_version": "1.5.0",
111
+ "state": {
112
+ "_dom_classes": [],
113
+ "_model_module": "@jupyter-widgets/controls",
114
+ "_model_module_version": "1.5.0",
115
+ "_model_name": "ButtonModel",
116
+ "_view_count": null,
117
+ "_view_module": "@jupyter-widgets/controls",
118
+ "_view_module_version": "1.5.0",
119
+ "_view_name": "ButtonView",
120
+ "button_style": "",
121
+ "description": "Login",
122
+ "disabled": false,
123
+ "icon": "",
124
+ "layout": "IPY_MODEL_da38a43d5848415280acf6bcc8408d9f",
125
+ "style": "IPY_MODEL_c0fcb22e27f64aeda77cdb31ba1dbc21",
126
+ "tooltip": ""
127
+ }
128
+ },
129
+ "2182e718553d4959bfffc2ef5aa24d53": {
130
+ "model_module": "@jupyter-widgets/controls",
131
+ "model_name": "HTMLModel",
132
+ "model_module_version": "1.5.0",
133
+ "state": {
134
+ "_dom_classes": [],
135
+ "_model_module": "@jupyter-widgets/controls",
136
+ "_model_module_version": "1.5.0",
137
+ "_model_name": "HTMLModel",
138
+ "_view_count": null,
139
+ "_view_module": "@jupyter-widgets/controls",
140
+ "_view_module_version": "1.5.0",
141
+ "_view_name": "HTMLView",
142
+ "description": "",
143
+ "description_tooltip": null,
144
+ "layout": "IPY_MODEL_5f35bbd6fb9f4c7babb294a1cdb65cb6",
145
+ "placeholder": "​",
146
+ "style": "IPY_MODEL_d380510cfb9e40b09c98a16ad5c67f51",
147
+ "value": "\n<b>Pro Tip:</b> If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. </center>"
148
+ }
149
+ },
150
+ "98572cb68e274cb99844efa8a661f668": {
151
+ "model_module": "@jupyter-widgets/base",
152
+ "model_name": "LayoutModel",
153
+ "model_module_version": "1.2.0",
154
+ "state": {
155
+ "_model_module": "@jupyter-widgets/base",
156
+ "_model_module_version": "1.2.0",
157
+ "_model_name": "LayoutModel",
158
+ "_view_count": null,
159
+ "_view_module": "@jupyter-widgets/base",
160
+ "_view_module_version": "1.2.0",
161
+ "_view_name": "LayoutView",
162
+ "align_content": null,
163
+ "align_items": "center",
164
+ "align_self": null,
165
+ "border": null,
166
+ "bottom": null,
167
+ "display": "flex",
168
+ "flex": null,
169
+ "flex_flow": "column",
170
+ "grid_area": null,
171
+ "grid_auto_columns": null,
172
+ "grid_auto_flow": null,
173
+ "grid_auto_rows": null,
174
+ "grid_column": null,
175
+ "grid_gap": null,
176
+ "grid_row": null,
177
+ "grid_template_areas": null,
178
+ "grid_template_columns": null,
179
+ "grid_template_rows": null,
180
+ "height": null,
181
+ "justify_content": null,
182
+ "justify_items": null,
183
+ "left": null,
184
+ "margin": null,
185
+ "max_height": null,
186
+ "max_width": null,
187
+ "min_height": null,
188
+ "min_width": null,
189
+ "object_fit": null,
190
+ "object_position": null,
191
+ "order": null,
192
+ "overflow": null,
193
+ "overflow_x": null,
194
+ "overflow_y": null,
195
+ "padding": null,
196
+ "right": null,
197
+ "top": null,
198
+ "visibility": null,
199
+ "width": "50%"
200
+ }
201
+ },
202
+ "1ed2e107cb794dc7923089751ac41dc3": {
203
+ "model_module": "@jupyter-widgets/base",
204
+ "model_name": "LayoutModel",
205
+ "model_module_version": "1.2.0",
206
+ "state": {
207
+ "_model_module": "@jupyter-widgets/base",
208
+ "_model_module_version": "1.2.0",
209
+ "_model_name": "LayoutModel",
210
+ "_view_count": null,
211
+ "_view_module": "@jupyter-widgets/base",
212
+ "_view_module_version": "1.2.0",
213
+ "_view_name": "LayoutView",
214
+ "align_content": null,
215
+ "align_items": null,
216
+ "align_self": null,
217
+ "border": null,
218
+ "bottom": null,
219
+ "display": null,
220
+ "flex": null,
221
+ "flex_flow": null,
222
+ "grid_area": null,
223
+ "grid_auto_columns": null,
224
+ "grid_auto_flow": null,
225
+ "grid_auto_rows": null,
226
+ "grid_column": null,
227
+ "grid_gap": null,
228
+ "grid_row": null,
229
+ "grid_template_areas": null,
230
+ "grid_template_columns": null,
231
+ "grid_template_rows": null,
232
+ "height": null,
233
+ "justify_content": null,
234
+ "justify_items": null,
235
+ "left": null,
236
+ "margin": null,
237
+ "max_height": null,
238
+ "max_width": null,
239
+ "min_height": null,
240
+ "min_width": null,
241
+ "object_fit": null,
242
+ "object_position": null,
243
+ "order": null,
244
+ "overflow": null,
245
+ "overflow_x": null,
246
+ "overflow_y": null,
247
+ "padding": null,
248
+ "right": null,
249
+ "top": null,
250
+ "visibility": null,
251
+ "width": null
252
+ }
253
+ },
254
+ "29e71af41da9418db9aaf3a08e3e7d21": {
255
+ "model_module": "@jupyter-widgets/controls",
256
+ "model_name": "DescriptionStyleModel",
257
+ "model_module_version": "1.5.0",
258
+ "state": {
259
+ "_model_module": "@jupyter-widgets/controls",
260
+ "_model_module_version": "1.5.0",
261
+ "_model_name": "DescriptionStyleModel",
262
+ "_view_count": null,
263
+ "_view_module": "@jupyter-widgets/base",
264
+ "_view_module_version": "1.2.0",
265
+ "_view_name": "StyleView",
266
+ "description_width": ""
267
+ }
268
+ },
269
+ "1f7be06c638148d5b11ecee631598e26": {
270
+ "model_module": "@jupyter-widgets/base",
271
+ "model_name": "LayoutModel",
272
+ "model_module_version": "1.2.0",
273
+ "state": {
274
+ "_model_module": "@jupyter-widgets/base",
275
+ "_model_module_version": "1.2.0",
276
+ "_model_name": "LayoutModel",
277
+ "_view_count": null,
278
+ "_view_module": "@jupyter-widgets/base",
279
+ "_view_module_version": "1.2.0",
280
+ "_view_name": "LayoutView",
281
+ "align_content": null,
282
+ "align_items": null,
283
+ "align_self": null,
284
+ "border": null,
285
+ "bottom": null,
286
+ "display": null,
287
+ "flex": null,
288
+ "flex_flow": null,
289
+ "grid_area": null,
290
+ "grid_auto_columns": null,
291
+ "grid_auto_flow": null,
292
+ "grid_auto_rows": null,
293
+ "grid_column": null,
294
+ "grid_gap": null,
295
+ "grid_row": null,
296
+ "grid_template_areas": null,
297
+ "grid_template_columns": null,
298
+ "grid_template_rows": null,
299
+ "height": null,
300
+ "justify_content": null,
301
+ "justify_items": null,
302
+ "left": null,
303
+ "margin": null,
304
+ "max_height": null,
305
+ "max_width": null,
306
+ "min_height": null,
307
+ "min_width": null,
308
+ "object_fit": null,
309
+ "object_position": null,
310
+ "order": null,
311
+ "overflow": null,
312
+ "overflow_x": null,
313
+ "overflow_y": null,
314
+ "padding": null,
315
+ "right": null,
316
+ "top": null,
317
+ "visibility": null,
318
+ "width": null
319
+ }
320
+ },
321
+ "ee9944452aee42d1bd04de0986249c08": {
322
+ "model_module": "@jupyter-widgets/controls",
323
+ "model_name": "DescriptionStyleModel",
324
+ "model_module_version": "1.5.0",
325
+ "state": {
326
+ "_model_module": "@jupyter-widgets/controls",
327
+ "_model_module_version": "1.5.0",
328
+ "_model_name": "DescriptionStyleModel",
329
+ "_view_count": null,
330
+ "_view_module": "@jupyter-widgets/base",
331
+ "_view_module_version": "1.2.0",
332
+ "_view_name": "StyleView",
333
+ "description_width": ""
334
+ }
335
+ },
336
+ "95a9fb94616a4c07b1c2e167a876c3fc": {
337
+ "model_module": "@jupyter-widgets/base",
338
+ "model_name": "LayoutModel",
339
+ "model_module_version": "1.2.0",
340
+ "state": {
341
+ "_model_module": "@jupyter-widgets/base",
342
+ "_model_module_version": "1.2.0",
343
+ "_model_name": "LayoutModel",
344
+ "_view_count": null,
345
+ "_view_module": "@jupyter-widgets/base",
346
+ "_view_module_version": "1.2.0",
347
+ "_view_name": "LayoutView",
348
+ "align_content": null,
349
+ "align_items": null,
350
+ "align_self": null,
351
+ "border": null,
352
+ "bottom": null,
353
+ "display": null,
354
+ "flex": null,
355
+ "flex_flow": null,
356
+ "grid_area": null,
357
+ "grid_auto_columns": null,
358
+ "grid_auto_flow": null,
359
+ "grid_auto_rows": null,
360
+ "grid_column": null,
361
+ "grid_gap": null,
362
+ "grid_row": null,
363
+ "grid_template_areas": null,
364
+ "grid_template_columns": null,
365
+ "grid_template_rows": null,
366
+ "height": null,
367
+ "justify_content": null,
368
+ "justify_items": null,
369
+ "left": null,
370
+ "margin": null,
371
+ "max_height": null,
372
+ "max_width": null,
373
+ "min_height": null,
374
+ "min_width": null,
375
+ "object_fit": null,
376
+ "object_position": null,
377
+ "order": null,
378
+ "overflow": null,
379
+ "overflow_x": null,
380
+ "overflow_y": null,
381
+ "padding": null,
382
+ "right": null,
383
+ "top": null,
384
+ "visibility": null,
385
+ "width": null
386
+ }
387
+ },
388
+ "435f1dfcc2264876a2ef31a634abc13a": {
389
+ "model_module": "@jupyter-widgets/controls",
390
+ "model_name": "DescriptionStyleModel",
391
+ "model_module_version": "1.5.0",
392
+ "state": {
393
+ "_model_module": "@jupyter-widgets/controls",
394
+ "_model_module_version": "1.5.0",
395
+ "_model_name": "DescriptionStyleModel",
396
+ "_view_count": null,
397
+ "_view_module": "@jupyter-widgets/base",
398
+ "_view_module_version": "1.2.0",
399
+ "_view_name": "StyleView",
400
+ "description_width": ""
401
+ }
402
+ },
403
+ "da38a43d5848415280acf6bcc8408d9f": {
404
+ "model_module": "@jupyter-widgets/base",
405
+ "model_name": "LayoutModel",
406
+ "model_module_version": "1.2.0",
407
+ "state": {
408
+ "_model_module": "@jupyter-widgets/base",
409
+ "_model_module_version": "1.2.0",
410
+ "_model_name": "LayoutModel",
411
+ "_view_count": null,
412
+ "_view_module": "@jupyter-widgets/base",
413
+ "_view_module_version": "1.2.0",
414
+ "_view_name": "LayoutView",
415
+ "align_content": null,
416
+ "align_items": null,
417
+ "align_self": null,
418
+ "border": null,
419
+ "bottom": null,
420
+ "display": null,
421
+ "flex": null,
422
+ "flex_flow": null,
423
+ "grid_area": null,
424
+ "grid_auto_columns": null,
425
+ "grid_auto_flow": null,
426
+ "grid_auto_rows": null,
427
+ "grid_column": null,
428
+ "grid_gap": null,
429
+ "grid_row": null,
430
+ "grid_template_areas": null,
431
+ "grid_template_columns": null,
432
+ "grid_template_rows": null,
433
+ "height": null,
434
+ "justify_content": null,
435
+ "justify_items": null,
436
+ "left": null,
437
+ "margin": null,
438
+ "max_height": null,
439
+ "max_width": null,
440
+ "min_height": null,
441
+ "min_width": null,
442
+ "object_fit": null,
443
+ "object_position": null,
444
+ "order": null,
445
+ "overflow": null,
446
+ "overflow_x": null,
447
+ "overflow_y": null,
448
+ "padding": null,
449
+ "right": null,
450
+ "top": null,
451
+ "visibility": null,
452
+ "width": null
453
+ }
454
+ },
455
+ "c0fcb22e27f64aeda77cdb31ba1dbc21": {
456
+ "model_module": "@jupyter-widgets/controls",
457
+ "model_name": "ButtonStyleModel",
458
+ "model_module_version": "1.5.0",
459
+ "state": {
460
+ "_model_module": "@jupyter-widgets/controls",
461
+ "_model_module_version": "1.5.0",
462
+ "_model_name": "ButtonStyleModel",
463
+ "_view_count": null,
464
+ "_view_module": "@jupyter-widgets/base",
465
+ "_view_module_version": "1.2.0",
466
+ "_view_name": "StyleView",
467
+ "button_color": null,
468
+ "font_weight": ""
469
+ }
470
+ },
471
+ "5f35bbd6fb9f4c7babb294a1cdb65cb6": {
472
+ "model_module": "@jupyter-widgets/base",
473
+ "model_name": "LayoutModel",
474
+ "model_module_version": "1.2.0",
475
+ "state": {
476
+ "_model_module": "@jupyter-widgets/base",
477
+ "_model_module_version": "1.2.0",
478
+ "_model_name": "LayoutModel",
479
+ "_view_count": null,
480
+ "_view_module": "@jupyter-widgets/base",
481
+ "_view_module_version": "1.2.0",
482
+ "_view_name": "LayoutView",
483
+ "align_content": null,
484
+ "align_items": null,
485
+ "align_self": null,
486
+ "border": null,
487
+ "bottom": null,
488
+ "display": null,
489
+ "flex": null,
490
+ "flex_flow": null,
491
+ "grid_area": null,
492
+ "grid_auto_columns": null,
493
+ "grid_auto_flow": null,
494
+ "grid_auto_rows": null,
495
+ "grid_column": null,
496
+ "grid_gap": null,
497
+ "grid_row": null,
498
+ "grid_template_areas": null,
499
+ "grid_template_columns": null,
500
+ "grid_template_rows": null,
501
+ "height": null,
502
+ "justify_content": null,
503
+ "justify_items": null,
504
+ "left": null,
505
+ "margin": null,
506
+ "max_height": null,
507
+ "max_width": null,
508
+ "min_height": null,
509
+ "min_width": null,
510
+ "object_fit": null,
511
+ "object_position": null,
512
+ "order": null,
513
+ "overflow": null,
514
+ "overflow_x": null,
515
+ "overflow_y": null,
516
+ "padding": null,
517
+ "right": null,
518
+ "top": null,
519
+ "visibility": null,
520
+ "width": null
521
+ }
522
+ },
523
+ "d380510cfb9e40b09c98a16ad5c67f51": {
524
+ "model_module": "@jupyter-widgets/controls",
525
+ "model_name": "DescriptionStyleModel",
526
+ "model_module_version": "1.5.0",
527
+ "state": {
528
+ "_model_module": "@jupyter-widgets/controls",
529
+ "_model_module_version": "1.5.0",
530
+ "_model_name": "DescriptionStyleModel",
531
+ "_view_count": null,
532
+ "_view_module": "@jupyter-widgets/base",
533
+ "_view_module_version": "1.2.0",
534
+ "_view_name": "StyleView",
535
+ "description_width": ""
536
+ }
537
+ },
538
+ "09cae48b80da40b39f358d1310f397c3": {
539
+ "model_module": "@jupyter-widgets/controls",
540
+ "model_name": "LabelModel",
541
+ "model_module_version": "1.5.0",
542
+ "state": {
543
+ "_dom_classes": [],
544
+ "_model_module": "@jupyter-widgets/controls",
545
+ "_model_module_version": "1.5.0",
546
+ "_model_name": "LabelModel",
547
+ "_view_count": null,
548
+ "_view_module": "@jupyter-widgets/controls",
549
+ "_view_module_version": "1.5.0",
550
+ "_view_name": "LabelView",
551
+ "description": "",
552
+ "description_tooltip": null,
553
+ "layout": "IPY_MODEL_8cb9ff0be0904b459ffee0cdd91ecf53",
554
+ "placeholder": "​",
555
+ "style": "IPY_MODEL_6cae637442ac47be92792afefb5eca1b",
556
+ "value": "Connecting..."
557
+ }
558
+ },
559
+ "8cb9ff0be0904b459ffee0cdd91ecf53": {
560
+ "model_module": "@jupyter-widgets/base",
561
+ "model_name": "LayoutModel",
562
+ "model_module_version": "1.2.0",
563
+ "state": {
564
+ "_model_module": "@jupyter-widgets/base",
565
+ "_model_module_version": "1.2.0",
566
+ "_model_name": "LayoutModel",
567
+ "_view_count": null,
568
+ "_view_module": "@jupyter-widgets/base",
569
+ "_view_module_version": "1.2.0",
570
+ "_view_name": "LayoutView",
571
+ "align_content": null,
572
+ "align_items": null,
573
+ "align_self": null,
574
+ "border": null,
575
+ "bottom": null,
576
+ "display": null,
577
+ "flex": null,
578
+ "flex_flow": null,
579
+ "grid_area": null,
580
+ "grid_auto_columns": null,
581
+ "grid_auto_flow": null,
582
+ "grid_auto_rows": null,
583
+ "grid_column": null,
584
+ "grid_gap": null,
585
+ "grid_row": null,
586
+ "grid_template_areas": null,
587
+ "grid_template_columns": null,
588
+ "grid_template_rows": null,
589
+ "height": null,
590
+ "justify_content": null,
591
+ "justify_items": null,
592
+ "left": null,
593
+ "margin": null,
594
+ "max_height": null,
595
+ "max_width": null,
596
+ "min_height": null,
597
+ "min_width": null,
598
+ "object_fit": null,
599
+ "object_position": null,
600
+ "order": null,
601
+ "overflow": null,
602
+ "overflow_x": null,
603
+ "overflow_y": null,
604
+ "padding": null,
605
+ "right": null,
606
+ "top": null,
607
+ "visibility": null,
608
+ "width": null
609
+ }
610
+ },
611
+ "6cae637442ac47be92792afefb5eca1b": {
612
+ "model_module": "@jupyter-widgets/controls",
613
+ "model_name": "DescriptionStyleModel",
614
+ "model_module_version": "1.5.0",
615
+ "state": {
616
+ "_model_module": "@jupyter-widgets/controls",
617
+ "_model_module_version": "1.5.0",
618
+ "_model_name": "DescriptionStyleModel",
619
+ "_view_count": null,
620
+ "_view_module": "@jupyter-widgets/base",
621
+ "_view_module_version": "1.2.0",
622
+ "_view_name": "StyleView",
623
+ "description_width": ""
624
+ }
625
+ },
626
+ "40b2c027600349f6b026ed63ce056a99": {
627
+ "model_module": "@jupyter-widgets/controls",
628
+ "model_name": "LabelModel",
629
+ "model_module_version": "1.5.0",
630
+ "state": {
631
+ "_dom_classes": [],
632
+ "_model_module": "@jupyter-widgets/controls",
633
+ "_model_module_version": "1.5.0",
634
+ "_model_name": "LabelModel",
635
+ "_view_count": null,
636
+ "_view_module": "@jupyter-widgets/controls",
637
+ "_view_module_version": "1.5.0",
638
+ "_view_name": "LabelView",
639
+ "description": "",
640
+ "description_tooltip": null,
641
+ "layout": "IPY_MODEL_ef039b61681e42d59db86fea20dd3019",
642
+ "placeholder": "​",
643
+ "style": "IPY_MODEL_7b110bf457514812b02fa2c20c57eb8b",
644
+ "value": "Token is valid."
645
+ }
646
+ },
647
+ "c0b5b216c5de43d39a5653c6159727c1": {
648
+ "model_module": "@jupyter-widgets/controls",
649
+ "model_name": "LabelModel",
650
+ "model_module_version": "1.5.0",
651
+ "state": {
652
+ "_dom_classes": [],
653
+ "_model_module": "@jupyter-widgets/controls",
654
+ "_model_module_version": "1.5.0",
655
+ "_model_name": "LabelModel",
656
+ "_view_count": null,
657
+ "_view_module": "@jupyter-widgets/controls",
658
+ "_view_module_version": "1.5.0",
659
+ "_view_name": "LabelView",
660
+ "description": "",
661
+ "description_tooltip": null,
662
+ "layout": "IPY_MODEL_40b166a8206a42e8982a8331fe38a13c",
663
+ "placeholder": "​",
664
+ "style": "IPY_MODEL_aad77133528e44f7831786c99f185ed3",
665
+ "value": "Your token has been saved to /root/.cache/huggingface/token"
666
+ }
667
+ },
668
+ "dc816285a79147888b8558524148d042": {
669
+ "model_module": "@jupyter-widgets/controls",
670
+ "model_name": "LabelModel",
671
+ "model_module_version": "1.5.0",
672
+ "state": {
673
+ "_dom_classes": [],
674
+ "_model_module": "@jupyter-widgets/controls",
675
+ "_model_module_version": "1.5.0",
676
+ "_model_name": "LabelModel",
677
+ "_view_count": null,
678
+ "_view_module": "@jupyter-widgets/controls",
679
+ "_view_module_version": "1.5.0",
680
+ "_view_name": "LabelView",
681
+ "description": "",
682
+ "description_tooltip": null,
683
+ "layout": "IPY_MODEL_c2178ad5d0c6491a8bdf9aaa8e840f84",
684
+ "placeholder": "​",
685
+ "style": "IPY_MODEL_290c727552a84c2fba378fcd448aae4f",
686
+ "value": "Login successful"
687
+ }
688
+ },
689
+ "ef039b61681e42d59db86fea20dd3019": {
690
+ "model_module": "@jupyter-widgets/base",
691
+ "model_name": "LayoutModel",
692
+ "model_module_version": "1.2.0",
693
+ "state": {
694
+ "_model_module": "@jupyter-widgets/base",
695
+ "_model_module_version": "1.2.0",
696
+ "_model_name": "LayoutModel",
697
+ "_view_count": null,
698
+ "_view_module": "@jupyter-widgets/base",
699
+ "_view_module_version": "1.2.0",
700
+ "_view_name": "LayoutView",
701
+ "align_content": null,
702
+ "align_items": null,
703
+ "align_self": null,
704
+ "border": null,
705
+ "bottom": null,
706
+ "display": null,
707
+ "flex": null,
708
+ "flex_flow": null,
709
+ "grid_area": null,
710
+ "grid_auto_columns": null,
711
+ "grid_auto_flow": null,
712
+ "grid_auto_rows": null,
713
+ "grid_column": null,
714
+ "grid_gap": null,
715
+ "grid_row": null,
716
+ "grid_template_areas": null,
717
+ "grid_template_columns": null,
718
+ "grid_template_rows": null,
719
+ "height": null,
720
+ "justify_content": null,
721
+ "justify_items": null,
722
+ "left": null,
723
+ "margin": null,
724
+ "max_height": null,
725
+ "max_width": null,
726
+ "min_height": null,
727
+ "min_width": null,
728
+ "object_fit": null,
729
+ "object_position": null,
730
+ "order": null,
731
+ "overflow": null,
732
+ "overflow_x": null,
733
+ "overflow_y": null,
734
+ "padding": null,
735
+ "right": null,
736
+ "top": null,
737
+ "visibility": null,
738
+ "width": null
739
+ }
740
+ },
741
+ "7b110bf457514812b02fa2c20c57eb8b": {
742
+ "model_module": "@jupyter-widgets/controls",
743
+ "model_name": "DescriptionStyleModel",
744
+ "model_module_version": "1.5.0",
745
+ "state": {
746
+ "_model_module": "@jupyter-widgets/controls",
747
+ "_model_module_version": "1.5.0",
748
+ "_model_name": "DescriptionStyleModel",
749
+ "_view_count": null,
750
+ "_view_module": "@jupyter-widgets/base",
751
+ "_view_module_version": "1.2.0",
752
+ "_view_name": "StyleView",
753
+ "description_width": ""
754
+ }
755
+ },
756
+ "40b166a8206a42e8982a8331fe38a13c": {
757
+ "model_module": "@jupyter-widgets/base",
758
+ "model_name": "LayoutModel",
759
+ "model_module_version": "1.2.0",
760
+ "state": {
761
+ "_model_module": "@jupyter-widgets/base",
762
+ "_model_module_version": "1.2.0",
763
+ "_model_name": "LayoutModel",
764
+ "_view_count": null,
765
+ "_view_module": "@jupyter-widgets/base",
766
+ "_view_module_version": "1.2.0",
767
+ "_view_name": "LayoutView",
768
+ "align_content": null,
769
+ "align_items": null,
770
+ "align_self": null,
771
+ "border": null,
772
+ "bottom": null,
773
+ "display": null,
774
+ "flex": null,
775
+ "flex_flow": null,
776
+ "grid_area": null,
777
+ "grid_auto_columns": null,
778
+ "grid_auto_flow": null,
779
+ "grid_auto_rows": null,
780
+ "grid_column": null,
781
+ "grid_gap": null,
782
+ "grid_row": null,
783
+ "grid_template_areas": null,
784
+ "grid_template_columns": null,
785
+ "grid_template_rows": null,
786
+ "height": null,
787
+ "justify_content": null,
788
+ "justify_items": null,
789
+ "left": null,
790
+ "margin": null,
791
+ "max_height": null,
792
+ "max_width": null,
793
+ "min_height": null,
794
+ "min_width": null,
795
+ "object_fit": null,
796
+ "object_position": null,
797
+ "order": null,
798
+ "overflow": null,
799
+ "overflow_x": null,
800
+ "overflow_y": null,
801
+ "padding": null,
802
+ "right": null,
803
+ "top": null,
804
+ "visibility": null,
805
+ "width": null
806
+ }
807
+ },
808
+ "aad77133528e44f7831786c99f185ed3": {
809
+ "model_module": "@jupyter-widgets/controls",
810
+ "model_name": "DescriptionStyleModel",
811
+ "model_module_version": "1.5.0",
812
+ "state": {
813
+ "_model_module": "@jupyter-widgets/controls",
814
+ "_model_module_version": "1.5.0",
815
+ "_model_name": "DescriptionStyleModel",
816
+ "_view_count": null,
817
+ "_view_module": "@jupyter-widgets/base",
818
+ "_view_module_version": "1.2.0",
819
+ "_view_name": "StyleView",
820
+ "description_width": ""
821
+ }
822
+ },
823
+ "c2178ad5d0c6491a8bdf9aaa8e840f84": {
824
+ "model_module": "@jupyter-widgets/base",
825
+ "model_name": "LayoutModel",
826
+ "model_module_version": "1.2.0",
827
+ "state": {
828
+ "_model_module": "@jupyter-widgets/base",
829
+ "_model_module_version": "1.2.0",
830
+ "_model_name": "LayoutModel",
831
+ "_view_count": null,
832
+ "_view_module": "@jupyter-widgets/base",
833
+ "_view_module_version": "1.2.0",
834
+ "_view_name": "LayoutView",
835
+ "align_content": null,
836
+ "align_items": null,
837
+ "align_self": null,
838
+ "border": null,
839
+ "bottom": null,
840
+ "display": null,
841
+ "flex": null,
842
+ "flex_flow": null,
843
+ "grid_area": null,
844
+ "grid_auto_columns": null,
845
+ "grid_auto_flow": null,
846
+ "grid_auto_rows": null,
847
+ "grid_column": null,
848
+ "grid_gap": null,
849
+ "grid_row": null,
850
+ "grid_template_areas": null,
851
+ "grid_template_columns": null,
852
+ "grid_template_rows": null,
853
+ "height": null,
854
+ "justify_content": null,
855
+ "justify_items": null,
856
+ "left": null,
857
+ "margin": null,
858
+ "max_height": null,
859
+ "max_width": null,
860
+ "min_height": null,
861
+ "min_width": null,
862
+ "object_fit": null,
863
+ "object_position": null,
864
+ "order": null,
865
+ "overflow": null,
866
+ "overflow_x": null,
867
+ "overflow_y": null,
868
+ "padding": null,
869
+ "right": null,
870
+ "top": null,
871
+ "visibility": null,
872
+ "width": null
873
+ }
874
+ },
875
+ "290c727552a84c2fba378fcd448aae4f": {
876
+ "model_module": "@jupyter-widgets/controls",
877
+ "model_name": "DescriptionStyleModel",
878
+ "model_module_version": "1.5.0",
879
+ "state": {
880
+ "_model_module": "@jupyter-widgets/controls",
881
+ "_model_module_version": "1.5.0",
882
+ "_model_name": "DescriptionStyleModel",
883
+ "_view_count": null,
884
+ "_view_module": "@jupyter-widgets/base",
885
+ "_view_module_version": "1.2.0",
886
+ "_view_name": "StyleView",
887
+ "description_width": ""
888
+ }
889
+ }
890
+ }
891
+ }
892
  },
893
+ "cells": [
894
+ {
895
+ "cell_type": "code",
896
+ "source": [
897
+ "!pip install transformers"
898
+ ],
899
+ "metadata": {
900
+ "colab": {
901
+ "base_uri": "https://localhost:8080/"
902
+ },
903
+ "id": "wybe5jQM1NLf",
904
+ "outputId": "f3c5e205-adad-4a38-8405-e021fbe87c75"
905
+ },
906
+ "execution_count": null,
907
+ "outputs": [
908
+ {
909
+ "output_type": "stream",
910
+ "name": "stdout",
911
+ "text": [
912
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
913
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.28.1)\n",
914
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n",
915
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n",
916
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n",
917
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n",
918
+ "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n",
919
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n",
920
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n",
921
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
922
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n",
923
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
924
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (2023.4.0)\n",
925
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
926
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n",
927
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n",
928
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n"
929
+ ]
930
+ }
931
+ ]
932
+ },
933
+ {
934
+ "cell_type": "markdown",
935
+ "source": [
936
+ "---------------------------------------------------------"
937
+ ],
938
+ "metadata": {
939
+ "id": "AYvuPa35Wq9C"
940
+ }
941
+ },
942
+ {
943
+ "cell_type": "code",
944
+ "source": [
945
+ "import pandas as pd\n",
946
+ "import numpy as np\n",
947
+ "import torch\n",
948
+ "from sklearn.model_selection import train_test_split\n",
949
+ "from torch.utils.data import Dataset\n",
950
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
951
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n"
952
+ ],
953
+ "metadata": {
954
+ "id": "hQN-HmXXW6SA"
955
+ },
956
+ "execution_count": null,
957
+ "outputs": []
958
+ },
959
+ {
960
+ "cell_type": "code",
961
+ "source": [
962
+ "df = pd.read_csv(\"/content/drive/MyDrive/AI_project/data/train.csv\")\n",
963
+ "\n",
964
+ "# Texts come from the comment_text column; columns from index 2 onward are the label columns.\n",
965
+ "train_texts = df[\"comment_text\"].values\n",
966
+ "labels = df.columns[2:]\n",
967
+ "id2label = {idx: label for idx, label in enumerate(labels)}\n",
968
+ "label2id = {label: idx for idx, label in enumerate(labels)}\n",
969
+ "train_labels = df[labels].values\n",
970
+ "\n",
971
+ "# Draw ONE set of row indices and apply it to both arrays, so texts and labels stay\n",
972
+ "# aligned by construction instead of relying on two identically seeded RNG calls.\n",
973
+ "np.random.seed(18)\n",
974
+ "sample_size = min(30000, len(df))  # never request more rows than the file has\n",
975
+ "small_train_labels_idx = np.random.choice(len(df), size=sample_size, replace=False)\n",
976
+ "small_train_texts = train_texts[small_train_labels_idx]\n",
977
+ "small_train_labels = train_labels[small_train_labels_idx, :]\n",
978
+ "\n",
979
+ "# Hold out 20% of the sampled rows for validation.\n",
980
+ "train_texts, val_texts, train_labels, val_labels = train_test_split(small_train_texts, small_train_labels, test_size=.2)\n",
981
+ "\n",
982
+ "# To train on the full dataset instead of the sample:\n",
983
+ "# train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)"
984
+ ],
985
+ "metadata": {
986
+ "id": "WtsAFyrzWuCr"
987
+ },
988
+ "execution_count": null,
989
+ "outputs": []
990
+ },
991
+ {
992
+ "cell_type": "code",
993
+ "source": [
994
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
995
+ "#Set up the dataset\n",
996
+ "# train_encodings = tokenizer(train_texts, truncation=True, padding=True)\n",
997
+ "# val_encodings = tokenizer(val_texts, truncation=True, padding=True)"
998
+ ],
999
+ "metadata": {
1000
+ "id": "pPgvgOaYXb2f"
1001
+ },
1002
+ "execution_count": null,
1003
+ "outputs": []
1004
+ },
1005
+ {
1006
+ "cell_type": "code",
1007
+ "source": [
1008
+ "class TextDataset(Dataset):\n",
1009
+ "    \"\"\"Pair raw texts with label rows, tokenizing each example when it is fetched.\"\"\"\n",
1010
+ "\n",
1011
+ "    def __init__(self, texts, labels):\n",
1012
+ "        self.texts, self.labels = texts, labels\n",
1013
+ "\n",
1014
+ "    def __len__(self):\n",
1015
+ "        return len(self.labels)\n",
1016
+ "\n",
1017
+ "    def __getitem__(self, idx):\n",
1018
+ "        # Relies on the notebook-level `tokenizer`; labels are returned as float32 tensors.\n",
1019
+ "        encoded = tokenizer(self.texts[idx], truncation=True, padding=\"max_length\")\n",
1020
+ "        sample = {name: torch.tensor(values) for name, values in encoded.items()}\n",
1021
+ "        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)\n",
1022
+ "        return sample\n"
1023
+ ],
1024
+ "metadata": {
1025
+ "id": "aysAKCYoXBoz"
1026
+ },
1027
+ "execution_count": null,
1028
+ "outputs": []
1029
+ },
1030
+ {
1031
+ "cell_type": "code",
1032
+ "source": [
1033
+ "from huggingface_hub import notebook_login\n",
1034
+ "\n",
1035
+ "notebook_login()"
1036
+ ],
1037
+ "metadata": {
1038
+ "colab": {
1039
+ "base_uri": "https://localhost:8080/",
1040
+ "height": 113,
1041
+ "referenced_widgets": [
1042
+ "5777416c505a42619da32a0cb9707d82",
1043
+ "9e6e8abf4d324a7b8535172edbd7c954",
1044
+ "1315624eb80a4517b79d814770cbe189",
1045
+ "7ec85b07ed1b4fccbab20a3b3183b173",
1046
+ "00aad7e6e5404b2f8f43182d660698ae",
1047
+ "2182e718553d4959bfffc2ef5aa24d53",
1048
+ "98572cb68e274cb99844efa8a661f668",
1049
+ "1ed2e107cb794dc7923089751ac41dc3",
1050
+ "29e71af41da9418db9aaf3a08e3e7d21",
1051
+ "1f7be06c638148d5b11ecee631598e26",
1052
+ "ee9944452aee42d1bd04de0986249c08",
1053
+ "95a9fb94616a4c07b1c2e167a876c3fc",
1054
+ "435f1dfcc2264876a2ef31a634abc13a",
1055
+ "da38a43d5848415280acf6bcc8408d9f",
1056
+ "c0fcb22e27f64aeda77cdb31ba1dbc21",
1057
+ "5f35bbd6fb9f4c7babb294a1cdb65cb6",
1058
+ "d380510cfb9e40b09c98a16ad5c67f51",
1059
+ "09cae48b80da40b39f358d1310f397c3",
1060
+ "8cb9ff0be0904b459ffee0cdd91ecf53",
1061
+ "6cae637442ac47be92792afefb5eca1b",
1062
+ "40b2c027600349f6b026ed63ce056a99",
1063
+ "c0b5b216c5de43d39a5653c6159727c1",
1064
+ "dc816285a79147888b8558524148d042",
1065
+ "ef039b61681e42d59db86fea20dd3019",
1066
+ "7b110bf457514812b02fa2c20c57eb8b",
1067
+ "40b166a8206a42e8982a8331fe38a13c",
1068
+ "aad77133528e44f7831786c99f185ed3",
1069
+ "c2178ad5d0c6491a8bdf9aaa8e840f84",
1070
+ "290c727552a84c2fba378fcd448aae4f"
1071
+ ]
1072
+ },
1073
+ "id": "BcZnYYII3Nxo",
1074
+ "outputId": "16a4dc55-757f-4133-abb5-6e1f482c7e16"
1075
+ },
1076
+ "execution_count": null,
1077
+ "outputs": [
1078
+ {
1079
+ "output_type": "display_data",
1080
+ "data": {
1081
+ "text/plain": [
1082
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
1083
+ ],
1084
+ "application/vnd.jupyter.widget-view+json": {
1085
+ "version_major": 2,
1086
+ "version_minor": 0,
1087
+ "model_id": "5777416c505a42619da32a0cb9707d82"
1088
+ }
1089
+ },
1090
+ "metadata": {}
1091
+ }
1092
+ ]
1093
+ },
1094
+ {
1095
+ "cell_type": "code",
1096
+ "source": [
1097
+ "# Wrap both splits; TextDataset tokenizes each example when it is indexed.\n",
1098
+ "train_dataset = TextDataset(train_texts, train_labels)\n",
1099
+ "val_dataset = TextDataset(val_texts, val_labels)\n",
1100
+ "\n",
1101
+ "# Run configuration: evaluate and save once per epoch, five epochs, batches of 16.\n",
1102
+ "training_args = TrainingArguments(\n",
1103
+ "    output_dir=\"finetuned-bert-uncased\",\n",
1104
+ "    evaluation_strategy=\"epoch\",\n",
1105
+ "    save_strategy=\"epoch\",\n",
1106
+ "    learning_rate=2e-5,\n",
1107
+ "    per_device_train_batch_size=16,\n",
1108
+ "    per_device_eval_batch_size=16,\n",
1109
+ "    num_train_epochs=5,\n",
1110
+ "    load_best_model_at_end=True,\n",
1111
+ "    push_to_hub=True,\n",
1112
+ ")\n",
1113
+ "\n",
1114
+ "# One output per label column, configured for multi-label classification.\n",
1115
+ "model = AutoModelForSequenceClassification.from_pretrained(\n",
1116
+ "    \"bert-base-uncased\",\n",
1117
+ "    problem_type=\"multi_label_classification\",\n",
1118
+ "    num_labels=len(labels),\n",
1119
+ "    id2label=id2label,\n",
1120
+ "    label2id=label2id,\n",
1121
+ ")\n",
1122
+ "model.to(device)\n",
1123
+ "\n",
1124
+ "# Assemble the Trainer from the model, arguments, datasets and tokenizer, then train.\n",
1125
+ "trainer = Trainer(\n",
1126
+ "    model=model,\n",
1127
+ "    args=training_args,\n",
1128
+ "    train_dataset=train_dataset,\n",
1129
+ "    eval_dataset=val_dataset,\n",
1130
+ "    tokenizer=tokenizer,\n",
1131
+ ")\n",
1132
+ "\n",
1133
+ "trainer.train()"
1134
+ ],
1135
+ "metadata": {
1136
+ "colab": {
1137
+ "base_uri": "https://localhost:8080/",
1138
+ "height": 320
1139
+ },
1140
+ "id": "BDptWdAAYs29",
1141
+ "outputId": "c885d19a-5fb9-4fec-9468-550928037ba3"
1142
+ },
1143
+ "execution_count": null,
1144
+ "outputs": [
1145
+ {
1146
+ "output_type": "stream",
1147
+ "name": "stderr",
1148
+ "text": [
1149
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
1150
+ "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
1151
+ "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
1152
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
1153
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
1154
+ "/content/finetuned-bert-uncased is already a clone of https://huggingface.co/andyqin18/finetuned-bert-uncased. Make sure you pull the latest changes with `repo.git_pull()`.\n",
1155
+ "WARNING:huggingface_hub.repository:/content/finetuned-bert-uncased is already a clone of https://huggingface.co/andyqin18/finetuned-bert-uncased. Make sure you pull the latest changes with `repo.git_pull()`.\n",
1156
+ "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
1157
+ " warnings.warn(\n",
1158
+ "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
1159
+ ]
1160
+ },
1161
+ {
1162
+ "output_type": "display_data",
1163
+ "data": {
1164
+ "text/plain": [
1165
+ "<IPython.core.display.HTML object>"
1166
+ ],
1167
+ "text/html": [
1168
+ "\n",
1169
+ " <div>\n",
1170
+ " \n",
1171
+ " <progress value='3001' max='7500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1172
+ " [3001/7500 1:16:16 < 1:54:24, 0.66 it/s, Epoch 2/5]\n",
1173
+ " </div>\n",
1174
+ " <table border=\"1\" class=\"dataframe\">\n",
1175
+ " <thead>\n",
1176
+ " <tr style=\"text-align: left;\">\n",
1177
+ " <th>Epoch</th>\n",
1178
+ " <th>Training Loss</th>\n",
1179
+ " <th>Validation Loss</th>\n",
1180
+ " </tr>\n",
1181
+ " </thead>\n",
1182
+ " <tbody>\n",
1183
+ " <tr>\n",
1184
+ " <td>1</td>\n",
1185
+ " <td>0.048900</td>\n",
1186
+ " <td>0.054034</td>\n",
1187
+ " </tr>\n",
1188
+ " </tbody>\n",
1189
+ "</table><p>\n",
1190
+ " <div>\n",
1191
+ " \n",
1192
+ " <progress value='273' max='375' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1193
+ " [273/375 02:25 < 00:54, 1.87 it/s]\n",
1194
+ " </div>\n",
1195
+ " "
1196
+ ]
1197
+ },
1198
+ "metadata": {}
1199
+ }
1200
+ ]
1201
+ },
1202
+ {
1203
+ "cell_type": "code",
1204
+ "source": [
1205
+ "# print(device)"
1206
+ ],
1207
+ "metadata": {
1208
+ "id": "GH702kPdbbjs"
1209
+ },
1210
+ "execution_count": null,
1211
+ "outputs": []
1212
+ },
1213
+ {
1214
+ "cell_type": "code",
1215
+ "source": [
1216
+ "# trainer.push_to_hub()"
1217
+ ],
1218
+ "metadata": {
1219
+ "id": "T-VyJbD_gMkx"
1220
+ },
1221
+ "execution_count": null,
1222
+ "outputs": []
1223
+ },
1224
+ {
1225
+ "cell_type": "code",
1226
+ "source": [
1227
+ "# tokenizer.push_to_hub(\"andyqin18/test-finetuned\")"
1228
+ ],
1229
+ "metadata": {
1230
+ "id": "iIHPfQZfhQpN"
1231
+ },
1232
+ "execution_count": null,
1233
+ "outputs": []
1234
+ }
1235
+ ]
1236
+ }