jbraha committed on
Commit
f2a478c
1 Parent(s): 2c55221

st changes

.github/workflows/main.yml CHANGED
@@ -1,7 +1,7 @@
 name: Sync to Hugging Face hub
 on:
   push:
-    branches: [milestone-2]
+    branches: [milestone-3]
 
   # to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -21,6 +21,6 @@ jobs:
         git config user.name "$GITHUB_ACTOR" &&
         git config user.email "<>"
         && git switch main
-        && git merge origin/milestone-2
+        && git merge origin/milestone-3
         && git push
         && git push https://jbraha:$HF_TOKEN@huggingface.co/spaces/jbraha/aiproject
.ipynb_checkpoints/Copy of training-checkpoint.ipynb ADDED
@@ -0,0 +1,334 @@
(content identical to "Copy of training.ipynb", shown in full below)
Copy of training.ipynb ADDED
@@ -0,0 +1,334 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": 1,
+"id": "215a1aae",
+"metadata": {
+"executionInfo": {
+"elapsed": 128,
+"status": "ok",
+"timestamp": 1682285319377,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "215a1aae"
+},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"2023-04-23 18:07:24.557548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+"2023-04-23 18:07:25.431969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+]
+}
+],
+"source": [
+"import torch\n",
+"from torch.utils.data import Dataset, DataLoader\n",
+"\n",
+"import pandas as pd\n",
+"\n",
+"from transformers import BertTokenizerFast, BertForSequenceClassification\n",
+"from transformers import Trainer, TrainingArguments"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"id": "J5Tlgp4tNd0U",
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"executionInfo": {
+"elapsed": 1897,
+"status": "ok",
+"timestamp": 1682285321454,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "J5Tlgp4tNd0U",
+"outputId": "3c9f0c5b-7bc3-4c15-c5ff-0a77d3b3b607"
+},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
+"- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+"- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
+"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+]
+}
+],
+"source": [
+"model_name = \"bert-base-uncased\"\n",
+"tokenizer = BertTokenizerFast.from_pretrained(model_name)\n",
+"model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)\n",
+"max_len = 200\n",
+"\n",
+"training_args = TrainingArguments(\n",
+"    output_dir=\"results\",\n",
+"    num_train_epochs=1,\n",
+"    per_device_train_batch_size=16,\n",
+"    per_device_eval_batch_size=64,\n",
+"    warmup_steps=500,\n",
+"    learning_rate=5e-5,\n",
+"    weight_decay=0.01,\n",
+"    logging_dir=\"./logs\",\n",
+"    logging_steps=10\n",
+")\n",
+"\n",
+"# dataset class that inherits from torch.utils.data.Dataset\n",
+"class TweetDataset(Dataset):\n",
+"    def __init__(self, encodings, labels):\n",
+"        self.encodings = encodings\n",
+"        self.labels = labels\n",
+"        self.tok = tokenizer\n",
+"\n",
+"    def __getitem__(self, idx):\n",
+"        # encoding = self.tok(self.encodings[idx], truncation=True, padding=\"max_length\", max_length=max_len)\n",
+"        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
+"        item['labels'] = torch.tensor(self.labels[idx])\n",
+"        return item\n",
+"\n",
+"    def __len__(self):\n",
+"        return len(self.labels)\n",
+"\n",
+"class TokenizerDataset(Dataset):\n",
+"    def __init__(self, strings):\n",
+"        self.strings = strings\n",
+"\n",
+"    def __getitem__(self, idx):\n",
+"        return self.strings[idx]\n",
+"\n",
+"    def __len__(self):\n",
+"        return len(self.strings)\n",
+""
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"id": "9969c58c",
+"metadata": {
+"executionInfo": {
+"elapsed": 5145,
+"status": "ok",
+"timestamp": 1682285326593,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "9969c58c",
+"scrolled": false
+},
+"outputs": [],
+"source": [
+"train_data = pd.read_csv(\"data/train.csv\")\n",
+"train_text = train_data[\"comment_text\"]\n",
+"train_labels = train_data[[\"toxic\", \"severe_toxic\",\n",
+"                           \"obscene\", \"threat\",\n",
+"                           \"insult\", \"identity_hate\"]]\n",
+"\n",
+"test_text = pd.read_csv(\"data/test.csv\")[\"comment_text\"]\n",
+"test_labels = pd.read_csv(\"data/test_labels.csv\")[[\n",
+"    \"toxic\", \"severe_toxic\",\n",
+"    \"obscene\", \"threat\",\n",
+"    \"insult\", \"identity_hate\"]]\n",
+"\n",
+"# data preprocessing\n",
+"\n",
+"\n",
+"\n",
+"train_text = train_text.values.tolist()\n",
+"train_labels = train_labels.values.tolist()\n",
+"test_text = test_text.values.tolist()\n",
+"test_labels = test_labels.values.tolist()\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "1n56TME9Njde",
+"metadata": {
+"executionInfo": {
+"elapsed": 12,
+"status": "ok",
+"timestamp": 1682285326594,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "1n56TME9Njde"
+},
+"outputs": [],
+"source": [
+"# prepare tokenizer and dataset\n",
+"\n",
+"train_strings = TokenizerDataset(train_text)\n",
+"test_strings = TokenizerDataset(test_text)\n",
+"\n",
+"train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)\n",
+"test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)\n",
+"\n",
+"\n",
+"\n",
+"\n",
+"# train_encodings = tokenizer.batch_encode_plus(train_text, \\\n",
+"#     max_length=200, pad_to_max_length=True, \\\n",
+"#     truncation=True, return_token_type_ids=False \\\n",
+"#     )\n",
+"# test_encodings = tokenizer.batch_encode_plus(test_text, \\\n",
+"#     max_length=200, pad_to_max_length=True, \\\n",
+"#     truncation=True, return_token_type_ids=False \\\n",
+"#     )\n",
+"\n",
+"\n",
+"train_encodings = tokenizer(train_text, truncation=True, padding=True)\n",
+"test_encodings = tokenizer(test_text, truncation=True, padding=True)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "a5c7a657",
+"metadata": {},
+"outputs": [],
+"source": [
+"f = open(\"traintokens.txt\", 'a')\n",
+"f.write(str(train_encodings))\n",
+"f.write('\\n\\n\\n\\n\\n')\n",
+"f.close()\n",
+"\n",
+"g = open(\"testtokens.txt\", 'a')\n",
+"g.write(str(test_encodings))\n",
+"g.write('\\n\\n\\n\\n\\n')\n",
+"\n",
+"g.close()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "4kwydz67qjW9",
+"metadata": {
+"executionInfo": {
+"elapsed": 10,
+"status": "ok",
+"timestamp": 1682285326595,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "4kwydz67qjW9"
+},
+"outputs": [],
+"source": [
+"train_dataset = TweetDataset(train_encodings, train_labels)\n",
+"test_dataset = TweetDataset(test_encodings, test_labels)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "krZKjDVwNnWI",
+"metadata": {
+"executionInfo": {
+"elapsed": 10,
+"status": "ok",
+"timestamp": 1682285326596,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "krZKjDVwNnWI"
+},
+"outputs": [],
+"source": [
+"# training\n",
+"trainer = Trainer(\n",
+"    model=model,\n",
+"    args=training_args,\n",
+"    train_dataset=train_dataset,\n",
+"    eval_dataset=test_dataset\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "VwsyMZg_tgTg",
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/",
+"height": 416
+},
+"executionInfo": {
+"elapsed": 27193,
+"status": "error",
+"timestamp": 1682285353779,
+"user": {
+"displayName": "",
+"userId": ""
+},
+"user_tz": 240
+},
+"id": "VwsyMZg_tgTg",
+"outputId": "49c3f5c8-0342-45c5-8d0f-5cd5d2d1f9e9"
+},
+"outputs": [],
+"source": [
+"trainer.train()"
+]
+}
+],
+"metadata": {
+"colab": {
+"provenance": [
+{
+"file_id": "https://github.com/joebraha/aiproject/blob/milestone-3/training.ipynb",
+"timestamp": 1682285843150
+}
+]
+},
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.10.6"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
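
The notebook above loads `bert-base-uncased` with `num_labels=6` and feeds it the six Jigsaw toxicity columns through `TweetDataset`. Those columns are independent 0/1 flags rather than mutually exclusive classes, so a multi-label head is the usual fit; the sketch below is an illustration under that assumption (and assumes a `transformers` release that supports the `problem_type` config option), not the setup committed here.

```python
# Hedged sketch: multi-label configuration for the six toxicity flags.
# Assumptions: transformers >= 4.x (problem_type supported); labels supplied as floats.
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast

model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification",  # switches the loss to BCEWithLogitsLoss
)

# labels must be float tensors, one value per label column
batch = tokenizer(["an example comment"], truncation=True, padding=True, return_tensors="pt")
batch["labels"] = torch.tensor([[0, 0, 1, 0, 0, 0]], dtype=torch.float)
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)  # scalar loss, logits shaped (batch, 6)
```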
README.md CHANGED
@@ -10,11 +10,8 @@ pinned: false
 ---
 
 
-# Milestone 2
+# Milestone 3
 
 Here is the link to the HF space:
 https://huggingface.co/spaces/jbraha/aiproject
 
-Other notes:
-- the docker image was changed to python 3.8.9 to align with the HF deployment, so tensorflow was imported manually
-- Git actions got weird: to use a milestone branch while also deploying to HF successfully, I have a git action automatically merging milestone-2 to the main branch and then pushing to the HF space
app.py CHANGED
@@ -10,12 +10,21 @@ st.title("Sentiment Analysis")
 def analyze(input, model):
     return "This is a sample output"
 
+
+# load my fine-tuned model
+fine_tuned = None
+
+
 #text insert
 input = st.text_area("insert text to be analyzed", value="Nice to see you today.", height=None, max_chars=None, key=None, help=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
-model_name = st.text_input("choose a transformer model (nothing for default)", value="")
-if model_name:
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+option = st.selectbox(
+    'Choose a transformer model:',
+    ('Default', 'Fine-Tuned', 'Custom'))
+
+
+if option == 'Fine-Tuned':
+    model = TFAutoModelForSequenceClassification.from_pretrained(fine_tuned)
+    tokenizer = AutoTokenizer.from_pretrained(fine_tuned)
     classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
 else:
     classifier = pipeline('sentiment-analysis')
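
In this revision `fine_tuned` is still `None`, so choosing "Fine-Tuned" in the selectbox has nothing to load yet. Below is a minimal sketch of wiring in a locally saved checkpoint, assuming a hypothetical `Trainer` output directory (the path is illustrative, not a file in this commit); since training runs in PyTorch while the app uses the TensorFlow class, `from_pt=True` handles the conversion.

```python
# Hedged sketch: plugging a fine-tuned checkpoint into the Streamlit app.
# Assumptions: "results/checkpoint-500" is a hypothetical Trainer output dir,
# and the tokenizer was not saved alongside it, so the base tokenizer is reused.
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline

fine_tuned = "results/checkpoint-500"  # placeholder path, not part of the repo

model = TFAutoModelForSequenceClassification.from_pretrained(fine_tuned, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier("Nice to see you today."))
```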
data/.~lock.test.csv# DELETED
@@ -1 +0,0 @@
-,joe,mint,23.04.2023 12:27,file:///home/joe/.config/libreoffice/4;
data/.~lock.test_labels.csv# DELETED
@@ -1 +0,0 @@
-,joe,mint,23.04.2023 11:48,file:///home/joe/.config/libreoffice/4;
data/.~lock.train.csv# DELETED
@@ -1 +0,0 @@
-,joe,mint,23.04.2023 11:51,file:///home/joe/.config/libreoffice/4;
train.py ADDED
@@ -0,0 +1,143 @@
+import torch
+from torch.utils.data import Dataset, DataLoader
+
+import pandas as pd
+
+from transformers import BertTokenizerFast, BertForSequenceClassification
+from transformers import Trainer, TrainingArguments
+
+
+
+model_name = "bert-base-uncased"
+tokenizer = BertTokenizerFast.from_pretrained(model_name)
+model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
+max_len = 200
+
+training_args = TrainingArguments(
+    output_dir="results",
+    num_train_epochs=1,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=64,
+    warmup_steps=500,
+    learning_rate=5e-5,
+    weight_decay=0.01,
+    logging_dir="./logs",
+    logging_steps=10
+)
+
+# dataset class that inherits from torch.utils.data.Dataset
+class TweetDataset(Dataset):
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+        self.tok = tokenizer
+
+    def __getitem__(self, idx):
+        # encoding = self.tok(self.encodings[idx], truncation=True, padding="max_length", max_length=max_len)
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+class TokenizerDataset(Dataset):
+    def __init__(self, strings):
+        self.strings = strings
+
+    def __getitem__(self, idx):
+        return self.strings[idx]
+
+    def __len__(self):
+        return len(self.strings)
+
+
+
+
+
+train_data = pd.read_csv("data/train.csv")
+train_text = train_data["comment_text"]
+train_labels = train_data[["toxic", "severe_toxic",
+                           "obscene", "threat",
+                           "insult", "identity_hate"]]
+
+test_text = pd.read_csv("data/test.csv")["comment_text"]
+test_labels = pd.read_csv("data/test_labels.csv")[[
+    "toxic", "severe_toxic",
+    "obscene", "threat",
+    "insult", "identity_hate"]]
+
+# data preprocessing
+
+
+
+train_text = train_text.values.tolist()
+train_labels = train_labels.values.tolist()
+test_text = test_text.values.tolist()
+test_labels = test_labels.values.tolist()
+
+
+
+
+# prepare tokenizer and dataset
+
+train_strings = TokenizerDataset(train_text)
+test_strings = TokenizerDataset(test_text)
+
+train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)
+test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)
+
+
+
+
+# train_encodings = tokenizer.batch_encode_plus(train_text, \
+#     max_length=200, pad_to_max_length=True, \
+#     truncation=True, return_token_type_ids=False \
+#     )
+# test_encodings = tokenizer.batch_encode_plus(test_text, \
+#     max_length=200, pad_to_max_length=True, \
+#     truncation=True, return_token_type_ids=False \
+#     )
+
+
+train_encodings = tokenizer.encode(train_text, truncation=True, padding=True)
+test_encodings = tokenizer.encode(test_text, truncation=True, padding=True)
+
+
+f = open("traintokens.txt", 'a')
+f.write(str(train_encodings))
+f.write('\n\n\n\n\n')
+f.close()
+
+g = open("testtokens.txt", 'a')
+g.write(str(test_encodings))
+g.write('\n\n\n\n\n')
+
+g.close()
+
+
+
+# train_dataset = TweetDataset(train_encodings, train_labels)
+# test_dataset = TweetDataset(test_encodings, test_labels)
+
+
+
+
+
+# # training
+# trainer = Trainer(
+#     model=model,
+#     args=training_args,
+#     train_dataset=train_dataset,
+#     eval_dataset=test_dataset
+# )
+
+
+# trainer.train()
+
+
+
+
+
+
+
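
`train.py` appends the encodings to `traintokens.txt` / `testtokens.txt` with `write()`, which only preserves their printable form. If the goal is to cache tokenization between runs (an assumption; the commit does not say), a helper along these lines keeps the cache reloadable. It presumes the encodings come from calling the tokenizer directly (as the notebook does), which returns a dict-like `BatchEncoding`.

```python
# Hedged sketch: a round-trippable cache for tokenizer output.
# Assumption: `encodings` is a BatchEncoding from tokenizer(texts, truncation=True, padding=True).
import torch

def save_encodings(encodings, path):
    # BatchEncoding is a mapping of lists (input_ids, attention_mask, ...), so store it as a plain dict
    torch.save(dict(encodings), path)

def load_encodings(path):
    return torch.load(path)

# usage (names follow the script above):
# save_encodings(train_encodings, "traintokens.pt")
# train_encodings = load_encodings("traintokens.pt")
```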
traintokens.txt ADDED
File without changes