selinS committed
Commit c78d747
1 Parent(s): 4bca5e1

Upload 10 files

README.md CHANGED
@@ -1,13 +1,5 @@
- ---
- title: Sin Kaf
- emoji: 📊
- colorFrom: red
- colorTo: red
- sdk: gradio
- sdk_version: 4.12.0
- app_file: app.py
- pinned: false
- license: unknown
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # sin-kaf
+ # dataset link (Turkish)
+ # https://sites.google.com/site/offensevalsharedtask/more-datasets
+
+
app.py ADDED
@@ -0,0 +1,49 @@
+ import gradio as gra
+ import torch
+ import numpy as np
+ from transformers import AutoModelForSequenceClassification
+ from transformers import AutoTokenizer
+ from optimum.onnxruntime import ORTModel
+ import onnxruntime as rt
+
+
+ ort_session = rt.InferenceSession("/DATA/sin-kaf/onnx_model/model.onnx")  # ONNX Runtime session over the exported classifier
+ ort_session.get_providers()  # list the available execution providers (e.g. CPU/CUDA)
+
+ # model = ORTModel.load_model("/DATA/sin-kaf/onnx_model/model.onnx")
+ # model = AutoModelForSequenceClassification.from_pretrained('/DATA/sin-kaf/test_trainer/checkpoint-18500')
+ tokenizer = AutoTokenizer.from_pretrained("Overfit-GM/distilbert-base-turkish-cased-offensive")
+
+ def user_greeting(sent):
+     # Tokenize the input sentence with the same settings used at training time.
+     encoded_dict = tokenizer.encode_plus(
+         sent,
+         add_special_tokens=True,
+         max_length=64,
+         pad_to_max_length=True,
+         return_attention_mask=True,
+         return_tensors='pt',
+     )
+
+
+     input_ids = encoded_dict['input_ids']
+     attention_masks = encoded_dict['attention_mask']
+
+
+     input_ids = torch.cat([input_ids], dim=0)
+     input_mask = torch.cat([attention_masks], dim=0)
+
+     # ONNX Runtime expects plain lists/arrays rather than torch tensors.
+     input_feed = {
+         "input_ids": input_ids.tolist(),
+         "attention_mask": input_mask.tolist(),
+     }
+     output = ort_session.run(None, input_feed)
+     return np.argmax(output[0][0])  # index of the highest logit = predicted class
+     # outputs = model(input_ids, input_mask)
+     # return torch.argmax(outputs['logits'])
+
+
+
+ app = gra.Interface(fn=user_greeting, inputs="text", outputs="text")
+ app.launch()
+ # app.launch(server_name="0.0.0.0")
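The model.onnx file that app.py loads is not produced anywhere in this commit. As a point of reference, here is a minimal sketch of one way it could be exported from the fine-tuned checkpoint named in the commented-out lines above, assuming the optimum ONNX exporter (the checkpoint path comes from those comments; the export route itself is an assumption):

from optimum.onnxruntime import ORTModelForSequenceClassification

# Hypothetical export step: load the fine-tuned PyTorch checkpoint, convert it
# to ONNX on load (export=True), then write model.onnx and config.json together.
ort_model = ORTModelForSequenceClassification.from_pretrained(
    "/DATA/sin-kaf/test_trainer/checkpoint-18500",  # checkpoint path taken from the comments in app.py
    export=True,
)
ort_model.save_pretrained("/DATA/sin-kaf/onnx_model")

An export done this way names the graph inputs input_ids and attention_mask, matching the input_feed keys in user_greeting.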
cluster_outliers.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.ipynb ADDED
@@ -0,0 +1,1008 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# dataset link (Turkis)\n",
10
+ "# https://sites.google.com/site/offensevalsharedtask/more-datasets"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "name": "stderr",
20
+ "output_type": "stream",
21
+ "text": [
22
+ "/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/neptune/internal/backends/hosted_client.py:51: NeptuneDeprecationWarning: The 'neptune-client' package has been deprecated and will be removed in the future. Install the 'neptune' package instead. For more, see https://docs.neptune.ai/setup/upgrading/\n",
23
+ " from neptune.version import version as neptune_client_version\n",
24
+ "/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/pytorch_lightning/loggers/neptune.py:39: NeptuneDeprecationWarning: You're importing the Neptune client library via the deprecated `neptune.new` module, which will be removed in a future release. Import directly from `neptune` instead.\n",
25
+ " from neptune import new as neptune\n"
26
+ ]
27
+ }
28
+ ],
29
+ "source": [
30
+ "import os\n",
31
+ "import numpy as np\n",
32
+ "import pandas as pd\n",
33
+ "import pytorch_lightning as pl\n",
34
+ "import random\n",
35
+ "import torch\n",
36
+ "import emoji\n",
37
+ "\n",
38
+ "\n",
39
+ "import datetime\n",
40
+ "import numpy as np\n",
41
+ "import torch.optim as optim\n",
42
+ "\n",
43
+ "\n",
44
+ "import torch.nn as nn\n",
45
+ "\n",
46
+ "from torch.utils.data import DataLoader,Dataset,random_split,TensorDataset ,RandomSampler, SequentialSampler\n",
47
+ "from torchmetrics import Accuracy, F1Score \n",
48
+ "from sklearn.preprocessing import LabelEncoder\n",
49
+ "from pytorch_lightning.callbacks import EarlyStopping,ModelCheckpoint\n",
50
+ "from pytorch_lightning.loggers import TensorBoardLogger,MLFlowLogger\n",
51
+ "from sklearn.model_selection import train_test_split\n",
52
+ "\n",
53
+ "from sklearn.preprocessing import LabelEncoder\n",
54
+ "from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "data": {
64
+ "text/plain": [
65
+ "device(type='cuda', index=0)"
66
+ ]
67
+ },
68
+ "execution_count": 2,
69
+ "metadata": {},
70
+ "output_type": "execute_result"
71
+ }
72
+ ],
73
+ "source": [
74
+ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
75
+ "device"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/plain": [
86
+ "True"
87
+ ]
88
+ },
89
+ "execution_count": 3,
90
+ "metadata": {},
91
+ "output_type": "execute_result"
92
+ }
93
+ ],
94
+ "source": [
95
+ "torch.cuda.is_available()"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 4,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "seed_val = 42\n",
105
+ "random.seed(seed_val)\n",
106
+ "np.random.seed(seed_val)\n",
107
+ "torch.manual_seed(seed_val)\n",
108
+ "torch.cuda.manual_seed_all(seed_val)"
109
+ ]
110
+ },
111
+ {
112
+ "attachments": {},
113
+ "cell_type": "markdown",
114
+ "metadata": {},
115
+ "source": [
116
+ "# load dataaset\n"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 5,
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "# train_df=pd.read_csv('SemEval-2020 dataset/offenseval2020-turkish/offenseval2020-turkish/offenseval-tr-training-v1/offenseval-tr-training-v1.tsv',sep='\\t')\n",
126
+ "# test_df=pd.read_csv('SemEval-2020 dataset/offenseval2020-turkish/offenseval2020-turkish/offenseval-tr-testset-v1/offenseval-tr-testset-v1.tsv',sep='\\t')"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 6,
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "ename": "NameError",
136
+ "evalue": "name 'train_df' is not defined",
137
+ "output_type": "error",
138
+ "traceback": [
139
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
140
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
141
+ "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m train_df\u001b[39m=\u001b[39mpd\u001b[39m.\u001b[39mconcat([train_df,test_df], axis\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n\u001b[1;32m 2\u001b[0m train_df\u001b[39m=\u001b[39mtrain_df\u001b[39m.\u001b[39mdrop([\u001b[39m'\u001b[39m\u001b[39mid\u001b[39m\u001b[39m'\u001b[39m], axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n",
142
+ "\u001b[0;31mNameError\u001b[0m: name 'train_df' is not defined"
143
+ ]
144
+ }
145
+ ],
146
+ "source": [
147
+ "train_df=pd.concat([train_df,test_df], axis=0)\n",
148
+ "train_df=train_df.drop(['id'], axis=1)"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "data": {
158
+ "text/plain": [
159
+ "subtask_a\n",
160
+ "NOT 25231\n",
161
+ "OFF 6046\n",
162
+ "Name: count, dtype: int64"
163
+ ]
164
+ },
165
+ "execution_count": 7,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "train_df['subtask_a'].value_counts()"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 8,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "data=train_df['tweet'].tolist()\n",
181
+ "for i in range(len(data)):\n",
182
+ " data[i] = data[i].replace('@USER','')\n",
183
+ " data[i] = data[i].replace('#','')\n",
184
+ " data[i] = data[i].replace('$','')\n",
185
+ " data[i] = emoji.demojize(data[i])\n",
186
+ " \n",
187
+ "train_df['tweet'] = data"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 9,
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "lab = LabelEncoder()\n",
197
+ "train_df['subtask_a'] = lab.fit_transform(train_df['subtask_a'])"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 10,
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "data": {
207
+ "text/plain": [
208
+ "subtask_a\n",
209
+ "0 25231\n",
210
+ "1 6046\n",
211
+ "2 3515\n",
212
+ "Name: count, dtype: int64"
213
+ ]
214
+ },
215
+ "execution_count": 10,
216
+ "metadata": {},
217
+ "output_type": "execute_result"
218
+ }
219
+ ],
220
+ "source": [
221
+ "train_df['subtask_a'].value_counts()"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": 11,
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "train_df.drop(train_df[train_df['subtask_a'] == 2].index, inplace = True)"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 12,
236
+ "metadata": {},
237
+ "outputs": [
238
+ {
239
+ "data": {
240
+ "text/plain": [
241
+ "subtask_a\n",
242
+ "0 22345\n",
243
+ "1 5417\n",
244
+ "Name: count, dtype: int64"
245
+ ]
246
+ },
247
+ "execution_count": 12,
248
+ "metadata": {},
249
+ "output_type": "execute_result"
250
+ }
251
+ ],
252
+ "source": [
253
+ "train_df['subtask_a'].value_counts()"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 13,
259
+ "metadata": {},
260
+ "outputs": [
261
+ {
262
+ "data": {
263
+ "text/html": [
264
+ "<div>\n",
265
+ "<style scoped>\n",
266
+ " .dataframe tbody tr th:only-of-type {\n",
267
+ " vertical-align: middle;\n",
268
+ " }\n",
269
+ "\n",
270
+ " .dataframe tbody tr th {\n",
271
+ " vertical-align: top;\n",
272
+ " }\n",
273
+ "\n",
274
+ " .dataframe thead th {\n",
275
+ " text-align: right;\n",
276
+ " }\n",
277
+ "</style>\n",
278
+ "<table border=\"1\" class=\"dataframe\">\n",
279
+ " <thead>\n",
280
+ " <tr style=\"text-align: right;\">\n",
281
+ " <th></th>\n",
282
+ " <th>tweet</th>\n",
283
+ " <th>subtask_a</th>\n",
284
+ " </tr>\n",
285
+ " </thead>\n",
286
+ " <tbody>\n",
287
+ " <tr>\n",
288
+ " <th>3515</th>\n",
289
+ " <td>holstein ineği (alacalı siyah-beyaz inek, yani...</td>\n",
290
+ " <td>0</td>\n",
291
+ " </tr>\n",
292
+ " <tr>\n",
293
+ " <th>3516</th>\n",
294
+ " <td>Haaaa. O zaman oylar Binali'ye demek.</td>\n",
295
+ " <td>0</td>\n",
296
+ " </tr>\n",
297
+ " <tr>\n",
298
+ " <th>3517</th>\n",
299
+ " <td>Disk genel merkez yönetimine HDP'nin hiç etki...</td>\n",
300
+ " <td>0</td>\n",
301
+ " </tr>\n",
302
+ " <tr>\n",
303
+ " <th>3518</th>\n",
304
+ " <td>Bir insanı zorla kaliteli yapamazsın. Sen elin...</td>\n",
305
+ " <td>0</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>3519</th>\n",
309
+ " <td>Sus yaa açtım sonra korkudan telefon elimden ...</td>\n",
310
+ " <td>0</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th>...</th>\n",
314
+ " <td>...</td>\n",
315
+ " <td>...</td>\n",
316
+ " </tr>\n",
317
+ " <tr>\n",
318
+ " <th>31272</th>\n",
319
+ " <td>Bu ödül sunan kızı kim giydirdiyse, kızın en b...</td>\n",
320
+ " <td>0</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>31273</th>\n",
324
+ " <td>Bunu sana beddua olarak etmiyorum bunlar ilerd...</td>\n",
325
+ " <td>0</td>\n",
326
+ " </tr>\n",
327
+ " <tr>\n",
328
+ " <th>31274</th>\n",
329
+ " <td>CHP'liler sandıkları bırakmıyor üstüne oturmuş...</td>\n",
330
+ " <td>1</td>\n",
331
+ " </tr>\n",
332
+ " <tr>\n",
333
+ " <th>31275</th>\n",
334
+ " <td>karanlığın içinde yalnız kalsam ne oluuuuurr</td>\n",
335
+ " <td>0</td>\n",
336
+ " </tr>\n",
337
+ " <tr>\n",
338
+ " <th>31276</th>\n",
339
+ " <td>Ne yalan söyleyeyim bu haftalıkta fitil olara...</td>\n",
340
+ " <td>0</td>\n",
341
+ " </tr>\n",
342
+ " </tbody>\n",
343
+ "</table>\n",
344
+ "<p>27762 rows × 2 columns</p>\n",
345
+ "</div>"
346
+ ],
347
+ "text/plain": [
348
+ " tweet subtask_a\n",
349
+ "3515 holstein ineği (alacalı siyah-beyaz inek, yani... 0\n",
350
+ "3516 Haaaa. O zaman oylar Binali'ye demek. 0\n",
351
+ "3517 Disk genel merkez yönetimine HDP'nin hiç etki... 0\n",
352
+ "3518 Bir insanı zorla kaliteli yapamazsın. Sen elin... 0\n",
353
+ "3519 Sus yaa açtım sonra korkudan telefon elimden ... 0\n",
354
+ "... ... ...\n",
355
+ "31272 Bu ödül sunan kızı kim giydirdiyse, kızın en b... 0\n",
356
+ "31273 Bunu sana beddua olarak etmiyorum bunlar ilerd... 0\n",
357
+ "31274 CHP'liler sandıkları bırakmıyor üstüne oturmuş... 1\n",
358
+ "31275 karanlığın içinde yalnız kalsam ne oluuuuurr 0\n",
359
+ "31276 Ne yalan söyleyeyim bu haftalıkta fitil olara... 0\n",
360
+ "\n",
361
+ "[27762 rows x 2 columns]"
362
+ ]
363
+ },
364
+ "execution_count": 13,
365
+ "metadata": {},
366
+ "output_type": "execute_result"
367
+ }
368
+ ],
369
+ "source": [
370
+ "train_df"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 14,
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "data = train_df.tweet.values\n",
380
+ "labels = train_df.subtask_a.values"
381
+ ]
382
+ },
383
+ {
384
+ "attachments": {},
385
+ "cell_type": "markdown",
386
+ "metadata": {},
387
+ "source": [
388
+ "# BERT Tokenizer"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "code",
393
+ "execution_count": 15,
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", do_basic_tokenize=True)\n",
398
+ "# tokenizer.add_tokens(data)"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": 16,
404
+ "metadata": {},
405
+ "outputs": [
406
+ {
407
+ "name": "stdout",
408
+ "output_type": "stream",
409
+ "text": [
410
+ " Original: Sallandık diyorum, merkezi bilmiyorum, sokağa fırlamadım, duruyorum. Senden bir açıklama gelmeden, ben bu sandığı terketmiyorum \n",
411
+ "Tokenized: ['Sal', '##landı', '##k', 'di', '##yor', '##um', ',', 'merkezi', 'bil', '##mi', '##yor', '##um', ',', 'sok', '##a', '##ğa', 'f', '##ır', '##lama', '##dı', '##m', ',', 'dur', '##uy', '##orum', '.', 'Sen', '##den', 'bir', 'açık', '##lama', 'gel', '##mede', '##n', ',', 'ben', 'bu', 'sand', '##ığı', 'ter', '##ket', '##mi', '##yor', '##um']\n",
412
+ "Token IDs: [64831, 35783, 10174, 10120, 26101, 10465, 117, 47522, 13897, 10500, 26101, 10465, 117, 29509, 10113, 25163, 174, 17145, 24540, 17532, 10147, 117, 28959, 53452, 28048, 119, 18082, 10633, 10561, 71769, 24540, 74458, 59268, 10115, 117, 11015, 11499, 45989, 28581, 12718, 13650, 10500, 26101, 10465]\n"
413
+ ]
414
+ }
415
+ ],
416
+ "source": [
417
+ "print(' Original: ', data[78])\n",
418
+ "print('Tokenized: ', tokenizer.tokenize(data[78]))\n",
419
+ "print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(data[78])))"
420
+ ]
421
+ },
422
+ {
423
+ "attachments": {},
424
+ "cell_type": "markdown",
425
+ "metadata": {},
426
+ "source": [
427
+ "# Tokenize Dataset"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 17,
433
+ "metadata": {},
434
+ "outputs": [
435
+ {
436
+ "name": "stderr",
437
+ "output_type": "stream",
438
+ "text": [
439
+ "Token indices sequence length is longer than the specified maximum sequence length for this model (1277 > 512). Running this sequence through the model will result in indexing errors\n"
440
+ ]
441
+ },
442
+ {
443
+ "name": "stdout",
444
+ "output_type": "stream",
445
+ "text": [
446
+ "Max sentence length: 6906\n"
447
+ ]
448
+ }
449
+ ],
450
+ "source": [
451
+ "max_len = 0\n",
452
+ "for sent in data:\n",
453
+ "\n",
454
+ " input_ids = tokenizer.encode(sent, add_special_tokens=True)\n",
455
+ " max_len = max(max_len, len(input_ids))\n",
456
+ "\n",
457
+ "print('Max sentence length: ', max_len)"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 18,
463
+ "metadata": {},
464
+ "outputs": [
465
+ {
466
+ "name": "stderr",
467
+ "output_type": "stream",
468
+ "text": [
469
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
470
+ "/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:2418: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
471
+ " warnings.warn(\n"
472
+ ]
473
+ },
474
+ {
475
+ "name": "stdout",
476
+ "output_type": "stream",
477
+ "text": [
478
+ "Original: holstein ineği (alacalı siyah-beyaz inek, yani hollanda ineği) en verimli süt alınan inek ırkıymış, trt belgesel'de öyle söylediler\n",
479
+ "Token IDs: tensor([ 101, 110516, 16206, 10106, 10112, 16054, 113, 21739, 15794,\n",
480
+ " 10713, 34543, 10237, 118, 110744, 10106, 10707, 117, 84251,\n",
481
+ " 46232, 41971, 10106, 10112, 16054, 114, 10110, 55011, 98373,\n",
482
+ " 187, 41559, 10164, 65890, 10106, 10707, 321, 16299, 10713,\n",
483
+ " 16889, 19733, 117, 32221, 10123, 34831, 12912, 112, 10104,\n",
484
+ " 276, 18369, 100721, 18369, 28113, 10165, 102, 0, 0,\n",
485
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
486
+ " 0])\n"
487
+ ]
488
+ }
489
+ ],
490
+ "source": [
491
+ "input_ids = []\n",
492
+ "attention_masks = []\n",
493
+ "\n",
494
+ "for sent in data:\n",
495
+ " encoded_dict = tokenizer.encode_plus(\n",
496
+ " sent, \n",
497
+ " add_special_tokens = True, \n",
498
+ " max_length = 64, \n",
499
+ " pad_to_max_length = True,\n",
500
+ " return_attention_mask = True, \n",
501
+ " return_tensors = 'pt', \n",
502
+ " )\n",
503
+ " \n",
504
+ " \n",
505
+ " input_ids.append(encoded_dict['input_ids'])\n",
506
+ " attention_masks.append(encoded_dict['attention_mask'])\n",
507
+ "\n",
508
+ "\n",
509
+ "input_ids = torch.cat(input_ids, dim=0)\n",
510
+ "attention_masks = torch.cat(attention_masks, dim=0)\n",
511
+ "labels = torch.tensor(labels)\n",
512
+ "\n",
513
+ "\n",
514
+ "print('Original: ', data[0])\n",
515
+ "print('Token IDs:', input_ids[0])"
516
+ ]
517
+ },
518
+ {
519
+ "attachments": {},
520
+ "cell_type": "markdown",
521
+ "metadata": {},
522
+ "source": [
523
+ "# Split Dataset"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 19,
529
+ "metadata": {},
530
+ "outputs": [
531
+ {
532
+ "name": "stdout",
533
+ "output_type": "stream",
534
+ "text": [
535
+ "24,985 training samples\n",
536
+ "2,777 validation samples\n"
537
+ ]
538
+ }
539
+ ],
540
+ "source": [
541
+ "dataset = TensorDataset(input_ids, attention_masks, labels)\n",
542
+ "train_size = int(0.9 * len(dataset))\n",
543
+ "val_size = len(dataset) - train_size\n",
544
+ "\n",
545
+ "\n",
546
+ "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n",
547
+ "\n",
548
+ "print('{:>5,} training samples'.format(train_size))\n",
549
+ "print('{:>5,} validation samples'.format(val_size))"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": 20,
555
+ "metadata": {},
556
+ "outputs": [
557
+ {
558
+ "name": "stderr",
559
+ "output_type": "stream",
560
+ "text": [
561
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
562
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
563
+ ]
564
+ },
565
+ {
566
+ "data": {
567
+ "text/plain": [
568
+ "BertForSequenceClassification(\n",
569
+ " (bert): BertModel(\n",
570
+ " (embeddings): BertEmbeddings(\n",
571
+ " (word_embeddings): Embedding(119547, 768, padding_idx=0)\n",
572
+ " (position_embeddings): Embedding(512, 768)\n",
573
+ " (token_type_embeddings): Embedding(2, 768)\n",
574
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
575
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
576
+ " )\n",
577
+ " (encoder): BertEncoder(\n",
578
+ " (layer): ModuleList(\n",
579
+ " (0-11): 12 x BertLayer(\n",
580
+ " (attention): BertAttention(\n",
581
+ " (self): BertSelfAttention(\n",
582
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
583
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
584
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
585
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
586
+ " )\n",
587
+ " (output): BertSelfOutput(\n",
588
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
589
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
590
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
591
+ " )\n",
592
+ " )\n",
593
+ " (intermediate): BertIntermediate(\n",
594
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
595
+ " (intermediate_act_fn): GELUActivation()\n",
596
+ " )\n",
597
+ " (output): BertOutput(\n",
598
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
599
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
600
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
601
+ " )\n",
602
+ " )\n",
603
+ " )\n",
604
+ " )\n",
605
+ " (pooler): BertPooler(\n",
606
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
607
+ " (activation): Tanh()\n",
608
+ " )\n",
609
+ " )\n",
610
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
611
+ " (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
612
+ ")"
613
+ ]
614
+ },
615
+ "execution_count": 20,
616
+ "metadata": {},
617
+ "output_type": "execute_result"
618
+ }
619
+ ],
620
+ "source": [
621
+ "from transformers import BertForSequenceClassification, AdamW, BertConfig\n",
622
+ "\n",
623
+ "model = BertForSequenceClassification.from_pretrained(\n",
624
+ " \"bert-base-multilingual-cased\",\n",
625
+ " num_labels = 2, \n",
626
+ " output_attentions = False,\n",
627
+ " output_hidden_states = False, \n",
628
+ ")\n",
629
+ "\n",
630
+ "model.cuda()"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": 21,
636
+ "metadata": {},
637
+ "outputs": [
638
+ {
639
+ "name": "stdout",
640
+ "output_type": "stream",
641
+ "text": [
642
+ "The BERT model has 201 different named parameters.\n",
643
+ "\n",
644
+ "==== Embedding Layer ====\n",
645
+ "\n",
646
+ "bert.embeddings.word_embeddings.weight (119547, 768)\n",
647
+ "bert.embeddings.position_embeddings.weight (512, 768)\n",
648
+ "bert.embeddings.token_type_embeddings.weight (2, 768)\n",
649
+ "bert.embeddings.LayerNorm.weight (768,)\n",
650
+ "bert.embeddings.LayerNorm.bias (768,)\n",
651
+ "\n",
652
+ "==== First Transformer ====\n",
653
+ "\n",
654
+ "bert.encoder.layer.0.attention.self.query.weight (768, 768)\n",
655
+ "bert.encoder.layer.0.attention.self.query.bias (768,)\n",
656
+ "bert.encoder.layer.0.attention.self.key.weight (768, 768)\n",
657
+ "bert.encoder.layer.0.attention.self.key.bias (768,)\n",
658
+ "bert.encoder.layer.0.attention.self.value.weight (768, 768)\n",
659
+ "bert.encoder.layer.0.attention.self.value.bias (768,)\n",
660
+ "bert.encoder.layer.0.attention.output.dense.weight (768, 768)\n",
661
+ "bert.encoder.layer.0.attention.output.dense.bias (768,)\n",
662
+ "bert.encoder.layer.0.attention.output.LayerNorm.weight (768,)\n",
663
+ "bert.encoder.layer.0.attention.output.LayerNorm.bias (768,)\n",
664
+ "bert.encoder.layer.0.intermediate.dense.weight (3072, 768)\n",
665
+ "bert.encoder.layer.0.intermediate.dense.bias (3072,)\n",
666
+ "bert.encoder.layer.0.output.dense.weight (768, 3072)\n",
667
+ "bert.encoder.layer.0.output.dense.bias (768,)\n",
668
+ "bert.encoder.layer.0.output.LayerNorm.weight (768,)\n",
669
+ "bert.encoder.layer.0.output.LayerNorm.bias (768,)\n",
670
+ "\n",
671
+ "==== Output Layer ====\n",
672
+ "\n",
673
+ "bert.pooler.dense.weight (768, 768)\n",
674
+ "bert.pooler.dense.bias (768,)\n",
675
+ "classifier.weight (2, 768)\n",
676
+ "classifier.bias (2,)\n"
677
+ ]
678
+ }
679
+ ],
680
+ "source": [
681
+ "params = list(model.named_parameters())\n",
682
+ "\n",
683
+ "print('The BERT model has {:} different named parameters.\\n'.format(len(params)))\n",
684
+ "\n",
685
+ "print('==== Embedding Layer ====\\n')\n",
686
+ "\n",
687
+ "for p in params[0:5]:\n",
688
+ " print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n",
689
+ "\n",
690
+ "print('\\n==== First Transformer ====\\n')\n",
691
+ "\n",
692
+ "for p in params[5:21]:\n",
693
+ " print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n",
694
+ "\n",
695
+ "print('\\n==== Output Layer ====\\n')\n",
696
+ "\n",
697
+ "for p in params[-4:]:\n",
698
+ " print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": 22,
704
+ "metadata": {},
705
+ "outputs": [
706
+ {
707
+ "name": "stderr",
708
+ "output_type": "stream",
709
+ "text": [
710
+ "/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
711
+ " warnings.warn(\n"
712
+ ]
713
+ }
714
+ ],
715
+ "source": [
716
+ "optimizer = AdamW(model.parameters(),\n",
717
+ " lr = 2e-5,\n",
718
+ " eps = 1e-8\n",
719
+ " )"
720
+ ]
721
+ },
722
+ {
723
+ "cell_type": "code",
724
+ "execution_count": 23,
725
+ "metadata": {},
726
+ "outputs": [],
727
+ "source": [
728
+ "def flat_accuracy(preds, labels):\n",
729
+ " pred_flat = np.argmax(preds, axis=1).flatten()\n",
730
+ " labels_flat = labels.flatten()\n",
731
+ " return np.sum(pred_flat == labels_flat) / len(labels_flat)"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": 24,
737
+ "metadata": {},
738
+ "outputs": [],
739
+ "source": [
740
+ "def format_time(elapsed):\n",
741
+ "\n",
742
+ " elapsed_rounded = int(round((elapsed)))\n",
743
+ " return str(datetime.timedelta(seconds=elapsed_rounded))\n"
744
+ ]
745
+ },
746
+ {
747
+ "cell_type": "code",
748
+ "execution_count": 25,
749
+ "metadata": {},
750
+ "outputs": [],
751
+ "source": [
752
+ "class sinKafModel(pl.LightningModule):\n",
753
+ " def __init__(self, model, optimizer, scheduler):\n",
754
+ " super().__init__()\n",
755
+ " self.model = model\n",
756
+ " self.optimizer = optimizer\n",
757
+ " self.scheduler = scheduler\n",
758
+ "\n",
759
+ "\n",
760
+ " def forward(self, input_ids, attention_mask, labels):\n",
761
+ " outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)\n",
762
+ " return outputs\n",
763
+ "\n",
764
+ " def training_step(self, batch, batch_idx):\n",
765
+ " input_ids, input_mask, labels = batch\n",
766
+ " outputs = self(input_ids, input_mask, labels)\n",
767
+ " loss = outputs.loss\n",
768
+ " self.log('train_loss', loss)\n",
769
+ " return loss\n",
770
+ "\n",
771
+ " def validation_step(self, batch, batch_idx):\n",
772
+ " input_ids, input_mask, labels = batch\n",
773
+ " outputs = self(input_ids, input_mask, labels)\n",
774
+ " loss = outputs.loss\n",
775
+ " logits = outputs.logits\n",
776
+ " preds = torch.argmax(logits, dim=1)\n",
777
+ " acc = (preds == labels).sum().item() / len(labels)\n",
778
+ " self.log('val_loss', loss)\n",
779
+ " self.log('val_acc', acc)\n",
780
+ " return loss\n",
781
+ "\n",
782
+ " def configure_optimizers(self):\n",
783
+ " return [self.optimizer], [self.scheduler]\n",
784
+ "\n",
785
+ " # def train_dataloader(self):\n",
786
+ " # return self.train_dataloader\n",
787
+ "\n",
788
+ " # def val_dataloader(self):\n",
789
+ " # return self.validation_dataloader\n"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 26,
795
+ "metadata": {},
796
+ "outputs": [],
797
+ "source": [
798
+ "train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = 2 )\n",
799
+ "validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = 2 )"
800
+ ]
801
+ },
802
+ {
803
+ "cell_type": "code",
804
+ "execution_count": 27,
805
+ "metadata": {},
806
+ "outputs": [],
807
+ "source": [
808
+ "epochs = 4\n",
809
+ "total_steps = len(train_dataloader) * epochs\n",
810
+ "scheduler = get_linear_schedule_with_warmup(optimizer, \n",
811
+ " num_warmup_steps = 0, \n",
812
+ " num_training_steps = total_steps)"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": 28,
818
+ "metadata": {},
819
+ "outputs": [
820
+ {
821
+ "name": "stderr",
822
+ "output_type": "stream",
823
+ "text": [
824
+ "GPU available: True (cuda), used: True\n",
825
+ "TPU available: False, using: 0 TPU cores\n",
826
+ "IPU available: False, using: 0 IPUs\n",
827
+ "HPU available: False, using: 0 HPUs\n",
828
+ "/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
829
+ " warning_cache.warn(\n"
830
+ ]
831
+ },
832
+ {
833
+ "name": "stderr",
834
+ "output_type": "stream",
835
+ "text": [
836
+ "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
837
+ "\n",
838
+ " | Name | Type | Params\n",
839
+ "--------------------------------------------------------\n",
840
+ "0 | model | BertForSequenceClassification | 177 M \n",
841
+ "--------------------------------------------------------\n",
842
+ "177 M Trainable params\n",
843
+ "0 Non-trainable params\n",
844
+ "177 M Total params\n",
845
+ "711.420 Total estimated model params size (MB)\n"
846
+ ]
847
+ },
848
+ {
849
+ "name": "stdout",
850
+ "output_type": "stream",
851
+ "text": [
852
+ "Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]"
853
+ ]
854
+ },
855
+ {
856
+ "name": "stderr",
857
+ "output_type": "stream",
858
+ "text": [
859
+ "/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:442: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
860
+ " rank_zero_warn(\n"
861
+ ]
862
+ },
863
+ {
864
+ "name": "stdout",
865
+ "output_type": "stream",
866
+ "text": [
867
+ " "
868
+ ]
869
+ },
870
+ {
871
+ "name": "stderr",
872
+ "output_type": "stream",
873
+ "text": [
874
+ "/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:442: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
875
+ " rank_zero_warn(\n"
876
+ ]
877
+ },
878
+ {
879
+ "name": "stdout",
880
+ "output_type": "stream",
881
+ "text": [
882
+ "Epoch 0: 0%| | 1/1249 [00:00<05:01, 4.13it/s, v_num=6]"
883
+ ]
884
+ },
885
+ {
886
+ "ename": "OutOfMemoryError",
887
+ "evalue": "CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 4.00 GiB total capacity; 2.67 GiB already allocated; 0 bytes free; 2.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
888
+ "output_type": "error",
889
+ "traceback": [
890
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
891
+ "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
892
+ "Cell \u001b[0;32mIn[28], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[39m=\u001b[39m sinKafModel(model, optimizer, scheduler)\n\u001b[1;32m 2\u001b[0m trainer \u001b[39m=\u001b[39m pl\u001b[39m.\u001b[39mTrainer( max_epochs\u001b[39m=\u001b[39mepochs, limit_train_batches\u001b[39m=\u001b[39m\u001b[39m0.1\u001b[39m, devices\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m, accelerator\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mgpu\u001b[39m\u001b[39m'\u001b[39m) \n\u001b[0;32m----> 3\u001b[0m trainer\u001b[39m.\u001b[39;49mfit(model,train_dataloader,validation_dataloader )\n",
893
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:532\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstrategy\u001b[39m.\u001b[39m_lightning_module \u001b[39m=\u001b[39m model\n\u001b[1;32m 531\u001b[0m _verify_strategy_supports_compile(model, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstrategy)\n\u001b[0;32m--> 532\u001b[0m call\u001b[39m.\u001b[39;49m_call_and_handle_interrupt(\n\u001b[1;32m 533\u001b[0m \u001b[39mself\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path\n\u001b[1;32m 534\u001b[0m )\n",
894
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:43\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[39mif\u001b[39;00m trainer\u001b[39m.\u001b[39mstrategy\u001b[39m.\u001b[39mlauncher \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 42\u001b[0m \u001b[39mreturn\u001b[39;00m trainer\u001b[39m.\u001b[39mstrategy\u001b[39m.\u001b[39mlauncher\u001b[39m.\u001b[39mlaunch(trainer_fn, \u001b[39m*\u001b[39margs, trainer\u001b[39m=\u001b[39mtrainer, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m---> 43\u001b[0m \u001b[39mreturn\u001b[39;00m trainer_fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 45\u001b[0m \u001b[39mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 46\u001b[0m _call_teardown_hook(trainer)\n",
895
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:571\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_data_connector\u001b[39m.\u001b[39mattach_data(\n\u001b[1;32m 562\u001b[0m model, train_dataloaders\u001b[39m=\u001b[39mtrain_dataloaders, val_dataloaders\u001b[39m=\u001b[39mval_dataloaders, datamodule\u001b[39m=\u001b[39mdatamodule\n\u001b[1;32m 563\u001b[0m )\n\u001b[1;32m 565\u001b[0m ckpt_path \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_checkpoint_connector\u001b[39m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 566\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mfn,\n\u001b[1;32m 567\u001b[0m ckpt_path,\n\u001b[1;32m 568\u001b[0m model_provided\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 569\u001b[0m model_connected\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlightning_module \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 570\u001b[0m )\n\u001b[0;32m--> 571\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run(model, ckpt_path\u001b[39m=\u001b[39;49mckpt_path)\n\u001b[1;32m 573\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mstopped\n\u001b[1;32m 574\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtraining \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n",
896
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:980\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_signal_connector\u001b[39m.\u001b[39mregister_signal_handlers()\n\u001b[1;32m 977\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[1;32m 978\u001b[0m \u001b[39m# RUN THE TRAINER\u001b[39;00m\n\u001b[1;32m 979\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[0;32m--> 980\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run_stage()\n\u001b[1;32m 982\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[1;32m 983\u001b[0m \u001b[39m# POST-Training CLEAN UP\u001b[39;00m\n\u001b[1;32m 984\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[1;32m 985\u001b[0m log\u001b[39m.\u001b[39mdebug(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m: trainer tearing down\u001b[39m\u001b[39m\"\u001b[39m)\n",
897
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1023\u001b[0m, in \u001b[0;36mTrainer._run_stage\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_run_sanity_check()\n\u001b[1;32m 1022\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39mautograd\u001b[39m.\u001b[39mset_detect_anomaly(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_detect_anomaly):\n\u001b[0;32m-> 1023\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfit_loop\u001b[39m.\u001b[39;49mrun()\n\u001b[1;32m 1024\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 1025\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mUnexpected state \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n",
898
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:202\u001b[0m, in \u001b[0;36m_FitLoop.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 201\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mon_advance_start()\n\u001b[0;32m--> 202\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49madvance()\n\u001b[1;32m 203\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mon_advance_end()\n\u001b[1;32m 204\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_restarting \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n",
899
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:355\u001b[0m, in \u001b[0;36m_FitLoop.advance\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_data_fetcher\u001b[39m.\u001b[39msetup(combined_loader)\n\u001b[1;32m 354\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtrainer\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mprofile(\u001b[39m\"\u001b[39m\u001b[39mrun_training_epoch\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 355\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mepoch_loop\u001b[39m.\u001b[39;49mrun(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_data_fetcher)\n",
900
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py:133\u001b[0m, in \u001b[0;36m_TrainingEpochLoop.run\u001b[0;34m(self, data_fetcher)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdone:\n\u001b[1;32m 132\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 133\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49madvance(data_fetcher)\n\u001b[1;32m 134\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mon_advance_end()\n\u001b[1;32m 135\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_restarting \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n",
901
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py:219\u001b[0m, in \u001b[0;36m_TrainingEpochLoop.advance\u001b[0;34m(self, data_fetcher)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[39mwith\u001b[39;00m trainer\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mprofile(\u001b[39m\"\u001b[39m\u001b[39mrun_training_batch\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 217\u001b[0m \u001b[39mif\u001b[39;00m trainer\u001b[39m.\u001b[39mlightning_module\u001b[39m.\u001b[39mautomatic_optimization:\n\u001b[1;32m 218\u001b[0m \u001b[39m# in automatic optimization, there can only be one optimizer\u001b[39;00m\n\u001b[0;32m--> 219\u001b[0m batch_output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mautomatic_optimization\u001b[39m.\u001b[39;49mrun(trainer\u001b[39m.\u001b[39;49moptimizers[\u001b[39m0\u001b[39;49m], kwargs)\n\u001b[1;32m 220\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 221\u001b[0m batch_output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmanual_optimization\u001b[39m.\u001b[39mrun(kwargs)\n",
902
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py:188\u001b[0m, in \u001b[0;36m_AutomaticOptimization.run\u001b[0;34m(self, optimizer, kwargs)\u001b[0m\n\u001b[1;32m 181\u001b[0m closure()\n\u001b[1;32m 183\u001b[0m \u001b[39m# ------------------------------\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[39m# BACKWARD PASS\u001b[39;00m\n\u001b[1;32m 185\u001b[0m \u001b[39m# ------------------------------\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \u001b[39m# gradient update with accumulated gradients\u001b[39;00m\n\u001b[1;32m 187\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 188\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_optimizer_step(kwargs\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mbatch_idx\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m0\u001b[39;49m), closure)\n\u001b[1;32m 190\u001b[0m result \u001b[39m=\u001b[39m closure\u001b[39m.\u001b[39mconsume_result()\n\u001b[1;32m 191\u001b[0m \u001b[39mif\u001b[39;00m result\u001b[39m.\u001b[39mloss \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
903
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py:266\u001b[0m, in \u001b[0;36m_AutomaticOptimization._optimizer_step\u001b[0;34m(self, batch_idx, train_step_and_backward_closure)\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptim_progress\u001b[39m.\u001b[39moptimizer\u001b[39m.\u001b[39mstep\u001b[39m.\u001b[39mincrement_ready()\n\u001b[1;32m 265\u001b[0m \u001b[39m# model hook\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m call\u001b[39m.\u001b[39;49m_call_lightning_module_hook(\n\u001b[1;32m 267\u001b[0m trainer,\n\u001b[1;32m 268\u001b[0m \u001b[39m\"\u001b[39;49m\u001b[39moptimizer_step\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 269\u001b[0m trainer\u001b[39m.\u001b[39;49mcurrent_epoch,\n\u001b[1;32m 270\u001b[0m batch_idx,\n\u001b[1;32m 271\u001b[0m optimizer,\n\u001b[1;32m 272\u001b[0m train_step_and_backward_closure,\n\u001b[1;32m 273\u001b[0m )\n\u001b[1;32m 275\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m should_accumulate:\n\u001b[1;32m 276\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptim_progress\u001b[39m.\u001b[39moptimizer\u001b[39m.\u001b[39mstep\u001b[39m.\u001b[39mincrement_completed()\n",
904
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:146\u001b[0m, in \u001b[0;36m_call_lightning_module_hook\u001b[0;34m(trainer, hook_name, pl_module, *args, **kwargs)\u001b[0m\n\u001b[1;32m 143\u001b[0m pl_module\u001b[39m.\u001b[39m_current_fx_name \u001b[39m=\u001b[39m hook_name\n\u001b[1;32m 145\u001b[0m \u001b[39mwith\u001b[39;00m trainer\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mprofile(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m[LightningModule]\u001b[39m\u001b[39m{\u001b[39;00mpl_module\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m{\u001b[39;00mhook_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 146\u001b[0m output \u001b[39m=\u001b[39m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 148\u001b[0m \u001b[39m# restore current_fx when nested context\u001b[39;00m\n\u001b[1;32m 149\u001b[0m pl_module\u001b[39m.\u001b[39m_current_fx_name \u001b[39m=\u001b[39m prev_fx_name\n",
905
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/core/module.py:1270\u001b[0m, in \u001b[0;36mLightningModule.optimizer_step\u001b[0;34m(self, epoch, batch_idx, optimizer, optimizer_closure)\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39moptimizer_step\u001b[39m(\n\u001b[1;32m 1233\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 1234\u001b[0m epoch: \u001b[39mint\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1237\u001b[0m optimizer_closure: Optional[Callable[[], Any]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 1238\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 1239\u001b[0m \u001b[39m \u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\"\"Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer`\u001b[39;00m\n\u001b[1;32m 1240\u001b[0m \u001b[39m calls the optimizer.\u001b[39;00m\n\u001b[1;32m 1241\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1268\u001b[0m \u001b[39m pg[\"lr\"] = lr_scale * self.learning_rate\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1270\u001b[0m optimizer\u001b[39m.\u001b[39;49mstep(closure\u001b[39m=\u001b[39;49moptimizer_closure)\n",
906
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py:161\u001b[0m, in \u001b[0;36mLightningOptimizer.step\u001b[0;34m(self, closure, **kwargs)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[39mraise\u001b[39;00m MisconfigurationException(\u001b[39m\"\u001b[39m\u001b[39mWhen `optimizer.step(closure)` is called, the closure should be callable\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 160\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_strategy \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m--> 161\u001b[0m step_output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_strategy\u001b[39m.\u001b[39;49moptimizer_step(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_optimizer, closure, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 163\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_on_after_step()\n\u001b[1;32m 165\u001b[0m \u001b[39mreturn\u001b[39;00m step_output\n",
907
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:231\u001b[0m, in \u001b[0;36mStrategy.optimizer_step\u001b[0;34m(self, optimizer, closure, model, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[39m# TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed\u001b[39;00m\n\u001b[1;32m 230\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(model, pl\u001b[39m.\u001b[39mLightningModule)\n\u001b[0;32m--> 231\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprecision_plugin\u001b[39m.\u001b[39;49moptimizer_step(optimizer, model\u001b[39m=\u001b[39;49mmodel, closure\u001b[39m=\u001b[39;49mclosure, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
908
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:116\u001b[0m, in \u001b[0;36mPrecisionPlugin.optimizer_step\u001b[0;34m(self, optimizer, model, closure, **kwargs)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Hook to run the optimizer step.\"\"\"\u001b[39;00m\n\u001b[1;32m 115\u001b[0m closure \u001b[39m=\u001b[39m partial(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_wrap_closure, model, optimizer, closure)\n\u001b[0;32m--> 116\u001b[0m \u001b[39mreturn\u001b[39;00m optimizer\u001b[39m.\u001b[39;49mstep(closure\u001b[39m=\u001b[39;49mclosure, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
909
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/torch/optim/lr_scheduler.py:69\u001b[0m, in \u001b[0;36mLRScheduler.__init__.<locals>.with_counter.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 67\u001b[0m instance\u001b[39m.\u001b[39m_step_count \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m 68\u001b[0m wrapped \u001b[39m=\u001b[39m func\u001b[39m.\u001b[39m\u001b[39m__get__\u001b[39m(instance, \u001b[39mcls\u001b[39m)\n\u001b[0;32m---> 69\u001b[0m \u001b[39mreturn\u001b[39;00m wrapped(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
910
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/torch/optim/optimizer.py:280\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 277\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m}\u001b[39;00m\u001b[39m must return None or a tuple of (new_args, new_kwargs),\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 278\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mbut got \u001b[39m\u001b[39m{\u001b[39;00mresult\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 280\u001b[0m out \u001b[39m=\u001b[39m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 281\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m 283\u001b[0m \u001b[39m# call optimizer step post hooks\u001b[39;00m\n",
911
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[39m@functools\u001b[39m\u001b[39m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdecorate_context\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[39mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
912
+ "File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/transformers/optimization.py:468\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 466\u001b[0m exp_avg\u001b[39m.\u001b[39mmul_(beta1)\u001b[39m.\u001b[39madd_(grad, alpha\u001b[39m=\u001b[39m(\u001b[39m1.0\u001b[39m \u001b[39m-\u001b[39m beta1))\n\u001b[1;32m 467\u001b[0m exp_avg_sq\u001b[39m.\u001b[39mmul_(beta2)\u001b[39m.\u001b[39maddcmul_(grad, grad, value\u001b[39m=\u001b[39m\u001b[39m1.0\u001b[39m \u001b[39m-\u001b[39m beta2)\n\u001b[0;32m--> 468\u001b[0m denom \u001b[39m=\u001b[39m exp_avg_sq\u001b[39m.\u001b[39;49msqrt()\u001b[39m.\u001b[39madd_(group[\u001b[39m\"\u001b[39m\u001b[39meps\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 470\u001b[0m step_size \u001b[39m=\u001b[39m group[\u001b[39m\"\u001b[39m\u001b[39mlr\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 471\u001b[0m \u001b[39mif\u001b[39;00m group[\u001b[39m\"\u001b[39m\u001b[39mcorrect_bias\u001b[39m\u001b[39m\"\u001b[39m]: \u001b[39m# No bias correction for Bert\u001b[39;00m\n",
913
+ "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 4.00 GiB total capacity; 2.67 GiB already allocated; 0 bytes free; 2.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
914
+ ]
915
+ }
916
+ ],
917
+ "source": [
918
+ "model = sinKafModel(model, optimizer, scheduler)\n",
919
+ "trainer = pl.Trainer( max_epochs=epochs, limit_train_batches=0.1, devices=1, accelerator='gpu') \n",
920
+ "trainer.fit(model,train_dataloader,validation_dataloader )"
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "code",
925
+ "execution_count": null,
926
+ "metadata": {},
927
+ "outputs": [],
928
+ "source": [
929
+ "sent = 'Koyunlar hasta'"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": null,
935
+ "metadata": {},
936
+ "outputs": [],
937
+ "source": [
938
+ "input_ids = []\n",
939
+ "attention_masks = []\n",
940
+ "\n",
941
+ "encoded_dict = tokenizer.encode_plus(\n",
942
+ " sent,\n",
943
+ " add_special_tokens = True,\n",
944
+ " max_length = 64,\n",
945
+ " pad_to_max_length = True,\n",
946
+ " return_attention_mask = True,\n",
947
+ " return_tensors = 'pt',\n",
948
+ " )\n",
949
+ "\n",
950
+ "\n",
951
+ "input_ids = encoded_dict['input_ids']\n",
952
+ "attention_masks = encoded_dict['attention_mask']\n",
953
+ "\n",
954
+ "\n",
955
+ "input_ids = torch.cat([input_ids], dim=0)\n",
956
+ "input_mask = torch.cat([attention_masks], dim=0)\n",
957
+ "labels = torch.tensor(labels)\n",
958
+ "\n",
959
+ "\n",
960
+ "\n",
961
+ "\n",
962
+ "print('Original: ', sent)\n",
963
+ "print('Token IDs:', input_ids)\n",
964
+ "print('Token IDs:', input_mask)"
965
+ ]
966
+ },
967
+ {
968
+ "cell_type": "code",
969
+ "execution_count": null,
970
+ "metadata": {},
971
+ "outputs": [],
972
+ "source": [
973
+ "outputs = model(input_ids, input_mask, labels[0])"
974
+ ]
975
+ },
976
+ {
977
+ "cell_type": "code",
978
+ "execution_count": null,
979
+ "metadata": {},
980
+ "outputs": [],
981
+ "source": [
982
+ "outputs[0]"
983
+ ]
984
+ }
985
+ ],
986
+ "metadata": {
987
+ "kernelspec": {
988
+ "display_name": "sbtenv",
989
+ "language": "python",
990
+ "name": "python3"
991
+ },
992
+ "language_info": {
993
+ "codemirror_mode": {
994
+ "name": "ipython",
995
+ "version": 3
996
+ },
997
+ "file_extension": ".py",
998
+ "mimetype": "text/x-python",
999
+ "name": "python",
1000
+ "nbconvert_exporter": "python",
1001
+ "pygments_lexer": "ipython3",
1002
+ "version": "3.9.0"
1003
+ },
1004
+ "orig_nbformat": 4
1005
+ },
1006
+ "nbformat": 4,
1007
+ "nbformat_minor": 2
1008
+ }
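
The training cell above ends in a CUDA out-of-memory error on a 4 GiB GPU. A minimal sketch of the usual mitigations, assuming the same `model`, `epochs`, `train_dataloader`, and `validation_dataloader` defined in earlier cells of this notebook; the allocator setting and `accumulate_grad_batches=4` are illustrative values, not taken from the commit:

import os

# Reduce allocator fragmentation, as the error message itself suggests.
# Must be set before the first CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import pytorch_lightning as pl

# Trade batch size for gradient accumulation: 4 accumulated micro-batches
# act like one 4x-larger batch while holding roughly 1/4 of the activations;
# 16-bit precision roughly halves activation memory on top of that.
trainer = pl.Trainer(
    max_epochs=epochs,              # assumed defined in an earlier cell
    limit_train_batches=0.1,
    devices=1,
    accelerator="gpu",
    precision=16,
    accumulate_grad_batches=4,
)
trainer.fit(model, train_dataloader, validation_dataloader)
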
onnx_model/config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "_name_or_path": "/DATA/sin-kaf/test_trainer/checkpoint-18500/config.json",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "attention_probs_dropout_prob": 0.1,
9
+ "dim": 768,
10
+ "dropout": 0.1,
11
+ "hidden_dim": 3072,
12
+ "initializer_range": 0.02,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "distilbert",
15
+ "n_heads": 12,
16
+ "n_layers": 6,
17
+ "output_past": true,
18
+ "pad_token_id": 0,
19
+ "problem_type": "single_label_classification",
20
+ "qa_dropout": 0.1,
21
+ "seq_classif_dropout": 0.2,
22
+ "sinusoidal_pos_embds": true,
23
+ "tie_weights_": true,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.34.1",
26
+ "vocab_size": 32000
27
+ }
onnx_model/model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cee6ddb2c1e1abb21e513881265239a57dd3cba52f621b6c81a78e41e66eae09
3
+ size 272496128
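
The LFS pointer above is the exported ONNX model. A plausible sketch of how such a file is produced from the checkpoint named in `onnx_model/config.json` (`_name_or_path`), using optimum; `export=True` is the flag in recent optimum releases (older ones used `from_transformers=True`), and the paths are the ones recorded in this commit:

from optimum.onnxruntime import ORTModelForSequenceClassification

# Export the fine-tuned DistilBERT checkpoint to ONNX.
checkpoint = "/DATA/sin-kaf/test_trainer/checkpoint-18500"
ort_model = ORTModelForSequenceClassification.from_pretrained(checkpoint, export=True)
ort_model.save_pretrained("onnx_model")   # writes model.onnx plus config.json
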
outlier_detection.ipynb ADDED
@@ -0,0 +1,2292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import re\n",
10
+ "import tqdm\n",
11
+ "import numpy as np\n",
12
+ "import pandas as pd\n",
13
+ "import matplotlib.pyplot as plt\n",
14
+ "import seaborn as sns\n",
15
+ "\n",
16
+ "from sklearn.datasets import fetch_20newsgroups\n",
17
+ "from sklearn.manifold import TSNE\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 2,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "embeding_df=pd.read_csv('/mnt/c/Users/selin_uzturk/Desktop/sinkaf/encoded.csv')\n",
27
+ "embeding_df=embeding_df.drop(['Unnamed: 0'], axis=1)\n",
28
+ "copy_df=pd.read_csv('/mnt/c/Users/selin_uzturk/Desktop/sinkaf/encoded.csv')\n",
29
+ "copy_df=copy_df.drop(['Unnamed: 0'], axis=1)\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "data": {
39
+ "text/html": [
40
+ "<div>\n",
41
+ "<style scoped>\n",
42
+ " .dataframe tbody tr th:only-of-type {\n",
43
+ " vertical-align: middle;\n",
44
+ " }\n",
45
+ "\n",
46
+ " .dataframe tbody tr th {\n",
47
+ " vertical-align: top;\n",
48
+ " }\n",
49
+ "\n",
50
+ " .dataframe thead th {\n",
51
+ " text-align: right;\n",
52
+ " }\n",
53
+ "</style>\n",
54
+ "<table border=\"1\" class=\"dataframe\">\n",
55
+ " <thead>\n",
56
+ " <tr style=\"text-align: right;\">\n",
57
+ " <th></th>\n",
58
+ " <th>0</th>\n",
59
+ " <th>1</th>\n",
60
+ " <th>2</th>\n",
61
+ " <th>3</th>\n",
62
+ " <th>4</th>\n",
63
+ " <th>5</th>\n",
64
+ " <th>6</th>\n",
65
+ " <th>7</th>\n",
66
+ " <th>8</th>\n",
67
+ " <th>9</th>\n",
68
+ " <th>...</th>\n",
69
+ " <th>56</th>\n",
70
+ " <th>57</th>\n",
71
+ " <th>58</th>\n",
72
+ " <th>59</th>\n",
73
+ " <th>60</th>\n",
74
+ " <th>61</th>\n",
75
+ " <th>62</th>\n",
76
+ " <th>63</th>\n",
77
+ " <th>labels</th>\n",
78
+ " <th>tweet</th>\n",
79
+ " </tr>\n",
80
+ " </thead>\n",
81
+ " <tbody>\n",
82
+ " <tr>\n",
83
+ " <th>0</th>\n",
84
+ " <td>101</td>\n",
85
+ " <td>10110</td>\n",
86
+ " <td>175</td>\n",
87
+ " <td>78653</td>\n",
88
+ " <td>189</td>\n",
89
+ " <td>25285</td>\n",
90
+ " <td>15976</td>\n",
91
+ " <td>40840</td>\n",
92
+ " <td>276</td>\n",
93
+ " <td>31623</td>\n",
94
+ " <td>...</td>\n",
95
+ " <td>0</td>\n",
96
+ " <td>0</td>\n",
97
+ " <td>0</td>\n",
98
+ " <td>0</td>\n",
99
+ " <td>0</td>\n",
100
+ " <td>0</td>\n",
101
+ " <td>0</td>\n",
102
+ " <td>0</td>\n",
103
+ " <td>0</td>\n",
104
+ " <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
105
+ " </tr>\n",
106
+ " <tr>\n",
107
+ " <th>1</th>\n",
108
+ " <td>101</td>\n",
109
+ " <td>11589</td>\n",
110
+ " <td>10706</td>\n",
111
+ " <td>10713</td>\n",
112
+ " <td>10794</td>\n",
113
+ " <td>94698</td>\n",
114
+ " <td>30668</td>\n",
115
+ " <td>24883</td>\n",
116
+ " <td>117</td>\n",
117
+ " <td>23763</td>\n",
118
+ " <td>...</td>\n",
119
+ " <td>0</td>\n",
120
+ " <td>0</td>\n",
121
+ " <td>0</td>\n",
122
+ " <td>0</td>\n",
123
+ " <td>0</td>\n",
124
+ " <td>0</td>\n",
125
+ " <td>0</td>\n",
126
+ " <td>0</td>\n",
127
+ " <td>0</td>\n",
128
+ " <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
129
+ " </tr>\n",
130
+ " <tr>\n",
131
+ " <th>2</th>\n",
132
+ " <td>101</td>\n",
133
+ " <td>148</td>\n",
134
+ " <td>30471</td>\n",
135
+ " <td>10774</td>\n",
136
+ " <td>13785</td>\n",
137
+ " <td>13779</td>\n",
138
+ " <td>33642</td>\n",
139
+ " <td>14399</td>\n",
140
+ " <td>48271</td>\n",
141
+ " <td>76686</td>\n",
142
+ " <td>...</td>\n",
143
+ " <td>0</td>\n",
144
+ " <td>0</td>\n",
145
+ " <td>0</td>\n",
146
+ " <td>0</td>\n",
147
+ " <td>0</td>\n",
148
+ " <td>0</td>\n",
149
+ " <td>0</td>\n",
150
+ " <td>0</td>\n",
151
+ " <td>0</td>\n",
152
+ " <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
153
+ " </tr>\n",
154
+ " <tr>\n",
155
+ " <th>3</th>\n",
156
+ " <td>101</td>\n",
157
+ " <td>19319</td>\n",
158
+ " <td>16724</td>\n",
159
+ " <td>10118</td>\n",
160
+ " <td>10107</td>\n",
161
+ " <td>78323</td>\n",
162
+ " <td>12407</td>\n",
163
+ " <td>38959</td>\n",
164
+ " <td>22934</td>\n",
165
+ " <td>10147</td>\n",
166
+ " <td>...</td>\n",
167
+ " <td>0</td>\n",
168
+ " <td>0</td>\n",
169
+ " <td>0</td>\n",
170
+ " <td>0</td>\n",
171
+ " <td>0</td>\n",
172
+ " <td>0</td>\n",
173
+ " <td>0</td>\n",
174
+ " <td>0</td>\n",
175
+ " <td>0</td>\n",
176
+ " <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
177
+ " </tr>\n",
178
+ " <tr>\n",
179
+ " <th>4</th>\n",
180
+ " <td>101</td>\n",
181
+ " <td>30932</td>\n",
182
+ " <td>58706</td>\n",
183
+ " <td>58054</td>\n",
184
+ " <td>44907</td>\n",
185
+ " <td>10224</td>\n",
186
+ " <td>106583</td>\n",
187
+ " <td>10288</td>\n",
188
+ " <td>12524</td>\n",
189
+ " <td>13878</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>0</td>\n",
192
+ " <td>0</td>\n",
193
+ " <td>0</td>\n",
194
+ " <td>0</td>\n",
195
+ " <td>0</td>\n",
196
+ " <td>0</td>\n",
197
+ " <td>0</td>\n",
198
+ " <td>0</td>\n",
199
+ " <td>0</td>\n",
200
+ " <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
201
+ " </tr>\n",
202
+ " <tr>\n",
203
+ " <th>...</th>\n",
204
+ " <td>...</td>\n",
205
+ " <td>...</td>\n",
206
+ " <td>...</td>\n",
207
+ " <td>...</td>\n",
208
+ " <td>...</td>\n",
209
+ " <td>...</td>\n",
210
+ " <td>...</td>\n",
211
+ " <td>...</td>\n",
212
+ " <td>...</td>\n",
213
+ " <td>...</td>\n",
214
+ " <td>...</td>\n",
215
+ " <td>...</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>...</td>\n",
218
+ " <td>...</td>\n",
219
+ " <td>...</td>\n",
220
+ " <td>...</td>\n",
221
+ " <td>...</td>\n",
222
+ " <td>...</td>\n",
223
+ " <td>...</td>\n",
224
+ " <td>...</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>43344</th>\n",
228
+ " <td>101</td>\n",
229
+ " <td>20065</td>\n",
230
+ " <td>10161</td>\n",
231
+ " <td>115</td>\n",
232
+ " <td>115</td>\n",
233
+ " <td>103784</td>\n",
234
+ " <td>10774</td>\n",
235
+ " <td>21388</td>\n",
236
+ " <td>10245</td>\n",
237
+ " <td>92067</td>\n",
238
+ " <td>...</td>\n",
239
+ " <td>0</td>\n",
240
+ " <td>0</td>\n",
241
+ " <td>0</td>\n",
242
+ " <td>0</td>\n",
243
+ " <td>0</td>\n",
244
+ " <td>0</td>\n",
245
+ " <td>0</td>\n",
246
+ " <td>0</td>\n",
247
+ " <td>1</td>\n",
248
+ " <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
249
+ " </tr>\n",
250
+ " <tr>\n",
251
+ " <th>43345</th>\n",
252
+ " <td>101</td>\n",
253
+ " <td>139</td>\n",
254
+ " <td>80839</td>\n",
255
+ " <td>24109</td>\n",
256
+ " <td>13406</td>\n",
257
+ " <td>18985</td>\n",
258
+ " <td>16285</td>\n",
259
+ " <td>10163</td>\n",
260
+ " <td>11062</td>\n",
261
+ " <td>276</td>\n",
262
+ " <td>...</td>\n",
263
+ " <td>0</td>\n",
264
+ " <td>0</td>\n",
265
+ " <td>0</td>\n",
266
+ " <td>0</td>\n",
267
+ " <td>0</td>\n",
268
+ " <td>0</td>\n",
269
+ " <td>0</td>\n",
270
+ " <td>0</td>\n",
271
+ " <td>1</td>\n",
272
+ " <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
273
+ " </tr>\n",
274
+ " <tr>\n",
275
+ " <th>43346</th>\n",
276
+ " <td>101</td>\n",
277
+ " <td>105549</td>\n",
278
+ " <td>102635</td>\n",
279
+ " <td>10140</td>\n",
280
+ " <td>26943</td>\n",
281
+ " <td>11499</td>\n",
282
+ " <td>110516</td>\n",
283
+ " <td>21899</td>\n",
284
+ " <td>11861</td>\n",
285
+ " <td>10561</td>\n",
286
+ " <td>...</td>\n",
287
+ " <td>0</td>\n",
288
+ " <td>0</td>\n",
289
+ " <td>0</td>\n",
290
+ " <td>0</td>\n",
291
+ " <td>0</td>\n",
292
+ " <td>0</td>\n",
293
+ " <td>0</td>\n",
294
+ " <td>0</td>\n",
295
+ " <td>1</td>\n",
296
+ " <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
297
+ " </tr>\n",
298
+ " <tr>\n",
299
+ " <th>43347</th>\n",
300
+ " <td>101</td>\n",
301
+ " <td>81424</td>\n",
302
+ " <td>26398</td>\n",
303
+ " <td>92017</td>\n",
304
+ " <td>109620</td>\n",
305
+ " <td>10941</td>\n",
306
+ " <td>76010</td>\n",
307
+ " <td>10115</td>\n",
308
+ " <td>19830</td>\n",
309
+ " <td>26083</td>\n",
310
+ " <td>...</td>\n",
311
+ " <td>0</td>\n",
312
+ " <td>0</td>\n",
313
+ " <td>0</td>\n",
314
+ " <td>0</td>\n",
315
+ " <td>0</td>\n",
316
+ " <td>0</td>\n",
317
+ " <td>0</td>\n",
318
+ " <td>0</td>\n",
319
+ " <td>1</td>\n",
320
+ " <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>43348</th>\n",
324
+ " <td>101</td>\n",
325
+ " <td>39774</td>\n",
326
+ " <td>11127</td>\n",
327
+ " <td>45989</td>\n",
328
+ " <td>24596</td>\n",
329
+ " <td>11933</td>\n",
330
+ " <td>170</td>\n",
331
+ " <td>17145</td>\n",
332
+ " <td>10710</td>\n",
333
+ " <td>39125</td>\n",
334
+ " <td>...</td>\n",
335
+ " <td>0</td>\n",
336
+ " <td>0</td>\n",
337
+ " <td>0</td>\n",
338
+ " <td>0</td>\n",
339
+ " <td>0</td>\n",
340
+ " <td>0</td>\n",
341
+ " <td>0</td>\n",
342
+ " <td>0</td>\n",
343
+ " <td>1</td>\n",
344
+ " <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
345
+ " </tr>\n",
346
+ " </tbody>\n",
347
+ "</table>\n",
348
+ "<p>43349 rows × 66 columns</p>\n",
349
+ "</div>"
350
+ ],
351
+ "text/plain": [
352
+ " 0 1 2 3 4 5 6 7 8 \n",
353
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
354
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
355
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
356
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
357
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
358
+ "... ... ... ... ... ... ... ... ... ... \n",
359
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
360
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
361
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
362
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
363
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
364
+ "\n",
365
+ " 9 ... 56 57 58 59 60 61 62 63 labels \n",
366
+ "0 31623 ... 0 0 0 0 0 0 0 0 0 \\\n",
367
+ "1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
368
+ "2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
369
+ "3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
370
+ "4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
371
+ "... ... ... .. .. .. .. .. .. .. .. ... \n",
372
+ "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
373
+ "43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
374
+ "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
375
+ "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
376
+ "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
377
+ "\n",
378
+ " tweet \n",
379
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
380
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
381
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
382
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
383
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
384
+ "... ... \n",
385
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
386
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
387
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
388
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
389
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
390
+ "\n",
391
+ "[43349 rows x 66 columns]"
392
+ ]
393
+ },
394
+ "execution_count": 3,
395
+ "metadata": {},
396
+ "output_type": "execute_result"
397
+ }
398
+ ],
399
+ "source": [
400
+ "copy_df"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 4,
406
+ "metadata": {},
407
+ "outputs": [
408
+ {
409
+ "data": {
410
+ "text/html": [
411
+ "<div>\n",
412
+ "<style scoped>\n",
413
+ " .dataframe tbody tr th:only-of-type {\n",
414
+ " vertical-align: middle;\n",
415
+ " }\n",
416
+ "\n",
417
+ " .dataframe tbody tr th {\n",
418
+ " vertical-align: top;\n",
419
+ " }\n",
420
+ "\n",
421
+ " .dataframe thead th {\n",
422
+ " text-align: right;\n",
423
+ " }\n",
424
+ "</style>\n",
425
+ "<table border=\"1\" class=\"dataframe\">\n",
426
+ " <thead>\n",
427
+ " <tr style=\"text-align: right;\">\n",
428
+ " <th></th>\n",
429
+ " <th>0</th>\n",
430
+ " <th>1</th>\n",
431
+ " <th>2</th>\n",
432
+ " <th>3</th>\n",
433
+ " <th>4</th>\n",
434
+ " <th>5</th>\n",
435
+ " <th>6</th>\n",
436
+ " <th>7</th>\n",
437
+ " <th>8</th>\n",
438
+ " <th>9</th>\n",
439
+ " <th>...</th>\n",
440
+ " <th>56</th>\n",
441
+ " <th>57</th>\n",
442
+ " <th>58</th>\n",
443
+ " <th>59</th>\n",
444
+ " <th>60</th>\n",
445
+ " <th>61</th>\n",
446
+ " <th>62</th>\n",
447
+ " <th>63</th>\n",
448
+ " <th>labels</th>\n",
449
+ " <th>tweet</th>\n",
450
+ " </tr>\n",
451
+ " </thead>\n",
452
+ " <tbody>\n",
453
+ " <tr>\n",
454
+ " <th>0</th>\n",
455
+ " <td>101</td>\n",
456
+ " <td>10110</td>\n",
457
+ " <td>175</td>\n",
458
+ " <td>78653</td>\n",
459
+ " <td>189</td>\n",
460
+ " <td>25285</td>\n",
461
+ " <td>15976</td>\n",
462
+ " <td>40840</td>\n",
463
+ " <td>276</td>\n",
464
+ " <td>31623</td>\n",
465
+ " <td>...</td>\n",
466
+ " <td>0</td>\n",
467
+ " <td>0</td>\n",
468
+ " <td>0</td>\n",
469
+ " <td>0</td>\n",
470
+ " <td>0</td>\n",
471
+ " <td>0</td>\n",
472
+ " <td>0</td>\n",
473
+ " <td>0</td>\n",
474
+ " <td>0</td>\n",
475
+ " <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
476
+ " </tr>\n",
477
+ " <tr>\n",
478
+ " <th>1</th>\n",
479
+ " <td>101</td>\n",
480
+ " <td>11589</td>\n",
481
+ " <td>10706</td>\n",
482
+ " <td>10713</td>\n",
483
+ " <td>10794</td>\n",
484
+ " <td>94698</td>\n",
485
+ " <td>30668</td>\n",
486
+ " <td>24883</td>\n",
487
+ " <td>117</td>\n",
488
+ " <td>23763</td>\n",
489
+ " <td>...</td>\n",
490
+ " <td>0</td>\n",
491
+ " <td>0</td>\n",
492
+ " <td>0</td>\n",
493
+ " <td>0</td>\n",
494
+ " <td>0</td>\n",
495
+ " <td>0</td>\n",
496
+ " <td>0</td>\n",
497
+ " <td>0</td>\n",
498
+ " <td>0</td>\n",
499
+ " <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
500
+ " </tr>\n",
501
+ " <tr>\n",
502
+ " <th>2</th>\n",
503
+ " <td>101</td>\n",
504
+ " <td>148</td>\n",
505
+ " <td>30471</td>\n",
506
+ " <td>10774</td>\n",
507
+ " <td>13785</td>\n",
508
+ " <td>13779</td>\n",
509
+ " <td>33642</td>\n",
510
+ " <td>14399</td>\n",
511
+ " <td>48271</td>\n",
512
+ " <td>76686</td>\n",
513
+ " <td>...</td>\n",
514
+ " <td>0</td>\n",
515
+ " <td>0</td>\n",
516
+ " <td>0</td>\n",
517
+ " <td>0</td>\n",
518
+ " <td>0</td>\n",
519
+ " <td>0</td>\n",
520
+ " <td>0</td>\n",
521
+ " <td>0</td>\n",
522
+ " <td>0</td>\n",
523
+ " <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
524
+ " </tr>\n",
525
+ " <tr>\n",
526
+ " <th>3</th>\n",
527
+ " <td>101</td>\n",
528
+ " <td>19319</td>\n",
529
+ " <td>16724</td>\n",
530
+ " <td>10118</td>\n",
531
+ " <td>10107</td>\n",
532
+ " <td>78323</td>\n",
533
+ " <td>12407</td>\n",
534
+ " <td>38959</td>\n",
535
+ " <td>22934</td>\n",
536
+ " <td>10147</td>\n",
537
+ " <td>...</td>\n",
538
+ " <td>0</td>\n",
539
+ " <td>0</td>\n",
540
+ " <td>0</td>\n",
541
+ " <td>0</td>\n",
542
+ " <td>0</td>\n",
543
+ " <td>0</td>\n",
544
+ " <td>0</td>\n",
545
+ " <td>0</td>\n",
546
+ " <td>0</td>\n",
547
+ " <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
548
+ " </tr>\n",
549
+ " <tr>\n",
550
+ " <th>4</th>\n",
551
+ " <td>101</td>\n",
552
+ " <td>30932</td>\n",
553
+ " <td>58706</td>\n",
554
+ " <td>58054</td>\n",
555
+ " <td>44907</td>\n",
556
+ " <td>10224</td>\n",
557
+ " <td>106583</td>\n",
558
+ " <td>10288</td>\n",
559
+ " <td>12524</td>\n",
560
+ " <td>13878</td>\n",
561
+ " <td>...</td>\n",
562
+ " <td>0</td>\n",
563
+ " <td>0</td>\n",
564
+ " <td>0</td>\n",
565
+ " <td>0</td>\n",
566
+ " <td>0</td>\n",
567
+ " <td>0</td>\n",
568
+ " <td>0</td>\n",
569
+ " <td>0</td>\n",
570
+ " <td>0</td>\n",
571
+ " <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
572
+ " </tr>\n",
573
+ " <tr>\n",
574
+ " <th>...</th>\n",
575
+ " <td>...</td>\n",
576
+ " <td>...</td>\n",
577
+ " <td>...</td>\n",
578
+ " <td>...</td>\n",
579
+ " <td>...</td>\n",
580
+ " <td>...</td>\n",
581
+ " <td>...</td>\n",
582
+ " <td>...</td>\n",
583
+ " <td>...</td>\n",
584
+ " <td>...</td>\n",
585
+ " <td>...</td>\n",
586
+ " <td>...</td>\n",
587
+ " <td>...</td>\n",
588
+ " <td>...</td>\n",
589
+ " <td>...</td>\n",
590
+ " <td>...</td>\n",
591
+ " <td>...</td>\n",
592
+ " <td>...</td>\n",
593
+ " <td>...</td>\n",
594
+ " <td>...</td>\n",
595
+ " <td>...</td>\n",
596
+ " </tr>\n",
597
+ " <tr>\n",
598
+ " <th>43344</th>\n",
599
+ " <td>101</td>\n",
600
+ " <td>20065</td>\n",
601
+ " <td>10161</td>\n",
602
+ " <td>115</td>\n",
603
+ " <td>115</td>\n",
604
+ " <td>103784</td>\n",
605
+ " <td>10774</td>\n",
606
+ " <td>21388</td>\n",
607
+ " <td>10245</td>\n",
608
+ " <td>92067</td>\n",
609
+ " <td>...</td>\n",
610
+ " <td>0</td>\n",
611
+ " <td>0</td>\n",
612
+ " <td>0</td>\n",
613
+ " <td>0</td>\n",
614
+ " <td>0</td>\n",
615
+ " <td>0</td>\n",
616
+ " <td>0</td>\n",
617
+ " <td>0</td>\n",
618
+ " <td>1</td>\n",
619
+ " <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
620
+ " </tr>\n",
621
+ " <tr>\n",
622
+ " <th>43345</th>\n",
623
+ " <td>101</td>\n",
624
+ " <td>139</td>\n",
625
+ " <td>80839</td>\n",
626
+ " <td>24109</td>\n",
627
+ " <td>13406</td>\n",
628
+ " <td>18985</td>\n",
629
+ " <td>16285</td>\n",
630
+ " <td>10163</td>\n",
631
+ " <td>11062</td>\n",
632
+ " <td>276</td>\n",
633
+ " <td>...</td>\n",
634
+ " <td>0</td>\n",
635
+ " <td>0</td>\n",
636
+ " <td>0</td>\n",
637
+ " <td>0</td>\n",
638
+ " <td>0</td>\n",
639
+ " <td>0</td>\n",
640
+ " <td>0</td>\n",
641
+ " <td>0</td>\n",
642
+ " <td>1</td>\n",
643
+ " <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
644
+ " </tr>\n",
645
+ " <tr>\n",
646
+ " <th>43346</th>\n",
647
+ " <td>101</td>\n",
648
+ " <td>105549</td>\n",
649
+ " <td>102635</td>\n",
650
+ " <td>10140</td>\n",
651
+ " <td>26943</td>\n",
652
+ " <td>11499</td>\n",
653
+ " <td>110516</td>\n",
654
+ " <td>21899</td>\n",
655
+ " <td>11861</td>\n",
656
+ " <td>10561</td>\n",
657
+ " <td>...</td>\n",
658
+ " <td>0</td>\n",
659
+ " <td>0</td>\n",
660
+ " <td>0</td>\n",
661
+ " <td>0</td>\n",
662
+ " <td>0</td>\n",
663
+ " <td>0</td>\n",
664
+ " <td>0</td>\n",
665
+ " <td>0</td>\n",
666
+ " <td>1</td>\n",
667
+ " <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
668
+ " </tr>\n",
669
+ " <tr>\n",
670
+ " <th>43347</th>\n",
671
+ " <td>101</td>\n",
672
+ " <td>81424</td>\n",
673
+ " <td>26398</td>\n",
674
+ " <td>92017</td>\n",
675
+ " <td>109620</td>\n",
676
+ " <td>10941</td>\n",
677
+ " <td>76010</td>\n",
678
+ " <td>10115</td>\n",
679
+ " <td>19830</td>\n",
680
+ " <td>26083</td>\n",
681
+ " <td>...</td>\n",
682
+ " <td>0</td>\n",
683
+ " <td>0</td>\n",
684
+ " <td>0</td>\n",
685
+ " <td>0</td>\n",
686
+ " <td>0</td>\n",
687
+ " <td>0</td>\n",
688
+ " <td>0</td>\n",
689
+ " <td>0</td>\n",
690
+ " <td>1</td>\n",
691
+ " <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
692
+ " </tr>\n",
693
+ " <tr>\n",
694
+ " <th>43348</th>\n",
695
+ " <td>101</td>\n",
696
+ " <td>39774</td>\n",
697
+ " <td>11127</td>\n",
698
+ " <td>45989</td>\n",
699
+ " <td>24596</td>\n",
700
+ " <td>11933</td>\n",
701
+ " <td>170</td>\n",
702
+ " <td>17145</td>\n",
703
+ " <td>10710</td>\n",
704
+ " <td>39125</td>\n",
705
+ " <td>...</td>\n",
706
+ " <td>0</td>\n",
707
+ " <td>0</td>\n",
708
+ " <td>0</td>\n",
709
+ " <td>0</td>\n",
710
+ " <td>0</td>\n",
711
+ " <td>0</td>\n",
712
+ " <td>0</td>\n",
713
+ " <td>0</td>\n",
714
+ " <td>1</td>\n",
715
+ " <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
716
+ " </tr>\n",
717
+ " </tbody>\n",
718
+ "</table>\n",
719
+ "<p>43349 rows × 66 columns</p>\n",
720
+ "</div>"
721
+ ],
722
+ "text/plain": [
723
+ " 0 1 2 3 4 5 6 7 8 \n",
724
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
725
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
726
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
727
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
728
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
729
+ "... ... ... ... ... ... ... ... ... ... \n",
730
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
731
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
732
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
733
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
734
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
735
+ "\n",
736
+ " 9 ... 56 57 58 59 60 61 62 63 labels \n",
737
+ "0 31623 ... 0 0 0 0 0 0 0 0 0 \\\n",
738
+ "1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
739
+ "2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
740
+ "3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
741
+ "4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
742
+ "... ... ... .. .. .. .. .. .. .. .. ... \n",
743
+ "43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
744
+ "43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
745
+ "43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
746
+ "43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
747
+ "43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
748
+ "\n",
749
+ " tweet \n",
750
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
751
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
752
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
753
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
754
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
755
+ "... ... \n",
756
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
757
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
758
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
759
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
760
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
761
+ "\n",
762
+ "[43349 rows x 66 columns]"
763
+ ]
764
+ },
765
+ "execution_count": 4,
766
+ "metadata": {},
767
+ "output_type": "execute_result"
768
+ }
769
+ ],
770
+ "source": [
771
+ "embeding_df"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 5,
777
+ "metadata": {},
778
+ "outputs": [],
779
+ "source": [
780
+ "data = embeding_df.tweet.values"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": 6,
786
+ "metadata": {},
787
+ "outputs": [],
788
+ "source": [
789
+ "embeding_df=embeding_df.drop(['tweet'], axis=1)\n",
790
+ "copy_df=copy_df.drop(['tweet'], axis=1)"
791
+ ]
792
+ },
793
+ {
794
+ "attachments": {},
795
+ "cell_type": "markdown",
796
+ "metadata": {},
797
+ "source": [
798
+ "# isolation forest"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": 7,
804
+ "metadata": {},
805
+ "outputs": [],
806
+ "source": [
807
+ "from sklearn.ensemble import IsolationForest"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "code",
812
+ "execution_count": 8,
813
+ "metadata": {},
814
+ "outputs": [
815
+ {
816
+ "name": "stderr",
817
+ "output_type": "stream",
818
+ "text": [
819
+ "/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names\n",
820
+ " warnings.warn(\n"
821
+ ]
822
+ }
823
+ ],
824
+ "source": [
825
+ "# Train the model\n",
826
+ "isf = IsolationForest(contamination=0.04)\n",
827
+ "isf.fit(embeding_df)\n",
828
+ "# Predictions\n",
829
+ "predictions = isf.predict(embeding_df)"
830
+ ]
831
+ },
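
The `contamination` argument above fixes what fraction of the training set gets flagged. A self-contained sketch on synthetic data showing how `contamination=0.04` ties `decision_function` to `predict` (both names are the real scikit-learn API; the data is made up):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 8))

isf = IsolationForest(contamination=0.04, random_state=0).fit(X)
scores = isf.decision_function(X)   # shifted anomaly scores; negative means outlier
preds = isf.predict(X)              # -1 = outlier, 1 = inlier

# contamination=0.04 sets the score offset so ~4% of training points fall
# below 0, which is exactly the set that predict() labels -1.
print((scores < 0).sum(), (preds == -1).sum())
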
832
+ {
833
+ "cell_type": "code",
834
+ "execution_count": 9,
835
+ "metadata": {},
836
+ "outputs": [
837
+ {
838
+ "data": {
839
+ "text/html": [
840
+ "<div>\n",
841
+ "<style scoped>\n",
842
+ " .dataframe tbody tr th:only-of-type {\n",
843
+ " vertical-align: middle;\n",
844
+ " }\n",
845
+ "\n",
846
+ " .dataframe tbody tr th {\n",
847
+ " vertical-align: top;\n",
848
+ " }\n",
849
+ "\n",
850
+ " .dataframe thead th {\n",
851
+ " text-align: right;\n",
852
+ " }\n",
853
+ "</style>\n",
854
+ "<table border=\"1\" class=\"dataframe\">\n",
855
+ " <thead>\n",
856
+ " <tr style=\"text-align: right;\">\n",
857
+ " <th></th>\n",
858
+ " <th>0</th>\n",
859
+ " <th>1</th>\n",
860
+ " <th>2</th>\n",
861
+ " <th>3</th>\n",
862
+ " <th>4</th>\n",
863
+ " <th>5</th>\n",
864
+ " <th>6</th>\n",
865
+ " <th>7</th>\n",
866
+ " <th>8</th>\n",
867
+ " <th>9</th>\n",
868
+ " <th>...</th>\n",
869
+ " <th>57</th>\n",
870
+ " <th>58</th>\n",
871
+ " <th>59</th>\n",
872
+ " <th>60</th>\n",
873
+ " <th>61</th>\n",
874
+ " <th>62</th>\n",
875
+ " <th>63</th>\n",
876
+ " <th>labels</th>\n",
877
+ " <th>iso_forest_scores</th>\n",
878
+ " <th>iso_forest_outliers</th>\n",
879
+ " </tr>\n",
880
+ " </thead>\n",
881
+ " <tbody>\n",
882
+ " <tr>\n",
883
+ " <th>count</th>\n",
884
+ " <td>43349.0</td>\n",
885
+ " <td>43349.000000</td>\n",
886
+ " <td>43349.000000</td>\n",
887
+ " <td>43349.000000</td>\n",
888
+ " <td>43349.000000</td>\n",
889
+ " <td>43349.000000</td>\n",
890
+ " <td>43349.000000</td>\n",
891
+ " <td>43349.000000</td>\n",
892
+ " <td>43349.000000</td>\n",
893
+ " <td>43349.000000</td>\n",
894
+ " <td>...</td>\n",
895
+ " <td>43349.000000</td>\n",
896
+ " <td>43349.000000</td>\n",
897
+ " <td>43349.00000</td>\n",
898
+ " <td>43349.000000</td>\n",
899
+ " <td>43349.000000</td>\n",
900
+ " <td>43349.000000</td>\n",
901
+ " <td>43349.000000</td>\n",
902
+ " <td>43349.000000</td>\n",
903
+ " <td>43349.000000</td>\n",
904
+ " <td>43349.000000</td>\n",
905
+ " </tr>\n",
906
+ " <tr>\n",
907
+ " <th>mean</th>\n",
908
+ " <td>101.0</td>\n",
909
+ " <td>27403.389559</td>\n",
910
+ " <td>29588.353803</td>\n",
911
+ " <td>26720.445131</td>\n",
912
+ " <td>27755.110106</td>\n",
913
+ " <td>27346.753628</td>\n",
914
+ " <td>27713.189255</td>\n",
915
+ " <td>27295.717687</td>\n",
916
+ " <td>27136.227410</td>\n",
917
+ " <td>26812.611156</td>\n",
918
+ " <td>...</td>\n",
919
+ " <td>4868.917184</td>\n",
920
+ " <td>4813.145309</td>\n",
921
+ " <td>4733.38919</td>\n",
922
+ " <td>4389.068375</td>\n",
923
+ " <td>4297.575723</td>\n",
924
+ " <td>4176.437080</td>\n",
925
+ " <td>17.000392</td>\n",
926
+ " <td>0.417957</td>\n",
927
+ " <td>0.135546</td>\n",
928
+ " <td>0.919998</td>\n",
929
+ " </tr>\n",
930
+ " <tr>\n",
931
+ " <th>std</th>\n",
932
+ " <td>0.0</td>\n",
933
+ " <td>27382.274693</td>\n",
934
+ " <td>27727.688965</td>\n",
935
+ " <td>26455.267691</td>\n",
936
+ " <td>27026.611068</td>\n",
937
+ " <td>26799.753823</td>\n",
938
+ " <td>27021.950023</td>\n",
939
+ " <td>26761.847936</td>\n",
940
+ " <td>26820.810219</td>\n",
941
+ " <td>26720.480625</td>\n",
942
+ " <td>...</td>\n",
943
+ " <td>15312.358275</td>\n",
944
+ " <td>15491.136511</td>\n",
945
+ " <td>15387.09038</td>\n",
946
+ " <td>14617.253040</td>\n",
947
+ " <td>14643.580886</td>\n",
948
+ " <td>14405.397208</td>\n",
949
+ " <td>38.013945</td>\n",
950
+ " <td>0.493229</td>\n",
951
+ " <td>0.066701</td>\n",
952
+ " <td>0.391927</td>\n",
953
+ " </tr>\n",
954
+ " <tr>\n",
955
+ " <th>min</th>\n",
956
+ " <td>101.0</td>\n",
957
+ " <td>100.000000</td>\n",
958
+ " <td>100.000000</td>\n",
959
+ " <td>0.000000</td>\n",
960
+ " <td>0.000000</td>\n",
961
+ " <td>0.000000</td>\n",
962
+ " <td>0.000000</td>\n",
963
+ " <td>0.000000</td>\n",
964
+ " <td>0.000000</td>\n",
965
+ " <td>0.000000</td>\n",
966
+ " <td>...</td>\n",
967
+ " <td>0.000000</td>\n",
968
+ " <td>0.000000</td>\n",
969
+ " <td>0.00000</td>\n",
970
+ " <td>0.000000</td>\n",
971
+ " <td>0.000000</td>\n",
972
+ " <td>0.000000</td>\n",
973
+ " <td>0.000000</td>\n",
974
+ " <td>0.000000</td>\n",
975
+ " <td>-0.140643</td>\n",
976
+ " <td>-1.000000</td>\n",
977
+ " </tr>\n",
978
+ " <tr>\n",
979
+ " <th>25%</th>\n",
980
+ " <td>101.0</td>\n",
981
+ " <td>10357.000000</td>\n",
982
+ " <td>10506.000000</td>\n",
983
+ " <td>10323.000000</td>\n",
984
+ " <td>10361.000000</td>\n",
985
+ " <td>10350.000000</td>\n",
986
+ " <td>10369.000000</td>\n",
987
+ " <td>10347.000000</td>\n",
988
+ " <td>10330.000000</td>\n",
989
+ " <td>10323.000000</td>\n",
990
+ " <td>...</td>\n",
991
+ " <td>0.000000</td>\n",
992
+ " <td>0.000000</td>\n",
993
+ " <td>0.00000</td>\n",
994
+ " <td>0.000000</td>\n",
995
+ " <td>0.000000</td>\n",
996
+ " <td>0.000000</td>\n",
997
+ " <td>0.000000</td>\n",
998
+ " <td>0.000000</td>\n",
999
+ " <td>0.089100</td>\n",
1000
+ " <td>1.000000</td>\n",
1001
+ " </tr>\n",
1002
+ " <tr>\n",
1003
+ " <th>50%</th>\n",
1004
+ " <td>101.0</td>\n",
1005
+ " <td>18856.000000</td>\n",
1006
+ " <td>16263.000000</td>\n",
1007
+ " <td>13587.000000</td>\n",
1008
+ " <td>14918.000000</td>\n",
1009
+ " <td>14753.000000</td>\n",
1010
+ " <td>15090.000000</td>\n",
1011
+ " <td>14777.000000</td>\n",
1012
+ " <td>14753.000000</td>\n",
1013
+ " <td>14110.000000</td>\n",
1014
+ " <td>...</td>\n",
1015
+ " <td>0.000000</td>\n",
1016
+ " <td>0.000000</td>\n",
1017
+ " <td>0.00000</td>\n",
1018
+ " <td>0.000000</td>\n",
1019
+ " <td>0.000000</td>\n",
1020
+ " <td>0.000000</td>\n",
1021
+ " <td>0.000000</td>\n",
1022
+ " <td>0.000000</td>\n",
1023
+ " <td>0.161505</td>\n",
1024
+ " <td>1.000000</td>\n",
1025
+ " </tr>\n",
1026
+ " <tr>\n",
1027
+ " <th>75%</th>\n",
1028
+ " <td>101.0</td>\n",
1029
+ " <td>41079.000000</td>\n",
1030
+ " <td>40762.000000</td>\n",
1031
+ " <td>35943.000000</td>\n",
1032
+ " <td>37820.000000</td>\n",
1033
+ " <td>36544.000000</td>\n",
1034
+ " <td>37820.000000</td>\n",
1035
+ " <td>36723.000000</td>\n",
1036
+ " <td>36544.000000</td>\n",
1037
+ " <td>36445.000000</td>\n",
1038
+ " <td>...</td>\n",
1039
+ " <td>0.000000</td>\n",
1040
+ " <td>0.000000</td>\n",
1041
+ " <td>0.00000</td>\n",
1042
+ " <td>0.000000</td>\n",
1043
+ " <td>0.000000</td>\n",
1044
+ " <td>0.000000</td>\n",
1045
+ " <td>0.000000</td>\n",
1046
+ " <td>1.000000</td>\n",
1047
+ " <td>0.189511</td>\n",
1048
+ " <td>1.000000</td>\n",
1049
+ " </tr>\n",
1050
+ " <tr>\n",
1051
+ " <th>max</th>\n",
1052
+ " <td>101.0</td>\n",
1053
+ " <td>110744.000000</td>\n",
1054
+ " <td>110966.000000</td>\n",
1055
+ " <td>110966.000000</td>\n",
1056
+ " <td>110966.000000</td>\n",
1057
+ " <td>111720.000000</td>\n",
1058
+ " <td>111720.000000</td>\n",
1059
+ " <td>111720.000000</td>\n",
1060
+ " <td>111720.000000</td>\n",
1061
+ " <td>111720.000000</td>\n",
1062
+ " <td>...</td>\n",
1063
+ " <td>110966.000000</td>\n",
1064
+ " <td>110966.000000</td>\n",
1065
+ " <td>110966.00000</td>\n",
1066
+ " <td>110966.000000</td>\n",
1067
+ " <td>110966.000000</td>\n",
1068
+ " <td>110966.000000</td>\n",
1069
+ " <td>102.000000</td>\n",
1070
+ " <td>1.000000</td>\n",
1071
+ " <td>0.216831</td>\n",
1072
+ " <td>1.000000</td>\n",
1073
+ " </tr>\n",
1074
+ " </tbody>\n",
1075
+ "</table>\n",
1076
+ "<p>8 rows × 67 columns</p>\n",
1077
+ "</div>"
1078
+ ],
1079
+ "text/plain": [
1080
+ " 0 1 2 3 4 \n",
1081
+ "count 43349.0 43349.000000 43349.000000 43349.000000 43349.000000 \\\n",
1082
+ "mean 101.0 27403.389559 29588.353803 26720.445131 27755.110106 \n",
1083
+ "std 0.0 27382.274693 27727.688965 26455.267691 27026.611068 \n",
1084
+ "min 101.0 100.000000 100.000000 0.000000 0.000000 \n",
1085
+ "25% 101.0 10357.000000 10506.000000 10323.000000 10361.000000 \n",
1086
+ "50% 101.0 18856.000000 16263.000000 13587.000000 14918.000000 \n",
1087
+ "75% 101.0 41079.000000 40762.000000 35943.000000 37820.000000 \n",
1088
+ "max 101.0 110744.000000 110966.000000 110966.000000 110966.000000 \n",
1089
+ "\n",
1090
+ " 5 6 7 8 \n",
1091
+ "count 43349.000000 43349.000000 43349.000000 43349.000000 \\\n",
1092
+ "mean 27346.753628 27713.189255 27295.717687 27136.227410 \n",
1093
+ "std 26799.753823 27021.950023 26761.847936 26820.810219 \n",
1094
+ "min 0.000000 0.000000 0.000000 0.000000 \n",
1095
+ "25% 10350.000000 10369.000000 10347.000000 10330.000000 \n",
1096
+ "50% 14753.000000 15090.000000 14777.000000 14753.000000 \n",
1097
+ "75% 36544.000000 37820.000000 36723.000000 36544.000000 \n",
1098
+ "max 111720.000000 111720.000000 111720.000000 111720.000000 \n",
1099
+ "\n",
1100
+ " 9 ... 57 58 59 \n",
1101
+ "count 43349.000000 ... 43349.000000 43349.000000 43349.00000 \\\n",
1102
+ "mean 26812.611156 ... 4868.917184 4813.145309 4733.38919 \n",
1103
+ "std 26720.480625 ... 15312.358275 15491.136511 15387.09038 \n",
1104
+ "min 0.000000 ... 0.000000 0.000000 0.00000 \n",
1105
+ "25% 10323.000000 ... 0.000000 0.000000 0.00000 \n",
1106
+ "50% 14110.000000 ... 0.000000 0.000000 0.00000 \n",
1107
+ "75% 36445.000000 ... 0.000000 0.000000 0.00000 \n",
1108
+ "max 111720.000000 ... 110966.000000 110966.000000 110966.00000 \n",
1109
+ "\n",
1110
+ " 60 61 62 63 \n",
1111
+ "count 43349.000000 43349.000000 43349.000000 43349.000000 \\\n",
1112
+ "mean 4389.068375 4297.575723 4176.437080 17.000392 \n",
1113
+ "std 14617.253040 14643.580886 14405.397208 38.013945 \n",
1114
+ "min 0.000000 0.000000 0.000000 0.000000 \n",
1115
+ "25% 0.000000 0.000000 0.000000 0.000000 \n",
1116
+ "50% 0.000000 0.000000 0.000000 0.000000 \n",
1117
+ "75% 0.000000 0.000000 0.000000 0.000000 \n",
1118
+ "max 110966.000000 110966.000000 110966.000000 102.000000 \n",
1119
+ "\n",
1120
+ " labels iso_forest_scores iso_forest_outliers \n",
1121
+ "count 43349.000000 43349.000000 43349.000000 \n",
1122
+ "mean 0.417957 0.135546 0.919998 \n",
1123
+ "std 0.493229 0.066701 0.391927 \n",
1124
+ "min 0.000000 -0.140643 -1.000000 \n",
1125
+ "25% 0.000000 0.089100 1.000000 \n",
1126
+ "50% 0.000000 0.161505 1.000000 \n",
1127
+ "75% 1.000000 0.189511 1.000000 \n",
1128
+ "max 1.000000 0.216831 1.000000 \n",
1129
+ "\n",
1130
+ "[8 rows x 67 columns]"
1131
+ ]
1132
+ },
1133
+ "execution_count": 9,
1134
+ "metadata": {},
1135
+ "output_type": "execute_result"
1136
+ }
1137
+ ],
1138
+ "source": [
1139
+ "# Extract scores\n",
1140
+ "embeding_df[\"iso_forest_scores\"] = isf.decision_function(embeding_df)\n",
1141
+ "# Extract predictions\n",
1142
+ "embeding_df[\"iso_forest_outliers\"] = predictions\n",
1143
+ "# Describe the dataframe\n",
1144
+ "embeding_df.describe()"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "cell_type": "code",
1149
+ "execution_count": 10,
1150
+ "metadata": {},
1151
+ "outputs": [
1152
+ {
1153
+ "data": {
1154
+ "text/html": [
1155
+ "<div>\n",
1156
+ "<style scoped>\n",
1157
+ " .dataframe tbody tr th:only-of-type {\n",
1158
+ " vertical-align: middle;\n",
1159
+ " }\n",
1160
+ "\n",
1161
+ " .dataframe tbody tr th {\n",
1162
+ " vertical-align: top;\n",
1163
+ " }\n",
1164
+ "\n",
1165
+ " .dataframe thead th {\n",
1166
+ " text-align: right;\n",
1167
+ " }\n",
1168
+ "</style>\n",
1169
+ "<table border=\"1\" class=\"dataframe\">\n",
1170
+ " <thead>\n",
1171
+ " <tr style=\"text-align: right;\">\n",
1172
+ " <th></th>\n",
1173
+ " <th>0</th>\n",
1174
+ " <th>1</th>\n",
1175
+ " <th>2</th>\n",
1176
+ " <th>3</th>\n",
1177
+ " <th>4</th>\n",
1178
+ " <th>5</th>\n",
1179
+ " <th>6</th>\n",
1180
+ " <th>7</th>\n",
1181
+ " <th>8</th>\n",
1182
+ " <th>9</th>\n",
1183
+ " <th>...</th>\n",
1184
+ " <th>57</th>\n",
1185
+ " <th>58</th>\n",
1186
+ " <th>59</th>\n",
1187
+ " <th>60</th>\n",
1188
+ " <th>61</th>\n",
1189
+ " <th>62</th>\n",
1190
+ " <th>63</th>\n",
1191
+ " <th>labels</th>\n",
1192
+ " <th>iso_forest_scores</th>\n",
1193
+ " <th>iso_forest_outliers</th>\n",
1194
+ " </tr>\n",
1195
+ " </thead>\n",
1196
+ " <tbody>\n",
1197
+ " <tr>\n",
1198
+ " <th>0</th>\n",
1199
+ " <td>101</td>\n",
1200
+ " <td>10110</td>\n",
1201
+ " <td>175</td>\n",
1202
+ " <td>78653</td>\n",
1203
+ " <td>189</td>\n",
1204
+ " <td>25285</td>\n",
1205
+ " <td>15976</td>\n",
1206
+ " <td>40840</td>\n",
1207
+ " <td>276</td>\n",
1208
+ " <td>31623</td>\n",
1209
+ " <td>...</td>\n",
1210
+ " <td>0</td>\n",
1211
+ " <td>0</td>\n",
1212
+ " <td>0</td>\n",
1213
+ " <td>0</td>\n",
1214
+ " <td>0</td>\n",
1215
+ " <td>0</td>\n",
1216
+ " <td>0</td>\n",
1217
+ " <td>0</td>\n",
1218
+ " <td>0.189202</td>\n",
1219
+ " <td>No</td>\n",
1220
+ " </tr>\n",
1221
+ " <tr>\n",
1222
+ " <th>1</th>\n",
1223
+ " <td>101</td>\n",
1224
+ " <td>11589</td>\n",
1225
+ " <td>10706</td>\n",
1226
+ " <td>10713</td>\n",
1227
+ " <td>10794</td>\n",
1228
+ " <td>94698</td>\n",
1229
+ " <td>30668</td>\n",
1230
+ " <td>24883</td>\n",
1231
+ " <td>117</td>\n",
1232
+ " <td>23763</td>\n",
1233
+ " <td>...</td>\n",
1234
+ " <td>0</td>\n",
1235
+ " <td>0</td>\n",
1236
+ " <td>0</td>\n",
1237
+ " <td>0</td>\n",
1238
+ " <td>0</td>\n",
1239
+ " <td>0</td>\n",
1240
+ " <td>0</td>\n",
1241
+ " <td>0</td>\n",
1242
+ " <td>0.181234</td>\n",
1243
+ " <td>No</td>\n",
1244
+ " </tr>\n",
1245
+ " <tr>\n",
1246
+ " <th>2</th>\n",
1247
+ " <td>101</td>\n",
1248
+ " <td>148</td>\n",
1249
+ " <td>30471</td>\n",
1250
+ " <td>10774</td>\n",
1251
+ " <td>13785</td>\n",
1252
+ " <td>13779</td>\n",
1253
+ " <td>33642</td>\n",
1254
+ " <td>14399</td>\n",
1255
+ " <td>48271</td>\n",
1256
+ " <td>76686</td>\n",
1257
+ " <td>...</td>\n",
1258
+ " <td>0</td>\n",
1259
+ " <td>0</td>\n",
1260
+ " <td>0</td>\n",
1261
+ " <td>0</td>\n",
1262
+ " <td>0</td>\n",
1263
+ " <td>0</td>\n",
1264
+ " <td>0</td>\n",
1265
+ " <td>0</td>\n",
1266
+ " <td>0.166332</td>\n",
1267
+ " <td>No</td>\n",
1268
+ " </tr>\n",
1269
+ " <tr>\n",
1270
+ " <th>3</th>\n",
1271
+ " <td>101</td>\n",
1272
+ " <td>19319</td>\n",
1273
+ " <td>16724</td>\n",
1274
+ " <td>10118</td>\n",
1275
+ " <td>10107</td>\n",
1276
+ " <td>78323</td>\n",
1277
+ " <td>12407</td>\n",
1278
+ " <td>38959</td>\n",
1279
+ " <td>22934</td>\n",
1280
+ " <td>10147</td>\n",
1281
+ " <td>...</td>\n",
1282
+ " <td>0</td>\n",
1283
+ " <td>0</td>\n",
1284
+ " <td>0</td>\n",
1285
+ " <td>0</td>\n",
1286
+ " <td>0</td>\n",
1287
+ " <td>0</td>\n",
1288
+ " <td>0</td>\n",
1289
+ " <td>0</td>\n",
1290
+ " <td>0.151816</td>\n",
1291
+ " <td>No</td>\n",
1292
+ " </tr>\n",
1293
+ " <tr>\n",
1294
+ " <th>4</th>\n",
1295
+ " <td>101</td>\n",
1296
+ " <td>30932</td>\n",
1297
+ " <td>58706</td>\n",
1298
+ " <td>58054</td>\n",
1299
+ " <td>44907</td>\n",
1300
+ " <td>10224</td>\n",
1301
+ " <td>106583</td>\n",
1302
+ " <td>10288</td>\n",
1303
+ " <td>12524</td>\n",
1304
+ " <td>13878</td>\n",
1305
+ " <td>...</td>\n",
1306
+ " <td>0</td>\n",
1307
+ " <td>0</td>\n",
1308
+ " <td>0</td>\n",
1309
+ " <td>0</td>\n",
1310
+ " <td>0</td>\n",
1311
+ " <td>0</td>\n",
1312
+ " <td>0</td>\n",
1313
+ " <td>0</td>\n",
1314
+ " <td>0.184008</td>\n",
1315
+ " <td>No</td>\n",
1316
+ " </tr>\n",
1317
+ " <tr>\n",
1318
+ " <th>...</th>\n",
1319
+ " <td>...</td>\n",
1320
+ " <td>...</td>\n",
1321
+ " <td>...</td>\n",
1322
+ " <td>...</td>\n",
1323
+ " <td>...</td>\n",
1324
+ " <td>...</td>\n",
1325
+ " <td>...</td>\n",
1326
+ " <td>...</td>\n",
1327
+ " <td>...</td>\n",
1328
+ " <td>...</td>\n",
1329
+ " <td>...</td>\n",
1330
+ " <td>...</td>\n",
1331
+ " <td>...</td>\n",
1332
+ " <td>...</td>\n",
1333
+ " <td>...</td>\n",
1334
+ " <td>...</td>\n",
1335
+ " <td>...</td>\n",
1336
+ " <td>...</td>\n",
1337
+ " <td>...</td>\n",
1338
+ " <td>...</td>\n",
1339
+ " <td>...</td>\n",
1340
+ " </tr>\n",
1341
+ " <tr>\n",
1342
+ " <th>43344</th>\n",
1343
+ " <td>101</td>\n",
1344
+ " <td>20065</td>\n",
1345
+ " <td>10161</td>\n",
1346
+ " <td>115</td>\n",
1347
+ " <td>115</td>\n",
1348
+ " <td>103784</td>\n",
1349
+ " <td>10774</td>\n",
1350
+ " <td>21388</td>\n",
1351
+ " <td>10245</td>\n",
1352
+ " <td>92067</td>\n",
1353
+ " <td>...</td>\n",
1354
+ " <td>0</td>\n",
1355
+ " <td>0</td>\n",
1356
+ " <td>0</td>\n",
1357
+ " <td>0</td>\n",
1358
+ " <td>0</td>\n",
1359
+ " <td>0</td>\n",
1360
+ " <td>0</td>\n",
1361
+ " <td>1</td>\n",
1362
+ " <td>0.079412</td>\n",
1363
+ " <td>No</td>\n",
1364
+ " </tr>\n",
1365
+ " <tr>\n",
1366
+ " <th>43345</th>\n",
1367
+ " <td>101</td>\n",
1368
+ " <td>139</td>\n",
1369
+ " <td>80839</td>\n",
1370
+ " <td>24109</td>\n",
1371
+ " <td>13406</td>\n",
1372
+ " <td>18985</td>\n",
1373
+ " <td>16285</td>\n",
1374
+ " <td>10163</td>\n",
1375
+ " <td>11062</td>\n",
1376
+ " <td>276</td>\n",
1377
+ " <td>...</td>\n",
1378
+ " <td>0</td>\n",
1379
+ " <td>0</td>\n",
1380
+ " <td>0</td>\n",
1381
+ " <td>0</td>\n",
1382
+ " <td>0</td>\n",
1383
+ " <td>0</td>\n",
1384
+ " <td>0</td>\n",
1385
+ " <td>1</td>\n",
1386
+ " <td>0.118245</td>\n",
1387
+ " <td>No</td>\n",
1388
+ " </tr>\n",
1389
+ " <tr>\n",
1390
+ " <th>43346</th>\n",
1391
+ " <td>101</td>\n",
1392
+ " <td>105549</td>\n",
1393
+ " <td>102635</td>\n",
1394
+ " <td>10140</td>\n",
1395
+ " <td>26943</td>\n",
1396
+ " <td>11499</td>\n",
1397
+ " <td>110516</td>\n",
1398
+ " <td>21899</td>\n",
1399
+ " <td>11861</td>\n",
1400
+ " <td>10561</td>\n",
1401
+ " <td>...</td>\n",
1402
+ " <td>0</td>\n",
1403
+ " <td>0</td>\n",
1404
+ " <td>0</td>\n",
1405
+ " <td>0</td>\n",
1406
+ " <td>0</td>\n",
1407
+ " <td>0</td>\n",
1408
+ " <td>0</td>\n",
1409
+ " <td>1</td>\n",
1410
+ " <td>0.138229</td>\n",
1411
+ " <td>No</td>\n",
1412
+ " </tr>\n",
1413
+ " <tr>\n",
1414
+ " <th>43347</th>\n",
1415
+ " <td>101</td>\n",
1416
+ " <td>81424</td>\n",
1417
+ " <td>26398</td>\n",
1418
+ " <td>92017</td>\n",
1419
+ " <td>109620</td>\n",
1420
+ " <td>10941</td>\n",
1421
+ " <td>76010</td>\n",
1422
+ " <td>10115</td>\n",
1423
+ " <td>19830</td>\n",
1424
+ " <td>26083</td>\n",
1425
+ " <td>...</td>\n",
1426
+ " <td>0</td>\n",
1427
+ " <td>0</td>\n",
1428
+ " <td>0</td>\n",
1429
+ " <td>0</td>\n",
1430
+ " <td>0</td>\n",
1431
+ " <td>0</td>\n",
1432
+ " <td>0</td>\n",
1433
+ " <td>1</td>\n",
1434
+ " <td>0.181065</td>\n",
1435
+ " <td>No</td>\n",
1436
+ " </tr>\n",
1437
+ " <tr>\n",
1438
+ " <th>43348</th>\n",
1439
+ " <td>101</td>\n",
1440
+ " <td>39774</td>\n",
1441
+ " <td>11127</td>\n",
1442
+ " <td>45989</td>\n",
1443
+ " <td>24596</td>\n",
1444
+ " <td>11933</td>\n",
1445
+ " <td>170</td>\n",
1446
+ " <td>17145</td>\n",
1447
+ " <td>10710</td>\n",
1448
+ " <td>39125</td>\n",
1449
+ " <td>...</td>\n",
1450
+ " <td>0</td>\n",
1451
+ " <td>0</td>\n",
1452
+ " <td>0</td>\n",
1453
+ " <td>0</td>\n",
1454
+ " <td>0</td>\n",
1455
+ " <td>0</td>\n",
1456
+ " <td>0</td>\n",
1457
+ " <td>1</td>\n",
1458
+ " <td>0.085161</td>\n",
1459
+ " <td>No</td>\n",
1460
+ " </tr>\n",
1461
+ " </tbody>\n",
1462
+ "</table>\n",
1463
+ "<p>43349 rows × 67 columns</p>\n",
1464
+ "</div>"
1465
+ ],
1466
+ "text/plain": [
1467
+ " 0 1 2 3 4 5 6 7 8 \n",
1468
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
1469
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
1470
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
1471
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
1472
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
1473
+ "... ... ... ... ... ... ... ... ... ... \n",
1474
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
1475
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
1476
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
1477
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
1478
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
1479
+ "\n",
1480
+ " 9 ... 57 58 59 60 61 62 63 labels iso_forest_scores \n",
1481
+ "0 31623 ... 0 0 0 0 0 0 0 0 0.189202 \\\n",
1482
+ "1 23763 ... 0 0 0 0 0 0 0 0 0.181234 \n",
1483
+ "2 76686 ... 0 0 0 0 0 0 0 0 0.166332 \n",
1484
+ "3 10147 ... 0 0 0 0 0 0 0 0 0.151816 \n",
1485
+ "4 13878 ... 0 0 0 0 0 0 0 0 0.184008 \n",
1486
+ "... ... ... .. .. .. .. .. .. .. ... ... \n",
1487
+ "43344 92067 ... 0 0 0 0 0 0 0 1 0.079412 \n",
1488
+ "43345 276 ... 0 0 0 0 0 0 0 1 0.118245 \n",
1489
+ "43346 10561 ... 0 0 0 0 0 0 0 1 0.138229 \n",
1490
+ "43347 26083 ... 0 0 0 0 0 0 0 1 0.181065 \n",
1491
+ "43348 39125 ... 0 0 0 0 0 0 0 1 0.085161 \n",
1492
+ "\n",
1493
+ " iso_forest_outliers \n",
1494
+ "0 No \n",
1495
+ "1 No \n",
1496
+ "2 No \n",
1497
+ "3 No \n",
1498
+ "4 No \n",
1499
+ "... ... \n",
1500
+ "43344 No \n",
1501
+ "43345 No \n",
1502
+ "43346 No \n",
1503
+ "43347 No \n",
1504
+ "43348 No \n",
1505
+ "\n",
1506
+ "[43349 rows x 67 columns]"
1507
+ ]
1508
+ },
1509
+ "execution_count": 10,
1510
+ "metadata": {},
1511
+ "output_type": "execute_result"
1512
+ }
1513
+ ],
1514
+ "source": [
1515
+ "# Replace \"-1\" with \"Yes\" and \"1\" with \"No\"\n",
1516
+ "embeding_df['iso_forest_outliers'] = embeding_df['iso_forest_outliers'].replace([-1, 1], [\"Yes\", \"No\"])\n",
1517
+ "# Print the first 5 firms\n",
1518
+ "embeding_df"
1519
+ ]
1520
+ },
1521
+ {
1522
+ "cell_type": "code",
1523
+ "execution_count": 11,
1524
+ "metadata": {},
1525
+ "outputs": [
1526
+ {
1527
+ "data": {
1528
+ "text/plain": [
1529
+ "iso_forest_outliers\n",
1530
+ "False 43349\n",
1531
+ "Name: count, dtype: int64"
1532
+ ]
1533
+ },
1534
+ "execution_count": 11,
1535
+ "metadata": {},
1536
+ "output_type": "execute_result"
1537
+ }
1538
+ ],
1539
+ "source": [
1540
+ "(embeding_df['iso_forest_outliers']=='YES').value_counts()"
1541
+ ]
1542
+ },
1543
+ {
1544
+ "attachments": {},
1545
+ "cell_type": "markdown",
1546
+ "metadata": {},
1547
+ "source": [
1548
+ "# lof"
1549
+ ]
1550
+ },
1551
+ {
1552
+ "cell_type": "code",
1553
+ "execution_count": 12,
1554
+ "metadata": {},
1555
+ "outputs": [],
1556
+ "source": [
1557
+ "from sklearn.neighbors import LocalOutlierFactor\n",
1558
+ "from numpy import quantile, where, random"
1559
+ ]
1560
+ },
1561
+ {
1562
+ "cell_type": "code",
1563
+ "execution_count": 13,
1564
+ "metadata": {},
1565
+ "outputs": [],
1566
+ "source": [
1567
+ "# Train the model\n",
1568
+ "clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)\n",
1569
+ "out=clf.fit_predict(copy_df)\n",
1570
+ "# Predictions\n",
1571
+ "lof = clf.negative_outlier_factor_\n",
1572
+ "embeding_df[\"lof_outliers\"] = lof\n",
1573
+ "embeding_df[\"outliers\"]= out"
1574
+ ]
1575
+ },
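
The `negative_outlier_factor_` used above is the training-set LOF score: values near -1 are inliers, markedly lower values are outliers, and `fit_predict` thresholds it via `contamination`. A small sketch on synthetic data (same scikit-learn API; the clusters are made up):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
# A dense cloud plus a few far-away points that should score as outliers.
X = np.vstack([rng.normal(size=(500, 4)), rng.normal(8.0, 0.5, size=(10, 4))])

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
labels = clf.fit_predict(X)               # -1 = outlier, 1 = inlier
scores = clf.negative_outlier_factor_     # ~ -1 for inliers, much lower for outliers

print(labels[-10:])                # the shifted points should be flagged
print(scores[-10:].round(2))
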
1576
+ {
1577
+ "cell_type": "code",
1578
+ "execution_count": 14,
1579
+ "metadata": {},
1580
+ "outputs": [],
1581
+ "source": [
1582
+ "embeding_df['outliers'] = embeding_df['outliers'].replace([-1, 1], [\"Yes\", \"No\"])"
1583
+ ]
1584
+ },
1585
+ {
1586
+ "cell_type": "code",
1587
+ "execution_count": 15,
1588
+ "metadata": {},
1589
+ "outputs": [],
1590
+ "source": [
1591
+ "embeding_df['tweet']=data"
1592
+ ]
1593
+ },
1594
+ {
1595
+ "cell_type": "code",
1596
+ "execution_count": 16,
1597
+ "metadata": {},
1598
+ "outputs": [],
1599
+ "source": [
1600
+ "x=embeding_df[embeding_df['iso_forest_outliers']=='Yes' ]"
1601
+ ]
1602
+ },
1603
+ {
1604
+ "cell_type": "code",
1605
+ "execution_count": 17,
1606
+ "metadata": {},
1607
+ "outputs": [],
1608
+ "source": [
1609
+ "embeding_df.drop(x.loc[x['outliers']=='Yes' ].index, inplace=True)"
1610
+ ]
1611
+ },
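
The two cells above remove a row only when both detectors agree (Isolation Forest "Yes" and LOF "Yes"). A toy sketch of the same intersect-then-drop idiom; the column names mirror the notebook, the four-row frame is made up:

import pandas as pd

df = pd.DataFrame({
    "iso_forest_outliers": ["Yes", "Yes", "No", "No"],
    "outliers":            ["Yes", "No",  "Yes", "No"],
})

# Select rows flagged by the first detector, then drop those the second
# detector also flags, i.e. remove only the consensus outliers.
x = df[df["iso_forest_outliers"] == "Yes"]
df.drop(x.loc[x["outliers"] == "Yes"].index, inplace=True)
print(df)   # only the doubly-flagged row is gone
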
1612
+ {
1613
+ "cell_type": "code",
1614
+ "execution_count": 18,
1615
+ "metadata": {},
1616
+ "outputs": [
1617
+ {
1618
+ "data": {
1619
+ "text/html": [
1620
+ "<div>\n",
1621
+ "<style scoped>\n",
1622
+ " .dataframe tbody tr th:only-of-type {\n",
1623
+ " vertical-align: middle;\n",
1624
+ " }\n",
1625
+ "\n",
1626
+ " .dataframe tbody tr th {\n",
1627
+ " vertical-align: top;\n",
1628
+ " }\n",
1629
+ "\n",
1630
+ " .dataframe thead th {\n",
1631
+ " text-align: right;\n",
1632
+ " }\n",
1633
+ "</style>\n",
1634
+ "<table border=\"1\" class=\"dataframe\">\n",
1635
+ " <thead>\n",
1636
+ " <tr style=\"text-align: right;\">\n",
1637
+ " <th></th>\n",
1638
+ " <th>0</th>\n",
1639
+ " <th>1</th>\n",
1640
+ " <th>2</th>\n",
1641
+ " <th>3</th>\n",
1642
+ " <th>4</th>\n",
1643
+ " <th>5</th>\n",
1644
+ " <th>6</th>\n",
1645
+ " <th>7</th>\n",
1646
+ " <th>8</th>\n",
1647
+ " <th>9</th>\n",
1648
+ " <th>...</th>\n",
1649
+ " <th>60</th>\n",
1650
+ " <th>61</th>\n",
1651
+ " <th>62</th>\n",
1652
+ " <th>63</th>\n",
1653
+ " <th>labels</th>\n",
1654
+ " <th>iso_forest_scores</th>\n",
1655
+ " <th>iso_forest_outliers</th>\n",
1656
+ " <th>lof_outliers</th>\n",
1657
+ " <th>outliers</th>\n",
1658
+ " <th>tweet</th>\n",
1659
+ " </tr>\n",
1660
+ " </thead>\n",
1661
+ " <tbody>\n",
1662
+ " <tr>\n",
1663
+ " <th>0</th>\n",
1664
+ " <td>101</td>\n",
1665
+ " <td>10110</td>\n",
1666
+ " <td>175</td>\n",
1667
+ " <td>78653</td>\n",
1668
+ " <td>189</td>\n",
1669
+ " <td>25285</td>\n",
1670
+ " <td>15976</td>\n",
1671
+ " <td>40840</td>\n",
1672
+ " <td>276</td>\n",
1673
+ " <td>31623</td>\n",
1674
+ " <td>...</td>\n",
1675
+ " <td>0</td>\n",
1676
+ " <td>0</td>\n",
1677
+ " <td>0</td>\n",
1678
+ " <td>0</td>\n",
1679
+ " <td>0</td>\n",
1680
+ " <td>0.189202</td>\n",
1681
+ " <td>No</td>\n",
1682
+ " <td>-1.209681</td>\n",
1683
+ " <td>No</td>\n",
1684
+ " <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
1685
+ " </tr>\n",
1686
+ " <tr>\n",
1687
+ " <th>1</th>\n",
1688
+ " <td>101</td>\n",
1689
+ " <td>11589</td>\n",
1690
+ " <td>10706</td>\n",
1691
+ " <td>10713</td>\n",
1692
+ " <td>10794</td>\n",
1693
+ " <td>94698</td>\n",
1694
+ " <td>30668</td>\n",
1695
+ " <td>24883</td>\n",
1696
+ " <td>117</td>\n",
1697
+ " <td>23763</td>\n",
1698
+ " <td>...</td>\n",
1699
+ " <td>0</td>\n",
1700
+ " <td>0</td>\n",
1701
+ " <td>0</td>\n",
1702
+ " <td>0</td>\n",
1703
+ " <td>0</td>\n",
1704
+ " <td>0.181234</td>\n",
1705
+ " <td>No</td>\n",
1706
+ " <td>-1.107479</td>\n",
1707
+ " <td>No</td>\n",
1708
+ " <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
1709
+ " </tr>\n",
1710
+ " <tr>\n",
1711
+ " <th>2</th>\n",
1712
+ " <td>101</td>\n",
1713
+ " <td>148</td>\n",
1714
+ " <td>30471</td>\n",
1715
+ " <td>10774</td>\n",
1716
+ " <td>13785</td>\n",
1717
+ " <td>13779</td>\n",
1718
+ " <td>33642</td>\n",
1719
+ " <td>14399</td>\n",
1720
+ " <td>48271</td>\n",
1721
+ " <td>76686</td>\n",
1722
+ " <td>...</td>\n",
1723
+ " <td>0</td>\n",
1724
+ " <td>0</td>\n",
1725
+ " <td>0</td>\n",
1726
+ " <td>0</td>\n",
1727
+ " <td>0</td>\n",
1728
+ " <td>0.166332</td>\n",
1729
+ " <td>No</td>\n",
1730
+ " <td>-1.202529</td>\n",
1731
+ " <td>No</td>\n",
1732
+ " <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
1733
+ " </tr>\n",
1734
+ " <tr>\n",
1735
+ " <th>3</th>\n",
1736
+ " <td>101</td>\n",
1737
+ " <td>19319</td>\n",
1738
+ " <td>16724</td>\n",
1739
+ " <td>10118</td>\n",
1740
+ " <td>10107</td>\n",
1741
+ " <td>78323</td>\n",
1742
+ " <td>12407</td>\n",
1743
+ " <td>38959</td>\n",
1744
+ " <td>22934</td>\n",
1745
+ " <td>10147</td>\n",
1746
+ " <td>...</td>\n",
1747
+ " <td>0</td>\n",
1748
+ " <td>0</td>\n",
1749
+ " <td>0</td>\n",
1750
+ " <td>0</td>\n",
1751
+ " <td>0</td>\n",
1752
+ " <td>0.151816</td>\n",
1753
+ " <td>No</td>\n",
1754
+ " <td>-1.216599</td>\n",
1755
+ " <td>No</td>\n",
1756
+ " <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
1757
+ " </tr>\n",
1758
+ " <tr>\n",
1759
+ " <th>4</th>\n",
1760
+ " <td>101</td>\n",
1761
+ " <td>30932</td>\n",
1762
+ " <td>58706</td>\n",
1763
+ " <td>58054</td>\n",
1764
+ " <td>44907</td>\n",
1765
+ " <td>10224</td>\n",
1766
+ " <td>106583</td>\n",
1767
+ " <td>10288</td>\n",
1768
+ " <td>12524</td>\n",
1769
+ " <td>13878</td>\n",
1770
+ " <td>...</td>\n",
1771
+ " <td>0</td>\n",
1772
+ " <td>0</td>\n",
1773
+ " <td>0</td>\n",
1774
+ " <td>0</td>\n",
1775
+ " <td>0</td>\n",
1776
+ " <td>0.184008</td>\n",
1777
+ " <td>No</td>\n",
1778
+ " <td>-1.188488</td>\n",
1779
+ " <td>No</td>\n",
1780
+ " <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
1781
+ " </tr>\n",
1782
+ " <tr>\n",
1783
+ " <th>...</th>\n",
1784
+ " <td>...</td>\n",
1785
+ " <td>...</td>\n",
1786
+ " <td>...</td>\n",
1787
+ " <td>...</td>\n",
1788
+ " <td>...</td>\n",
1789
+ " <td>...</td>\n",
1790
+ " <td>...</td>\n",
1791
+ " <td>...</td>\n",
1792
+ " <td>...</td>\n",
1793
+ " <td>...</td>\n",
1794
+ " <td>...</td>\n",
1795
+ " <td>...</td>\n",
1796
+ " <td>...</td>\n",
1797
+ " <td>...</td>\n",
1798
+ " <td>...</td>\n",
1799
+ " <td>...</td>\n",
1800
+ " <td>...</td>\n",
1801
+ " <td>...</td>\n",
1802
+ " <td>...</td>\n",
1803
+ " <td>...</td>\n",
1804
+ " <td>...</td>\n",
1805
+ " </tr>\n",
1806
+ " <tr>\n",
1807
+ " <th>43344</th>\n",
1808
+ " <td>101</td>\n",
1809
+ " <td>20065</td>\n",
1810
+ " <td>10161</td>\n",
1811
+ " <td>115</td>\n",
1812
+ " <td>115</td>\n",
1813
+ " <td>103784</td>\n",
1814
+ " <td>10774</td>\n",
1815
+ " <td>21388</td>\n",
1816
+ " <td>10245</td>\n",
1817
+ " <td>92067</td>\n",
1818
+ " <td>...</td>\n",
1819
+ " <td>0</td>\n",
1820
+ " <td>0</td>\n",
1821
+ " <td>0</td>\n",
1822
+ " <td>0</td>\n",
1823
+ " <td>1</td>\n",
1824
+ " <td>0.079412</td>\n",
1825
+ " <td>No</td>\n",
1826
+ " <td>-1.196769</td>\n",
1827
+ " <td>No</td>\n",
1828
+ " <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
1829
+ " </tr>\n",
1830
+ " <tr>\n",
1831
+ " <th>43345</th>\n",
1832
+ " <td>101</td>\n",
1833
+ " <td>139</td>\n",
1834
+ " <td>80839</td>\n",
1835
+ " <td>24109</td>\n",
1836
+ " <td>13406</td>\n",
1837
+ " <td>18985</td>\n",
1838
+ " <td>16285</td>\n",
1839
+ " <td>10163</td>\n",
1840
+ " <td>11062</td>\n",
1841
+ " <td>276</td>\n",
1842
+ " <td>...</td>\n",
1843
+ " <td>0</td>\n",
1844
+ " <td>0</td>\n",
1845
+ " <td>0</td>\n",
1846
+ " <td>0</td>\n",
1847
+ " <td>1</td>\n",
1848
+ " <td>0.118245</td>\n",
1849
+ " <td>No</td>\n",
1850
+ " <td>-1.108304</td>\n",
1851
+ " <td>No</td>\n",
1852
+ " <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
1853
+ " </tr>\n",
1854
+ " <tr>\n",
1855
+ " <th>43346</th>\n",
1856
+ " <td>101</td>\n",
1857
+ " <td>105549</td>\n",
1858
+ " <td>102635</td>\n",
1859
+ " <td>10140</td>\n",
1860
+ " <td>26943</td>\n",
1861
+ " <td>11499</td>\n",
1862
+ " <td>110516</td>\n",
1863
+ " <td>21899</td>\n",
1864
+ " <td>11861</td>\n",
1865
+ " <td>10561</td>\n",
1866
+ " <td>...</td>\n",
1867
+ " <td>0</td>\n",
1868
+ " <td>0</td>\n",
1869
+ " <td>0</td>\n",
1870
+ " <td>0</td>\n",
1871
+ " <td>1</td>\n",
1872
+ " <td>0.138229</td>\n",
1873
+ " <td>No</td>\n",
1874
+ " <td>-1.307328</td>\n",
1875
+ " <td>No</td>\n",
1876
+ " <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
1877
+ " </tr>\n",
1878
+ " <tr>\n",
1879
+ " <th>43347</th>\n",
1880
+ " <td>101</td>\n",
1881
+ " <td>81424</td>\n",
1882
+ " <td>26398</td>\n",
1883
+ " <td>92017</td>\n",
1884
+ " <td>109620</td>\n",
1885
+ " <td>10941</td>\n",
1886
+ " <td>76010</td>\n",
1887
+ " <td>10115</td>\n",
1888
+ " <td>19830</td>\n",
1889
+ " <td>26083</td>\n",
1890
+ " <td>...</td>\n",
1891
+ " <td>0</td>\n",
1892
+ " <td>0</td>\n",
1893
+ " <td>0</td>\n",
1894
+ " <td>0</td>\n",
1895
+ " <td>1</td>\n",
1896
+ " <td>0.181065</td>\n",
1897
+ " <td>No</td>\n",
1898
+ " <td>-1.127932</td>\n",
1899
+ " <td>No</td>\n",
1900
+ " <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
1901
+ " </tr>\n",
1902
+ " <tr>\n",
1903
+ " <th>43348</th>\n",
1904
+ " <td>101</td>\n",
1905
+ " <td>39774</td>\n",
1906
+ " <td>11127</td>\n",
1907
+ " <td>45989</td>\n",
1908
+ " <td>24596</td>\n",
1909
+ " <td>11933</td>\n",
1910
+ " <td>170</td>\n",
1911
+ " <td>17145</td>\n",
1912
+ " <td>10710</td>\n",
1913
+ " <td>39125</td>\n",
1914
+ " <td>...</td>\n",
1915
+ " <td>0</td>\n",
1916
+ " <td>0</td>\n",
1917
+ " <td>0</td>\n",
1918
+ " <td>0</td>\n",
1919
+ " <td>1</td>\n",
1920
+ " <td>0.085161</td>\n",
1921
+ " <td>No</td>\n",
1922
+ " <td>-1.286323</td>\n",
1923
+ " <td>No</td>\n",
1924
+ " <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
1925
+ " </tr>\n",
1926
+ " </tbody>\n",
1927
+ "</table>\n",
1928
+ "<p>43029 rows × 70 columns</p>\n",
1929
+ "</div>"
1930
+ ],
1931
+ "text/plain": [
1932
+ " 0 1 2 3 4 5 6 7 8 \n",
1933
+ "0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
1934
+ "1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
1935
+ "2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
1936
+ "3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
1937
+ "4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
1938
+ "... ... ... ... ... ... ... ... ... ... \n",
1939
+ "43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
1940
+ "43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
1941
+ "43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
1942
+ "43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
1943
+ "43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
1944
+ "\n",
1945
+ " 9 ... 60 61 62 63 labels iso_forest_scores \n",
1946
+ "0 31623 ... 0 0 0 0 0 0.189202 \\\n",
1947
+ "1 23763 ... 0 0 0 0 0 0.181234 \n",
1948
+ "2 76686 ... 0 0 0 0 0 0.166332 \n",
1949
+ "3 10147 ... 0 0 0 0 0 0.151816 \n",
1950
+ "4 13878 ... 0 0 0 0 0 0.184008 \n",
1951
+ "... ... ... .. .. .. .. ... ... \n",
1952
+ "43344 92067 ... 0 0 0 0 1 0.079412 \n",
1953
+ "43345 276 ... 0 0 0 0 1 0.118245 \n",
1954
+ "43346 10561 ... 0 0 0 0 1 0.138229 \n",
1955
+ "43347 26083 ... 0 0 0 0 1 0.181065 \n",
1956
+ "43348 39125 ... 0 0 0 0 1 0.085161 \n",
1957
+ "\n",
1958
+ " iso_forest_outliers lof_outliers outliers \n",
1959
+ "0 No -1.209681 No \\\n",
1960
+ "1 No -1.107479 No \n",
1961
+ "2 No -1.202529 No \n",
1962
+ "3 No -1.216599 No \n",
1963
+ "4 No -1.188488 No \n",
1964
+ "... ... ... ... \n",
1965
+ "43344 No -1.196769 No \n",
1966
+ "43345 No -1.108304 No \n",
1967
+ "43346 No -1.307328 No \n",
1968
+ "43347 No -1.127932 No \n",
1969
+ "43348 No -1.286323 No \n",
1970
+ "\n",
1971
+ " tweet \n",
1972
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
1973
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
1974
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
1975
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
1976
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
1977
+ "... ... \n",
1978
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
1979
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
1980
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
1981
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
1982
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
1983
+ "\n",
1984
+ "[43029 rows x 70 columns]"
1985
+ ]
1986
+ },
1987
+ "execution_count": 18,
1988
+ "metadata": {},
1989
+ "output_type": "execute_result"
1990
+ }
1991
+ ],
1992
+ "source": [
1993
+ "embeding_df"
1994
+ ]
1995
+ },
1996
+ {
1997
+ "cell_type": "code",
1998
+ "execution_count": 19,
1999
+ "metadata": {},
2000
+ "outputs": [],
2001
+ "source": [
2002
+ "# embeding_df.drop(embeding_df.loc[embeding_df['outliers']=='Yes' ].index, inplace=True)\n",
2003
+ "# embeding_df.drop(embeding_df.loc[embeding_df['iso_forest_outliers']=='Yes' ].index, inplace=True)"
2004
+ ]
2005
+ },
2006
+ {
2007
+ "cell_type": "code",
2008
+ "execution_count": 20,
2009
+ "metadata": {},
2010
+ "outputs": [],
2011
+ "source": [
2012
+ "# iso_df=embeding_df[embeding_df['iso_forest_outliers']=='Yes' ]\n",
2013
+ "# embeding_df.drop(embeding_df.loc[embeding_df['iso_forest_outliers']=='Yes' ].index, inplace=True)"
2014
+ ]
2015
+ },
2016
+ {
2017
+ "cell_type": "code",
2018
+ "execution_count": 21,
2019
+ "metadata": {},
2020
+ "outputs": [],
2021
+ "source": [
2022
+ "# lof_df=embeding_df[embeding_df['outliers']=='Yes' ]\n",
2023
+ "# embeding_df.drop(embeding_df.loc[embeding_df['outliers']=='Yes' ].index, inplace=True)"
2024
+ ]
2025
+ },
2026
+ {
2027
+ "cell_type": "code",
2028
+ "execution_count": 22,
2029
+ "metadata": {},
2030
+ "outputs": [],
2031
+ "source": [
2032
+ "# iso_df"
2033
+ ]
2034
+ },
2035
+ {
2036
+ "cell_type": "code",
2037
+ "execution_count": 23,
2038
+ "metadata": {},
2039
+ "outputs": [],
2040
+ "source": [
2041
+ "# iso_df['labels']=iso_df['labels'].replace({0: 1, 1: 0})"
2042
+ ]
2043
+ },
2044
+ {
2045
+ "cell_type": "code",
2046
+ "execution_count": 24,
2047
+ "metadata": {},
2048
+ "outputs": [],
2049
+ "source": [
2050
+ "# iso_df"
2051
+ ]
2052
+ },
2053
+ {
2054
+ "cell_type": "code",
2055
+ "execution_count": 25,
2056
+ "metadata": {},
2057
+ "outputs": [],
2058
+ "source": [
2059
+ "# lof_df['labels']=lof_df['labels'].replace({0: 1, 1: 0})"
2060
+ ]
2061
+ },
2062
+ {
2063
+ "cell_type": "code",
2064
+ "execution_count": 26,
2065
+ "metadata": {},
2066
+ "outputs": [],
2067
+ "source": [
2068
+ "# lof_df"
2069
+ ]
2070
+ },
2071
+ {
2072
+ "cell_type": "code",
2073
+ "execution_count": 27,
2074
+ "metadata": {},
2075
+ "outputs": [],
2076
+ "source": [
2077
+ "# x=pd.concat([lof_df,iso_df], axis=0)"
2078
+ ]
2079
+ },
2080
+ {
2081
+ "cell_type": "code",
2082
+ "execution_count": 28,
2083
+ "metadata": {},
2084
+ "outputs": [],
2085
+ "source": [
2086
+ "# embeding_df=pd.concat([x,embeding_df], axis=0)"
2087
+ ]
2088
+ },
2089
+ {
2090
+ "cell_type": "code",
2091
+ "execution_count": 29,
2092
+ "metadata": {},
2093
+ "outputs": [],
2094
+ "source": [
2095
+ "# embeding_df.reset_index()"
2096
+ ]
2097
+ },
2098
+ {
2099
+ "cell_type": "code",
2100
+ "execution_count": 30,
2101
+ "metadata": {},
2102
+ "outputs": [],
2103
+ "source": [
2104
+ "# embeding_df=embeding_df.drop(['iso_forest_scores', 'iso_forest_outliers','lof_outliers','outliers'], axis=1)"
2105
+ ]
2106
+ },
2107
+ {
2108
+ "cell_type": "code",
2109
+ "execution_count": 31,
2110
+ "metadata": {},
2111
+ "outputs": [],
2112
+ "source": [
2113
+ "embeding_df['0'] = embeding_df[embeding_df.columns[:-1]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)\n"
2114
+ ]
2115
+ },
2116
+ {
2117
+ "cell_type": "code",
2118
+ "execution_count": 32,
2119
+ "metadata": {},
2120
+ "outputs": [],
2121
+ "source": [
2122
+ "df=pd.DataFrame()\n",
2123
+ "df['tweet']=embeding_df['tweet']\n",
2124
+ "df['subtas_a']=embeding_df['labels']\n"
2125
+ ]
2126
+ },
2127
+ {
2128
+ "cell_type": "code",
2129
+ "execution_count": 33,
2130
+ "metadata": {},
2131
+ "outputs": [
2132
+ {
2133
+ "data": {
2134
+ "text/html": [
2135
+ "<div>\n",
2136
+ "<style scoped>\n",
2137
+ " .dataframe tbody tr th:only-of-type {\n",
2138
+ " vertical-align: middle;\n",
2139
+ " }\n",
2140
+ "\n",
2141
+ " .dataframe tbody tr th {\n",
2142
+ " vertical-align: top;\n",
2143
+ " }\n",
2144
+ "\n",
2145
+ " .dataframe thead th {\n",
2146
+ " text-align: right;\n",
2147
+ " }\n",
2148
+ "</style>\n",
2149
+ "<table border=\"1\" class=\"dataframe\">\n",
2150
+ " <thead>\n",
2151
+ " <tr style=\"text-align: right;\">\n",
2152
+ " <th></th>\n",
2153
+ " <th>tweet</th>\n",
2154
+ " <th>subtas_a</th>\n",
2155
+ " </tr>\n",
2156
+ " </thead>\n",
2157
+ " <tbody>\n",
2158
+ " <tr>\n",
2159
+ " <th>0</th>\n",
2160
+ " <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
2161
+ " <td>0</td>\n",
2162
+ " </tr>\n",
2163
+ " <tr>\n",
2164
+ " <th>1</th>\n",
2165
+ " <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
2166
+ " <td>0</td>\n",
2167
+ " </tr>\n",
2168
+ " <tr>\n",
2169
+ " <th>2</th>\n",
2170
+ " <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
2171
+ " <td>0</td>\n",
2172
+ " </tr>\n",
2173
+ " <tr>\n",
2174
+ " <th>3</th>\n",
2175
+ " <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
2176
+ " <td>0</td>\n",
2177
+ " </tr>\n",
2178
+ " <tr>\n",
2179
+ " <th>4</th>\n",
2180
+ " <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
2181
+ " <td>0</td>\n",
2182
+ " </tr>\n",
2183
+ " <tr>\n",
2184
+ " <th>...</th>\n",
2185
+ " <td>...</td>\n",
2186
+ " <td>...</td>\n",
2187
+ " </tr>\n",
2188
+ " <tr>\n",
2189
+ " <th>43344</th>\n",
2190
+ " <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
2191
+ " <td>1</td>\n",
2192
+ " </tr>\n",
2193
+ " <tr>\n",
2194
+ " <th>43345</th>\n",
2195
+ " <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
2196
+ " <td>1</td>\n",
2197
+ " </tr>\n",
2198
+ " <tr>\n",
2199
+ " <th>43346</th>\n",
2200
+ " <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
2201
+ " <td>1</td>\n",
2202
+ " </tr>\n",
2203
+ " <tr>\n",
2204
+ " <th>43347</th>\n",
2205
+ " <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
2206
+ " <td>1</td>\n",
2207
+ " </tr>\n",
2208
+ " <tr>\n",
2209
+ " <th>43348</th>\n",
2210
+ " <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
2211
+ " <td>1</td>\n",
2212
+ " </tr>\n",
2213
+ " </tbody>\n",
2214
+ "</table>\n",
2215
+ "<p>43029 rows × 2 columns</p>\n",
2216
+ "</div>"
2217
+ ],
2218
+ "text/plain": [
2219
+ " tweet subtas_a\n",
2220
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... 0\n",
2221
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... 0\n",
2222
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... 0\n",
2223
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... 0\n",
2224
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı 0\n",
2225
+ "... ... ...\n",
2226
+ "43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... 1\n",
2227
+ "43345 Böyle piçlerin çok erken ölmemelerini ve çok f... 1\n",
2228
+ "43346 Turgay denilen bu holigonda bir sorun yok, gur... 1\n",
2229
+ "43347 Umarım ülkenin düşük zekadan kurtulması ilgile... 1\n",
2230
+ "43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... 1\n",
2231
+ "\n",
2232
+ "[43029 rows x 2 columns]"
2233
+ ]
2234
+ },
2235
+ "execution_count": 33,
2236
+ "metadata": {},
2237
+ "output_type": "execute_result"
2238
+ }
2239
+ ],
2240
+ "source": [
2241
+ "df"
2242
+ ]
2243
+ },
2244
+ {
2245
+ "cell_type": "code",
2246
+ "execution_count": 40,
2247
+ "metadata": {},
2248
+ "outputs": [],
2249
+ "source": [
2250
+ "df.to_csv('inverse_outliers.csv') "
2251
+ ]
2252
+ },
2253
+ {
2254
+ "cell_type": "code",
2255
+ "execution_count": 34,
2256
+ "metadata": {},
2257
+ "outputs": [],
2258
+ "source": [
2259
+ "df.to_csv('int_2_outliers.csv') "
2260
+ ]
2261
+ },
2262
+ {
2263
+ "cell_type": "code",
2264
+ "execution_count": null,
2265
+ "metadata": {},
2266
+ "outputs": [],
2267
+ "source": []
2268
+ }
2269
+ ],
2270
+ "metadata": {
2271
+ "kernelspec": {
2272
+ "display_name": "dl_env",
2273
+ "language": "python",
2274
+ "name": "python3"
2275
+ },
2276
+ "language_info": {
2277
+ "codemirror_mode": {
2278
+ "name": "ipython",
2279
+ "version": 3
2280
+ },
2281
+ "file_extension": ".py",
2282
+ "mimetype": "text/x-python",
2283
+ "name": "python",
2284
+ "nbconvert_exporter": "python",
2285
+ "pygments_lexer": "ipython3",
2286
+ "version": "3.9.0"
2287
+ },
2288
+ "orig_nbformat": 4
2289
+ },
2290
+ "nbformat": 4,
2291
+ "nbformat_minor": 2
2292
+ }
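
Note on the outlier columns above: the code that populates iso_forest_scores, iso_forest_outliers, lof_outliers and outliers is not part of this diff. The value ranges visible in the table (positive scores around 0.1–0.2 flagged "No", local-outlier factors near -1.2) match what scikit-learn's IsolationForest and LocalOutlierFactor report for inliers, so a hedged reconstruction might look like the sketch below; every name and parameter in it is an assumption, not the project's actual code.

# Hypothetical reconstruction of how the outlier columns in embeding_df are
# typically produced with scikit-learn; X stands in for the 64 embedding
# columns, and all parameters here are assumptions.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

X = np.random.RandomState(0).normal(size=(1000, 64))  # placeholder embeddings
frame = pd.DataFrame(X)

iso = IsolationForest(random_state=0).fit(X)
frame["iso_forest_scores"] = iso.decision_function(X)               # > 0 leans inlier
frame["iso_forest_outliers"] = np.where(iso.predict(X) == -1, "Yes", "No")

lof = LocalOutlierFactor(n_neighbors=20)
lof_pred = lof.fit_predict(X)                                       # -1 marks outliers
frame["lof_outliers"] = lof.negative_outlier_factor_                # near -1 leans inlier
frame["outliers"] = np.where(lof_pred == -1, "Yes", "No")

Under that reading, the commented-out cells that follow (flipping the labels of flagged rows and concatenating them back) treat detected outliers as likely mislabeled examples rather than simply dropping them.
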
pycaret_outlier_detection.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
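
pycaret_outlier_detection.ipynb itself is not rendered above, so its contents cannot be confirmed here. For orientation only, a PyCaret anomaly-detection pass over a numeric feature table usually follows the shape below; the input file name and detector choice are placeholders, not the notebook's actual code.

# Speculative sketch only -- the real pycaret_outlier_detection.ipynb is not
# rendered in this diff, so the CSV path and detector below are assumptions.
import pandas as pd
from pycaret.anomaly import setup, create_model, assign_model

features = pd.read_csv("embeddings.csv")      # hypothetical numeric feature table
s = setup(data=features, session_id=123)
iforest = create_model("iforest")             # isolation-forest detector
labeled = assign_model(iforest)               # appends Anomaly (0/1) and Anomaly_Score
print(labeled["Anomaly"].value_counts())
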
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ pytorch_lightning
2
+ emoji
3
+ transformers
4
+ numpy
5
+ pandas
6
+ # os (Python standard-library module; not a pip-installable package)
7
+ # random (Python standard-library module; not a pip-installable package)
8
+ torch
9
+ torchmetrics  # "torch-metrics" was presumably a typo for the Lightning metrics package
10
+ torch-utils  # appears unused in the notebooks
11
+ datasets  # imported in trainer.ipynb
12
+ evaluate  # imported in trainer.ipynb
13
+ scikit-learn  # LabelEncoder in trainer.ipynb
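
A throwaway way to check that the installable entries above resolve in the active environment (this snippet is illustrative and not part of the repo; the distribution names are assumed to match the corrected list):

# Sanity check: print the installed version of each required distribution,
# or flag it as missing. Uses only the standard library (Python >= 3.8).
from importlib.metadata import version, PackageNotFoundError

for dist in ["pytorch-lightning", "emoji", "transformers", "numpy",
             "pandas", "torch", "torchmetrics", "datasets", "evaluate",
             "scikit-learn"]:
    try:
        print(f"{dist}=={version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
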
trainer.ipynb ADDED
@@ -0,0 +1,1165 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import emoji\n",
10
+ "import numpy as np\n",
11
+ "import pandas as pd\n",
12
+ "from sklearn.preprocessing import LabelEncoder\n",
13
+ "from transformers import AutoTokenizer"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "# train_df=pd.read_csv('/DATA/sin-kaf/offenseval-tr-training-v1.tsv',sep='\\t')\n",
23
+ "# test_df=pd.read_csv('/DATA/sin-kaf/offenseval-tr-testset-v1.tsv',sep='\\t')\n",
24
+ "# augmented_df=pd.read_csv('augmented_data_offensive.csv')\n",
25
+ "# selin_df=pd.read_csv('/DATA/sin-kaf/selin_data.csv')"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 3,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "outliers_df=pd.read_csv('/DATA/sin-kaf/cluster_outliers.csv')\n",
35
+ "outliers_df=outliers_df.drop(['Unnamed: 0'], axis=1)\n",
36
+ "outliers_df['subtask_a'] = outliers_df['subtas_a']\n",
37
+ "outliers_df=outliers_df.drop(['subtas_a'], axis=1)\n"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 4,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "train_df=outliers_df"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 5,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# augmented_df=augmented_df.drop(['Unnamed: 0'], axis=1)\n",
56
+ "# augmented_df = augmented_df.dropna()\n",
57
+ "# train_df=pd.concat([train_df,augmented_df], axis=0)\n",
58
+ "# train_df=pd.concat([train_df,test_df], axis=0)\n",
59
+ "# train_df=train_df.drop(['id'], axis=1)\n",
60
+ "data=train_df['tweet'].tolist()\n",
61
+ "for i in range(len(data)):\n",
62
+ " data[i] = data[i].replace('@USER','')\n",
63
+ " data[i] = data[i].replace('#','')\n",
64
+ " data[i] = data[i].replace('$','')\n",
65
+ " data[i] = emoji.demojize(data[i])\n",
66
+ " \n",
67
+ "train_df['tweet'] = data\n",
68
+ "lab = LabelEncoder()\n",
69
+ "train_df['subtask_a'] = lab.fit_transform(train_df['subtask_a'])\n",
70
+ "df = train_df[train_df.subtask_a != 2]"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 6,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "data": {
80
+ "text/html": [
81
+ "<div>\n",
82
+ "<style scoped>\n",
83
+ " .dataframe tbody tr th:only-of-type {\n",
84
+ " vertical-align: middle;\n",
85
+ " }\n",
86
+ "\n",
87
+ " .dataframe tbody tr th {\n",
88
+ " vertical-align: top;\n",
89
+ " }\n",
90
+ "\n",
91
+ " .dataframe thead th {\n",
92
+ " text-align: right;\n",
93
+ " }\n",
94
+ "</style>\n",
95
+ "<table border=\"1\" class=\"dataframe\">\n",
96
+ " <thead>\n",
97
+ " <tr style=\"text-align: right;\">\n",
98
+ " <th></th>\n",
99
+ " <th>tweet</th>\n",
100
+ " <th>subtask_a</th>\n",
101
+ " </tr>\n",
102
+ " </thead>\n",
103
+ " <tbody>\n",
104
+ " <tr>\n",
105
+ " <th>0</th>\n",
106
+ " <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
107
+ " <td>0</td>\n",
108
+ " </tr>\n",
109
+ " <tr>\n",
110
+ " <th>1</th>\n",
111
+ " <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
112
+ " <td>0</td>\n",
113
+ " </tr>\n",
114
+ " <tr>\n",
115
+ " <th>2</th>\n",
116
+ " <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
117
+ " <td>0</td>\n",
118
+ " </tr>\n",
119
+ " <tr>\n",
120
+ " <th>3</th>\n",
121
+ " <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
122
+ " <td>0</td>\n",
123
+ " </tr>\n",
124
+ " <tr>\n",
125
+ " <th>4</th>\n",
126
+ " <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
127
+ " <td>0</td>\n",
128
+ " </tr>\n",
129
+ " <tr>\n",
130
+ " <th>...</th>\n",
131
+ " <td>...</td>\n",
132
+ " <td>...</td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>41177</th>\n",
136
+ " <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
137
+ " <td>1</td>\n",
138
+ " </tr>\n",
139
+ " <tr>\n",
140
+ " <th>41178</th>\n",
141
+ " <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
142
+ " <td>1</td>\n",
143
+ " </tr>\n",
144
+ " <tr>\n",
145
+ " <th>41179</th>\n",
146
+ " <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
147
+ " <td>1</td>\n",
148
+ " </tr>\n",
149
+ " <tr>\n",
150
+ " <th>41180</th>\n",
151
+ " <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
152
+ " <td>1</td>\n",
153
+ " </tr>\n",
154
+ " <tr>\n",
155
+ " <th>41181</th>\n",
156
+ " <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
157
+ " <td>1</td>\n",
158
+ " </tr>\n",
159
+ " </tbody>\n",
160
+ "</table>\n",
161
+ "<p>41182 rows × 2 columns</p>\n",
162
+ "</div>"
163
+ ],
164
+ "text/plain": [
165
+ " tweet subtask_a\n",
166
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... 0\n",
167
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... 0\n",
168
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... 0\n",
169
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... 0\n",
170
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı 0\n",
171
+ "... ... ...\n",
172
+ "41177 Hil**adamlar kesinlikle kelimeleri anlamıyorla... 1\n",
173
+ "41178 Böyle piçlerin çok erken ölmemelerini ve çok f... 1\n",
174
+ "41179 Turgay denilen bu holigonda bir sorun yok, gur... 1\n",
175
+ "41180 Umarım ülkenin düşük zekadan kurtulması ilgile... 1\n",
176
+ "41181 CHP sandıkları bırakmaz, üzerine oturur, bir c... 1\n",
177
+ "\n",
178
+ "[41182 rows x 2 columns]"
179
+ ]
180
+ },
181
+ "execution_count": 6,
182
+ "metadata": {},
183
+ "output_type": "execute_result"
184
+ }
185
+ ],
186
+ "source": [
187
+ "train_df"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 7,
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "# train_df=pd.concat([train_df,selin_df], axis=0)"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 8,
202
+ "metadata": {},
203
+ "outputs": [
204
+ {
205
+ "data": {
206
+ "text/html": [
207
+ "<div>\n",
208
+ "<style scoped>\n",
209
+ " .dataframe tbody tr th:only-of-type {\n",
210
+ " vertical-align: middle;\n",
211
+ " }\n",
212
+ "\n",
213
+ " .dataframe tbody tr th {\n",
214
+ " vertical-align: top;\n",
215
+ " }\n",
216
+ "\n",
217
+ " .dataframe thead th {\n",
218
+ " text-align: right;\n",
219
+ " }\n",
220
+ "</style>\n",
221
+ "<table border=\"1\" class=\"dataframe\">\n",
222
+ " <thead>\n",
223
+ " <tr style=\"text-align: right;\">\n",
224
+ " <th></th>\n",
225
+ " <th>tweet</th>\n",
226
+ " <th>subtask_a</th>\n",
227
+ " </tr>\n",
228
+ " </thead>\n",
229
+ " <tbody>\n",
230
+ " <tr>\n",
231
+ " <th>0</th>\n",
232
+ " <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
233
+ " <td>0</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>1</th>\n",
237
+ " <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
238
+ " <td>0</td>\n",
239
+ " </tr>\n",
240
+ " <tr>\n",
241
+ " <th>2</th>\n",
242
+ " <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
243
+ " <td>0</td>\n",
244
+ " </tr>\n",
245
+ " <tr>\n",
246
+ " <th>3</th>\n",
247
+ " <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
248
+ " <td>0</td>\n",
249
+ " </tr>\n",
250
+ " <tr>\n",
251
+ " <th>4</th>\n",
252
+ " <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
253
+ " <td>0</td>\n",
254
+ " </tr>\n",
255
+ " <tr>\n",
256
+ " <th>...</th>\n",
257
+ " <td>...</td>\n",
258
+ " <td>...</td>\n",
259
+ " </tr>\n",
260
+ " <tr>\n",
261
+ " <th>41177</th>\n",
262
+ " <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
263
+ " <td>1</td>\n",
264
+ " </tr>\n",
265
+ " <tr>\n",
266
+ " <th>41178</th>\n",
267
+ " <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
268
+ " <td>1</td>\n",
269
+ " </tr>\n",
270
+ " <tr>\n",
271
+ " <th>41179</th>\n",
272
+ " <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
273
+ " <td>1</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>41180</th>\n",
277
+ " <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
278
+ " <td>1</td>\n",
279
+ " </tr>\n",
280
+ " <tr>\n",
281
+ " <th>41181</th>\n",
282
+ " <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
283
+ " <td>1</td>\n",
284
+ " </tr>\n",
285
+ " </tbody>\n",
286
+ "</table>\n",
287
+ "<p>41182 rows × 2 columns</p>\n",
288
+ "</div>"
289
+ ],
290
+ "text/plain": [
291
+ " tweet subtask_a\n",
292
+ "0 en güzel uyuyan insan ödülü jeon jungkook'a g... 0\n",
293
+ "1 Mekanı cennet olsun, saygılar sayın avukatımı... 0\n",
294
+ "2 Kızlar aranızda kas yığını beylere düşenler ol... 0\n",
295
+ "3 Biraz ders çalışayım. Tembellik ve uyku düşman... 0\n",
296
+ "4 Trezeguet yerine El Sharawy daha iyi olmaz mı 0\n",
297
+ "... ... ...\n",
298
+ "41177 Hil**adamlar kesinlikle kelimeleri anlamıyorla... 1\n",
299
+ "41178 Böyle piçlerin çok erken ölmemelerini ve çok f... 1\n",
300
+ "41179 Turgay denilen bu holigonda bir sorun yok, gur... 1\n",
301
+ "41180 Umarım ülkenin düşük zekadan kurtulması ilgile... 1\n",
302
+ "41181 CHP sandıkları bırakmaz, üzerine oturur, bir c... 1\n",
303
+ "\n",
304
+ "[41182 rows x 2 columns]"
305
+ ]
306
+ },
307
+ "execution_count": 8,
308
+ "metadata": {},
309
+ "output_type": "execute_result"
310
+ }
311
+ ],
312
+ "source": [
313
+ "train_df"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "code",
318
+ "execution_count": 9,
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "train_df = df.sample(frac = 0.7, random_state = 200)\n",
323
+ "df_2 = df.drop(train_df.index)\n",
324
+ "test_df = df_2.sample(frac = 0.15, random_state = 200)\n",
325
+ "val_df = df_2.drop(test_df.index)"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 10,
331
+ "metadata": {},
332
+ "outputs": [],
333
+ "source": [
334
+ "text_train = train_df.tweet.values\n",
335
+ "label_train = train_df.subtask_a.values"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": 11,
341
+ "metadata": {},
342
+ "outputs": [],
343
+ "source": [
344
+ "text_test = test_df.tweet.values\n",
345
+ "label_test = test_df.subtask_a.values"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": 12,
351
+ "metadata": {},
352
+ "outputs": [],
353
+ "source": [
354
+ "text_val = val_df.tweet.values\n",
355
+ "label_val = val_df.subtask_a.values"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": 13,
361
+ "metadata": {},
362
+ "outputs": [],
363
+ "source": [
364
+ "from datasets.dataset_dict import DatasetDict\n",
365
+ "from datasets import Dataset\n",
366
+ "dataset={'train':Dataset.from_dict({'label':label_train,'text':text_train}),\n",
367
+ " 'val':Dataset.from_dict({'label':label_val,'text':text_val}),\n",
368
+ " 'test':Dataset.from_dict({'label':label_test,'text':text_test})\n",
369
+ " }\n",
370
+ "dataset = DatasetDict(dataset)"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 14,
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "# tokenizer = AutoTokenizer.from_pretrained(\"dbmdz/bert-base-turkish-128k-uncased\")\n",
380
+ "# tokenizer = AutoTokenizer.from_pretrained(\"dbmdz/distilbert-base-turkish-cased\")\n",
381
+ "tokenizer = AutoTokenizer.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\")\n",
382
+ "# tokenizer = AutoTokenizer.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",max_length=208,padding=\"max_length\",truncation=True,return_tensors=\"pt\",add_special_tokens=True,)\n",
383
+ "# tokenizer = AutoTokenizer.from_pretrained(\"stage_f/pretrain_mlm_distilbert-base-turkish-cased\")\n",
384
+ "def tokenize_function(examples):\n",
385
+ " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 15,
391
+ "metadata": {},
392
+ "outputs": [
393
+ {
394
+ "data": {
395
+ "application/vnd.jupyter.widget-view+json": {
396
+ "model_id": "5fba4c9671724e9a93d6ad14a1427345",
397
+ "version_major": 2,
398
+ "version_minor": 0
399
+ },
400
+ "text/plain": [
401
+ "Map: 0%| | 0/28827 [00:00<?, ? examples/s]"
402
+ ]
403
+ },
404
+ "metadata": {},
405
+ "output_type": "display_data"
406
+ },
407
+ {
408
+ "data": {
409
+ "application/vnd.jupyter.widget-view+json": {
410
+ "model_id": "2fff446f4f094d2fb66da549a49ad8a4",
411
+ "version_major": 2,
412
+ "version_minor": 0
413
+ },
414
+ "text/plain": [
415
+ "Map: 0%| | 0/10502 [00:00<?, ? examples/s]"
416
+ ]
417
+ },
418
+ "metadata": {},
419
+ "output_type": "display_data"
420
+ },
421
+ {
422
+ "data": {
423
+ "application/vnd.jupyter.widget-view+json": {
424
+ "model_id": "675f3b595b21489abaca01453c06db2c",
425
+ "version_major": 2,
426
+ "version_minor": 0
427
+ },
428
+ "text/plain": [
429
+ "Map: 0%| | 0/1853 [00:00<?, ? examples/s]"
430
+ ]
431
+ },
432
+ "metadata": {},
433
+ "output_type": "display_data"
434
+ }
435
+ ],
436
+ "source": [
437
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 16,
443
+ "metadata": {},
444
+ "outputs": [],
445
+ "source": [
446
+ "small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42)\n",
447
+ "small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42)"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": 17,
453
+ "metadata": {},
454
+ "outputs": [
455
+ {
456
+ "data": {
457
+ "text/plain": [
458
+ "Dataset({\n",
459
+ " features: ['label', 'text', 'input_ids', 'attention_mask'],\n",
460
+ " num_rows: 28827\n",
461
+ "})"
462
+ ]
463
+ },
464
+ "execution_count": 17,
465
+ "metadata": {},
466
+ "output_type": "execute_result"
467
+ }
468
+ ],
469
+ "source": [
470
+ "small_train_dataset"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 18,
476
+ "metadata": {},
477
+ "outputs": [
478
+ {
479
+ "data": {
480
+ "text/plain": [
481
+ "Dataset({\n",
482
+ " features: ['label', 'text', 'input_ids', 'attention_mask'],\n",
483
+ " num_rows: 1853\n",
484
+ "})"
485
+ ]
486
+ },
487
+ "execution_count": 18,
488
+ "metadata": {},
489
+ "output_type": "execute_result"
490
+ }
491
+ ],
492
+ "source": [
493
+ "small_eval_dataset"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 19,
499
+ "metadata": {},
500
+ "outputs": [
501
+ {
502
+ "name": "stderr",
503
+ "output_type": "stream",
504
+ "text": [
505
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Overfit-GM/distilbert-base-turkish-cased-offensive and are newly initialized because the shapes did not match:\n",
506
+ "- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated\n",
507
+ "- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated\n",
508
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
509
+ ]
510
+ }
511
+ ],
512
+ "source": [
513
+ "from transformers import AutoModelForSequenceClassification\n",
514
+ "\n",
515
+ "# model = AutoModelForSequenceClassification.from_pretrained(\"dbmdz/bert-base-turkish-128k-uncased\",num_labels = 2)\n",
516
+ "# model = AutoModelForSequenceClassification.from_pretrained(\"dbmdz/distilbert-base-turkish-cased\",num_labels = 2)\n",
517
+ "# model = AutoModelForSequenceClassification.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",num_labels = 2, ignore_mismatched_sizes=True)\n",
518
+ "model = AutoModelForSequenceClassification.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",num_labels = 2, ignore_mismatched_sizes=True)"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": 20,
524
+ "metadata": {},
525
+ "outputs": [],
526
+ "source": [
527
+ "from transformers import TrainingArguments\n",
528
+ "\n",
529
+ "training_args = TrainingArguments(output_dir=\"test_trainer\")"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 21,
535
+ "metadata": {},
536
+ "outputs": [],
537
+ "source": [
538
+ "# import numpy as np\n",
539
+ "# import evaluate\n",
540
+ "\n",
541
+ "# # metric = evaluate.load(\"accuracy\")\n",
542
+ "# # confusion_matrix = evaluate.load(\"BucketHeadP65/confusion_matrix\")\n",
543
+ "# # metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\", \"confusion_matrix\"])\n",
544
+ "# metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 22,
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": [
553
+ "import numpy as np\n",
554
+ "import evaluate\n",
555
+ "\n",
556
+ "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n",
557
+ "conf_matrix = evaluate.load(\"BucketHeadP65/confusion_matrix\")"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 23,
563
+ "metadata": {},
564
+ "outputs": [],
565
+ "source": [
566
+ "def compute_metrics(eval_pred):\n",
567
+ " logits, labels = eval_pred\n",
568
+ " predictions = np.argmax(logits, axis=-1)\n",
569
+ " print(conf_matrix.compute(predictions=predictions, references=labels))\n",
570
+ " return metric.compute(predictions=predictions, references=labels)"
571
+ ]
572
+ },
573
+ {
574
+ "cell_type": "code",
575
+ "execution_count": 24,
576
+ "metadata": {},
577
+ "outputs": [],
578
+ "source": [
579
+ "from transformers import TrainingArguments, Trainer\n",
580
+ "from pytorch_lightning.loggers import TensorBoardLogger,MLFlowLogger\n",
581
+ "\n",
582
+ "training_args = TrainingArguments(output_dir=\"test_trainer\", evaluation_strategy=\"epoch\", num_train_epochs = 5, logging_dir ='TensorBoard',report_to ='mlflow')"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": 25,
588
+ "metadata": {},
589
+ "outputs": [],
590
+ "source": [
591
+ "trainer = Trainer(\n",
592
+ " model=model,\n",
593
+ " args=training_args,\n",
594
+ " train_dataset=small_train_dataset,\n",
595
+ " eval_dataset=small_eval_dataset,\n",
596
+ " compute_metrics=compute_metrics,\n",
597
+ ")"
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": 26,
603
+ "metadata": {},
604
+ "outputs": [
605
+ {
606
+ "name": "stderr",
607
+ "output_type": "stream",
608
+ "text": [
609
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
610
+ "To disable this warning, you can either:\n",
611
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
612
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
613
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
614
+ "To disable this warning, you can either:\n",
615
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
616
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
617
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
618
+ "To disable this warning, you can either:\n",
619
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
620
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
621
+ ]
622
+ },
623
+ {
624
+ "data": {
625
+ "application/vnd.jupyter.widget-view+json": {
626
+ "model_id": "a38121a009be4a0f90e30fc9c0cf49ed",
627
+ "version_major": 2,
628
+ "version_minor": 0
629
+ },
630
+ "text/plain": [
631
+ " 0%| | 0/18020 [00:00<?, ?it/s]"
632
+ ]
633
+ },
634
+ "metadata": {},
635
+ "output_type": "display_data"
636
+ },
637
+ {
638
+ "name": "stdout",
639
+ "output_type": "stream",
640
+ "text": [
641
+ "{'loss': 0.4638, 'learning_rate': 4.86126526082131e-05, 'epoch': 0.14}\n",
642
+ "{'loss': 0.3886, 'learning_rate': 4.72253052164262e-05, 'epoch': 0.28}\n",
643
+ "{'loss': 0.3893, 'learning_rate': 4.583795782463929e-05, 'epoch': 0.42}\n",
644
+ "{'loss': 0.3594, 'learning_rate': 4.445061043285239e-05, 'epoch': 0.55}\n",
645
+ "{'loss': 0.3547, 'learning_rate': 4.306326304106548e-05, 'epoch': 0.69}\n",
646
+ "{'loss': 0.3384, 'learning_rate': 4.167591564927858e-05, 'epoch': 0.83}\n",
647
+ "{'loss': 0.3498, 'learning_rate': 4.028856825749168e-05, 'epoch': 0.97}\n"
648
+ ]
649
+ },
650
+ {
651
+ "data": {
652
+ "application/vnd.jupyter.widget-view+json": {
653
+ "model_id": "94ab139e1ebb482da2111517ad5a3a78",
654
+ "version_major": 2,
655
+ "version_minor": 0
656
+ },
657
+ "text/plain": [
658
+ " 0%| | 0/232 [00:00<?, ?it/s]"
659
+ ]
660
+ },
661
+ "metadata": {},
662
+ "output_type": "display_data"
663
+ },
664
+ {
665
+ "name": "stdout",
666
+ "output_type": "stream",
667
+ "text": [
668
+ "{'confusion_matrix': array([[966, 90],\n",
669
+ " [118, 679]])}\n",
670
+ "{'eval_loss': 0.28741681575775146, 'eval_accuracy': 0.8877495952509444, 'eval_f1': 0.8671775223499362, 'eval_precision': 0.88296488946684, 'eval_recall': 0.8519447929736512, 'eval_runtime': 11.4928, 'eval_samples_per_second': 161.231, 'eval_steps_per_second': 20.186, 'epoch': 1.0}\n",
671
+ "{'loss': 0.2449, 'learning_rate': 3.890122086570477e-05, 'epoch': 1.11}\n",
672
+ "{'loss': 0.2178, 'learning_rate': 3.751387347391787e-05, 'epoch': 1.25}\n",
673
+ "{'loss': 0.2431, 'learning_rate': 3.612652608213097e-05, 'epoch': 1.39}\n",
674
+ "{'loss': 0.2261, 'learning_rate': 3.4739178690344064e-05, 'epoch': 1.53}\n",
675
+ "{'loss': 0.2365, 'learning_rate': 3.3351831298557165e-05, 'epoch': 1.66}\n",
676
+ "{'loss': 0.2169, 'learning_rate': 3.196448390677026e-05, 'epoch': 1.8}\n",
677
+ "{'loss': 0.222, 'learning_rate': 3.0577136514983354e-05, 'epoch': 1.94}\n"
678
+ ]
679
+ },
680
+ {
681
+ "data": {
682
+ "application/vnd.jupyter.widget-view+json": {
683
+ "model_id": "063c47c6cae0467194d4c0827e67c277",
684
+ "version_major": 2,
685
+ "version_minor": 0
686
+ },
687
+ "text/plain": [
688
+ " 0%| | 0/232 [00:00<?, ?it/s]"
689
+ ]
690
+ },
691
+ "metadata": {},
692
+ "output_type": "display_data"
693
+ },
694
+ {
695
+ "name": "stdout",
696
+ "output_type": "stream",
697
+ "text": [
698
+ "{'confusion_matrix': array([[900, 156],\n",
699
+ " [ 76, 721]])}\n",
700
+ "{'eval_loss': 0.47509443759918213, 'eval_accuracy': 0.8747976254722072, 'eval_f1': 0.8614097968936678, 'eval_precision': 0.82212086659065, 'eval_recall': 0.904642409033877, 'eval_runtime': 11.6203, 'eval_samples_per_second': 159.462, 'eval_steps_per_second': 19.965, 'epoch': 2.0}\n",
701
+ "{'loss': 0.146, 'learning_rate': 2.918978912319645e-05, 'epoch': 2.08}\n",
702
+ "{'loss': 0.1163, 'learning_rate': 2.7802441731409544e-05, 'epoch': 2.22}\n",
703
+ "{'loss': 0.1008, 'learning_rate': 2.641509433962264e-05, 'epoch': 2.36}\n",
704
+ "{'loss': 0.0967, 'learning_rate': 2.502774694783574e-05, 'epoch': 2.5}\n",
705
+ "{'loss': 0.1456, 'learning_rate': 2.3640399556048838e-05, 'epoch': 2.64}\n",
706
+ "{'loss': 0.1178, 'learning_rate': 2.2253052164261932e-05, 'epoch': 2.77}\n",
707
+ "{'loss': 0.1155, 'learning_rate': 2.0865704772475027e-05, 'epoch': 2.91}\n"
708
+ ]
709
+ },
710
+ {
711
+ "data": {
712
+ "application/vnd.jupyter.widget-view+json": {
713
+ "model_id": "4fa52dfbbae54cde8c627a237bed51bc",
714
+ "version_major": 2,
715
+ "version_minor": 0
716
+ },
717
+ "text/plain": [
718
+ " 0%| | 0/232 [00:00<?, ?it/s]"
719
+ ]
720
+ },
721
+ "metadata": {},
722
+ "output_type": "display_data"
723
+ },
724
+ {
725
+ "name": "stdout",
726
+ "output_type": "stream",
727
+ "text": [
728
+ "{'confusion_matrix': array([[954, 102],\n",
729
+ " [106, 691]])}\n",
730
+ "{'eval_loss': 0.5530020594596863, 'eval_accuracy': 0.8877495952509444, 'eval_f1': 0.8691823899371071, 'eval_precision': 0.8713745271122321, 'eval_recall': 0.8670012547051443, 'eval_runtime': 11.6026, 'eval_samples_per_second': 159.706, 'eval_steps_per_second': 19.996, 'epoch': 3.0}\n",
731
+ "{'loss': 0.0879, 'learning_rate': 1.9478357380688125e-05, 'epoch': 3.05}\n",
732
+ "{'loss': 0.0351, 'learning_rate': 1.8091009988901223e-05, 'epoch': 3.19}\n",
733
+ "{'loss': 0.0501, 'learning_rate': 1.670366259711432e-05, 'epoch': 3.33}\n",
734
+ "{'loss': 0.0425, 'learning_rate': 1.5316315205327412e-05, 'epoch': 3.47}\n",
735
+ "{'loss': 0.0564, 'learning_rate': 1.392896781354051e-05, 'epoch': 3.61}\n",
736
+ "{'loss': 0.05, 'learning_rate': 1.2541620421753608e-05, 'epoch': 3.75}\n",
737
+ "{'loss': 0.034, 'learning_rate': 1.1154273029966705e-05, 'epoch': 3.88}\n"
738
+ ]
739
+ },
740
+ {
741
+ "data": {
742
+ "application/vnd.jupyter.widget-view+json": {
743
+ "model_id": "a9b754cd0e7641cb8d8023f28bc32a06",
744
+ "version_major": 2,
745
+ "version_minor": 0
746
+ },
747
+ "text/plain": [
748
+ " 0%| | 0/232 [00:00<?, ?it/s]"
749
+ ]
750
+ },
751
+ "metadata": {},
752
+ "output_type": "display_data"
753
+ },
754
+ {
755
+ "name": "stdout",
756
+ "output_type": "stream",
757
+ "text": [
758
+ "{'confusion_matrix': array([[966, 90],\n",
759
+ " [109, 688]])}\n",
760
+ "{'eval_loss': 0.824292778968811, 'eval_accuracy': 0.8926065839179709, 'eval_f1': 0.8736507936507937, 'eval_precision': 0.884318766066838, 'eval_recall': 0.863237139272271, 'eval_runtime': 11.6185, 'eval_samples_per_second': 159.487, 'eval_steps_per_second': 19.968, 'epoch': 4.0}\n",
761
+ "{'loss': 0.0354, 'learning_rate': 9.766925638179801e-06, 'epoch': 4.02}\n",
762
+ "{'loss': 0.0165, 'learning_rate': 8.379578246392897e-06, 'epoch': 4.16}\n",
763
+ "{'loss': 0.0119, 'learning_rate': 6.992230854605994e-06, 'epoch': 4.3}\n",
764
+ "{'loss': 0.0145, 'learning_rate': 5.60488346281909e-06, 'epoch': 4.44}\n",
765
+ "{'loss': 0.0169, 'learning_rate': 4.217536071032187e-06, 'epoch': 4.58}\n",
766
+ "{'loss': 0.0132, 'learning_rate': 2.830188679245283e-06, 'epoch': 4.72}\n",
767
+ "{'loss': 0.0232, 'learning_rate': 1.4428412874583796e-06, 'epoch': 4.86}\n",
768
+ "{'loss': 0.0189, 'learning_rate': 5.549389567147614e-08, 'epoch': 4.99}\n"
769
+ ]
770
+ },
771
+ {
772
+ "data": {
773
+ "application/vnd.jupyter.widget-view+json": {
774
+ "model_id": "e66e5b59c6ba42ae9939f55dcda3c877",
775
+ "version_major": 2,
776
+ "version_minor": 0
777
+ },
778
+ "text/plain": [
779
+ " 0%| | 0/232 [00:00<?, ?it/s]"
780
+ ]
781
+ },
782
+ "metadata": {},
783
+ "output_type": "display_data"
784
+ },
785
+ {
786
+ "name": "stdout",
787
+ "output_type": "stream",
788
+ "text": [
789
+ "{'confusion_matrix': array([[955, 101],\n",
790
+ " [111, 686]])}\n",
791
+ "{'eval_loss': 0.937654972076416, 'eval_accuracy': 0.8855909336211549, 'eval_f1': 0.8661616161616161, 'eval_precision': 0.8716645489199492, 'eval_recall': 0.8607277289836889, 'eval_runtime': 11.5644, 'eval_samples_per_second': 160.233, 'eval_steps_per_second': 20.062, 'epoch': 5.0}\n",
792
+ "{'train_runtime': 3027.4521, 'train_samples_per_second': 47.609, 'train_steps_per_second': 5.952, 'train_loss': 0.15528733040680712, 'epoch': 5.0}\n"
793
+ ]
794
+ },
795
+ {
796
+ "data": {
797
+ "text/plain": [
798
+ "TrainOutput(global_step=18020, training_loss=0.15528733040680712, metrics={'train_runtime': 3027.4521, 'train_samples_per_second': 47.609, 'train_steps_per_second': 5.952, 'train_loss': 0.15528733040680712, 'epoch': 5.0})"
799
+ ]
800
+ },
801
+ "execution_count": 26,
802
+ "metadata": {},
803
+ "output_type": "execute_result"
804
+ }
805
+ ],
806
+ "source": [
807
+ "trainer.train()"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "markdown",
812
+ "metadata": {},
813
+ "source": [
814
+ "# best case"
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "code",
819
+ "execution_count": 25,
820
+ "metadata": {},
821
+ "outputs": [
822
+ {
823
+ "name": "stderr",
824
+ "output_type": "stream",
825
+ "text": [
826
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
827
+ "To disable this warning, you can either:\n",
828
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
829
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
830
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
831
+ "To disable this warning, you can either:\n",
832
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
833
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
834
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
835
+ "To disable this warning, you can either:\n",
836
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
837
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
838
+ ]
839
+ },
840
+ {
841
+ "data": {
842
+ "application/vnd.jupyter.widget-view+json": {
843
+ "model_id": "4620503cb22c41a582c44a3d17fac2f6",
844
+ "version_major": 2,
845
+ "version_minor": 0
846
+ },
847
+ "text/plain": [
848
+ " 0%| | 0/18825 [00:00<?, ?it/s]"
849
+ ]
850
+ },
851
+ "metadata": {},
852
+ "output_type": "display_data"
853
+ },
854
+ {
855
+ "name": "stdout",
856
+ "output_type": "stream",
857
+ "text": [
858
+ "{'loss': 0.4623, 'learning_rate': 4.867197875166003e-05, 'epoch': 0.13}\n",
859
+ "{'loss': 0.3955, 'learning_rate': 4.734395750332006e-05, 'epoch': 0.27}\n",
860
+ "{'loss': 0.3695, 'learning_rate': 4.601593625498008e-05, 'epoch': 0.4}\n",
861
+ "{'loss': 0.368, 'learning_rate': 4.4687915006640105e-05, 'epoch': 0.53}\n",
862
+ "{'loss': 0.3418, 'learning_rate': 4.335989375830013e-05, 'epoch': 0.66}\n",
863
+ "{'loss': 0.3519, 'learning_rate': 4.203187250996016e-05, 'epoch': 0.8}\n",
864
+ "{'loss': 0.3418, 'learning_rate': 4.070385126162019e-05, 'epoch': 0.93}\n"
865
+ ]
866
+ },
867
+ {
868
+ "data": {
869
+ "application/vnd.jupyter.widget-view+json": {
870
+ "model_id": "c81779b9a7eb43cfa29966957f13ec31",
871
+ "version_major": 2,
872
+ "version_minor": 0
873
+ },
874
+ "text/plain": [
875
+ " 0%| | 0/242 [00:00<?, ?it/s]"
876
+ ]
877
+ },
878
+ "metadata": {},
879
+ "output_type": "display_data"
880
+ },
881
+ {
882
+ "name": "stdout",
883
+ "output_type": "stream",
884
+ "text": [
885
+ "{'eval_loss': 0.2548353374004364, 'eval_accuracy': 0.9013429752066116, 'eval_f1': 0.8737607402511566, 'eval_precision': 0.9218967921896792, 'eval_recall': 0.8304020100502513, 'eval_runtime': 12.1488, 'eval_samples_per_second': 159.357, 'eval_steps_per_second': 19.92, 'epoch': 1.0}\n",
886
+ "{'loss': 0.2884, 'learning_rate': 3.9375830013280215e-05, 'epoch': 1.06}\n",
887
+ "{'loss': 0.2136, 'learning_rate': 3.804780876494024e-05, 'epoch': 1.2}\n",
888
+ "{'loss': 0.2422, 'learning_rate': 3.671978751660027e-05, 'epoch': 1.33}\n",
889
+ "{'loss': 0.2105, 'learning_rate': 3.53917662682603e-05, 'epoch': 1.46}\n",
890
+ "{'loss': 0.2203, 'learning_rate': 3.406374501992032e-05, 'epoch': 1.59}\n",
891
+ "{'loss': 0.2455, 'learning_rate': 3.2735723771580345e-05, 'epoch': 1.73}\n",
892
+ "{'loss': 0.2282, 'learning_rate': 3.140770252324037e-05, 'epoch': 1.86}\n",
893
+ "{'loss': 0.2328, 'learning_rate': 3.00796812749004e-05, 'epoch': 1.99}\n"
894
+ ]
895
+ },
896
+ {
897
+ "data": {
898
+ "application/vnd.jupyter.widget-view+json": {
899
+ "model_id": "f83c5030d5c34216ba6422f2c22858ba",
900
+ "version_major": 2,
901
+ "version_minor": 0
902
+ },
903
+ "text/plain": [
904
+ " 0%| | 0/242 [00:00<?, ?it/s]"
905
+ ]
906
+ },
907
+ "metadata": {},
908
+ "output_type": "display_data"
909
+ },
910
+ {
911
+ "name": "stdout",
912
+ "output_type": "stream",
913
+ "text": [
914
+ "{'eval_loss': 0.4118729829788208, 'eval_accuracy': 0.8982438016528925, 'eval_f1': 0.8763339610797238, 'eval_precision': 0.875784190715182, 'eval_recall': 0.8768844221105527, 'eval_runtime': 12.1691, 'eval_samples_per_second': 159.092, 'eval_steps_per_second': 19.886, 'epoch': 2.0}\n",
915
+ "{'loss': 0.1086, 'learning_rate': 2.8751660026560427e-05, 'epoch': 2.12}\n",
916
+ "{'loss': 0.1137, 'learning_rate': 2.742363877822045e-05, 'epoch': 2.26}\n",
917
+ "{'loss': 0.1058, 'learning_rate': 2.609561752988048e-05, 'epoch': 2.39}\n",
918
+ "{'loss': 0.1073, 'learning_rate': 2.4767596281540506e-05, 'epoch': 2.52}\n",
919
+ "{'loss': 0.0953, 'learning_rate': 2.3439575033200534e-05, 'epoch': 2.66}\n",
920
+ "{'loss': 0.1066, 'learning_rate': 2.2111553784860558e-05, 'epoch': 2.79}\n",
921
+ "{'loss': 0.1152, 'learning_rate': 2.0783532536520585e-05, 'epoch': 2.92}\n"
922
+ ]
923
+ },
924
+ {
925
+ "data": {
926
+ "application/vnd.jupyter.widget-view+json": {
927
+ "model_id": "3c4d464cb3a340d4aa4f6a1a8e4d95b9",
928
+ "version_major": 2,
929
+ "version_minor": 0
930
+ },
931
+ "text/plain": [
932
+ " 0%| | 0/242 [00:00<?, ?it/s]"
933
+ ]
934
+ },
935
+ "metadata": {},
936
+ "output_type": "display_data"
937
+ },
938
+ {
939
+ "name": "stdout",
940
+ "output_type": "stream",
941
+ "text": [
942
+ "{'eval_loss': 0.4992543160915375, 'eval_accuracy': 0.9039256198347108, 'eval_f1': 0.8831658291457286, 'eval_precision': 0.8831658291457286, 'eval_recall': 0.8831658291457286, 'eval_runtime': 12.145, 'eval_samples_per_second': 159.407, 'eval_steps_per_second': 19.926, 'epoch': 3.0}\n",
943
+ "{'loss': 0.0761, 'learning_rate': 1.9455511288180613e-05, 'epoch': 3.05}\n",
944
+ "{'loss': 0.0434, 'learning_rate': 1.812749003984064e-05, 'epoch': 3.19}\n",
945
+ "{'loss': 0.0395, 'learning_rate': 1.6799468791500664e-05, 'epoch': 3.32}\n",
946
+ "{'loss': 0.0516, 'learning_rate': 1.547144754316069e-05, 'epoch': 3.45}\n",
947
+ "{'loss': 0.0344, 'learning_rate': 1.4143426294820719e-05, 'epoch': 3.59}\n",
948
+ "{'loss': 0.0588, 'learning_rate': 1.2815405046480745e-05, 'epoch': 3.72}\n",
949
+ "{'loss': 0.0323, 'learning_rate': 1.148738379814077e-05, 'epoch': 3.85}\n",
950
+ "{'loss': 0.0574, 'learning_rate': 1.0159362549800798e-05, 'epoch': 3.98}\n"
951
+ ]
952
+ },
953
+ {
954
+ "data": {
955
+ "application/vnd.jupyter.widget-view+json": {
956
+ "model_id": "bf0675bd947c472bb221d755dc55a219",
957
+ "version_major": 2,
958
+ "version_minor": 0
959
+ },
960
+ "text/plain": [
961
+ " 0%| | 0/242 [00:00<?, ?it/s]"
962
+ ]
963
+ },
964
+ "metadata": {},
965
+ "output_type": "display_data"
966
+ },
967
+ {
968
+ "name": "stdout",
969
+ "output_type": "stream",
970
+ "text": [
971
+ "{'eval_loss': 0.6084339618682861, 'eval_accuracy': 0.9121900826446281, 'eval_f1': 0.8933500627352573, 'eval_precision': 0.8922305764411027, 'eval_recall': 0.8944723618090452, 'eval_runtime': 11.9875, 'eval_samples_per_second': 161.502, 'eval_steps_per_second': 20.188, 'epoch': 4.0}\n",
972
+ "{'loss': 0.0175, 'learning_rate': 8.831341301460823e-06, 'epoch': 4.12}\n",
973
+ "{'loss': 0.0248, 'learning_rate': 7.503320053120851e-06, 'epoch': 4.25}\n",
974
+ "{'loss': 0.0212, 'learning_rate': 6.175298804780877e-06, 'epoch': 4.38}\n",
975
+ "{'loss': 0.0215, 'learning_rate': 4.847277556440903e-06, 'epoch': 4.52}\n",
976
+ "{'loss': 0.0216, 'learning_rate': 3.51925630810093e-06, 'epoch': 4.65}\n",
977
+ "{'loss': 0.0169, 'learning_rate': 2.1912350597609563e-06, 'epoch': 4.78}\n",
978
+ "{'loss': 0.0199, 'learning_rate': 8.632138114209828e-07, 'epoch': 4.91}\n"
979
+ ]
980
+ },
981
+ {
982
+ "data": {
983
+ "application/vnd.jupyter.widget-view+json": {
984
+ "model_id": "0ac0cee28031479d9721321ec9c949a4",
985
+ "version_major": 2,
986
+ "version_minor": 0
987
+ },
988
+ "text/plain": [
989
+ " 0%| | 0/242 [00:00<?, ?it/s]"
990
+ ]
991
+ },
992
+ "metadata": {},
993
+ "output_type": "display_data"
994
+ },
995
+ {
996
+ "name": "stdout",
997
+ "output_type": "stream",
998
+ "text": [
999
+ "{'eval_loss': 0.6909418106079102, 'eval_accuracy': 0.9158057851239669, 'eval_f1': 0.8963763509218055, 'eval_precision': 0.9073359073359073, 'eval_recall': 0.885678391959799, 'eval_runtime': 12.1798, 'eval_samples_per_second': 158.952, 'eval_steps_per_second': 19.869, 'epoch': 5.0}\n",
1000
+ "{'train_runtime': 3197.4084, 'train_samples_per_second': 47.101, 'train_steps_per_second': 5.888, 'train_loss': 0.15457879885892628, 'epoch': 5.0}\n"
1001
+ ]
1002
+ },
1003
+ {
1004
+ "data": {
1005
+ "text/plain": [
1006
+ "TrainOutput(global_step=18825, training_loss=0.15457879885892628, metrics={'train_runtime': 3197.4084, 'train_samples_per_second': 47.101, 'train_steps_per_second': 5.888, 'train_loss': 0.15457879885892628, 'epoch': 5.0})"
1007
+ ]
1008
+ },
1009
+ "execution_count": 25,
1010
+ "metadata": {},
1011
+ "output_type": "execute_result"
1012
+ }
1013
+ ],
1014
+ "source": [
1015
+ "trainer.train()"
1016
+ ]
1017
+ },
1018
+ {
1019
+ "cell_type": "markdown",
1020
+ "metadata": {},
1021
+ "source": [
1022
+ "# load model"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": 20,
1028
+ "metadata": {},
1029
+ "outputs": [],
1030
+ "source": [
1031
+ "import torch"
1032
+ ]
1033
+ },
1034
+ {
1035
+ "cell_type": "code",
1036
+ "execution_count": 21,
1037
+ "metadata": {},
1038
+ "outputs": [],
1039
+ "source": [
1040
+ "model = AutoModelForSequenceClassification.from_pretrained('/DATA/sin-kaf/test_trainer/checkpoint-16000')"
1041
+ ]
1042
+ },
1043
+ {
1044
+ "cell_type": "code",
1045
+ "execution_count": 30,
1046
+ "metadata": {},
1047
+ "outputs": [
1048
+ {
1049
+ "name": "stdout",
1050
+ "output_type": "stream",
1051
+ "text": [
1052
+ "Original: güzel kızz\n",
1053
+ "Token IDs: tensor([[ 2, 2639, 2889, 1050, 3, 0, 0, 0, 0, 0, 0, 0,\n",
1054
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1055
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1056
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1057
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1058
+ " 0, 0, 0, 0]])\n",
1059
+ "Token IDs: tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1060
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1061
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])\n"
1062
+ ]
1063
+ }
1064
+ ],
1065
+ "source": [
1066
+ "sent = 'güzel kızz'\n",
1067
+ "input_ids = []\n",
1068
+ "attention_masks = []\n",
1069
+ "\n",
1070
+ "encoded_dict = tokenizer.encode_plus(\n",
1071
+ " sent,\n",
1072
+ " add_special_tokens = True,\n",
1073
+ " max_length = 64,\n",
1074
+ " pad_to_max_length = True,\n",
1075
+ " return_attention_mask = True,\n",
1076
+ " return_tensors = 'pt',\n",
1077
+ " )\n",
1078
+ "\n",
1079
+ "\n",
1080
+ "input_ids = encoded_dict['input_ids']\n",
1081
+ "attention_masks = encoded_dict['attention_mask']\n",
1082
+ "\n",
1083
+ "\n",
1084
+ "input_ids = torch.cat([input_ids], dim=0)\n",
1085
+ "input_mask = torch.cat([attention_masks], dim=0)\n",
1086
+ "\n",
1087
+ "\n",
1088
+ "\n",
1089
+ "print('Original: ', sent)\n",
1090
+ "print('Token IDs:', input_ids)\n",
1091
+ "print('Token IDs:', input_mask)"
1092
+ ]
1093
+ },
1094
+ {
1095
+ "cell_type": "code",
1096
+ "execution_count": 31,
1097
+ "metadata": {},
1098
+ "outputs": [],
1099
+ "source": [
1100
+ "outputs = model(input_ids, input_mask)"
1101
+ ]
1102
+ },
1103
+ {
1104
+ "cell_type": "code",
1105
+ "execution_count": 32,
1106
+ "metadata": {},
1107
+ "outputs": [
1108
+ {
1109
+ "data": {
1110
+ "text/plain": [
1111
+ "SequenceClassifierOutput(loss=None, logits=tensor([[ 3.6835, -3.6147]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)"
1112
+ ]
1113
+ },
1114
+ "execution_count": 32,
1115
+ "metadata": {},
1116
+ "output_type": "execute_result"
1117
+ }
1118
+ ],
1119
+ "source": [
1120
+ "outputs"
1121
+ ]
1122
+ },
1123
+ {
1124
+ "cell_type": "code",
1125
+ "execution_count": 33,
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "data": {
1130
+ "text/plain": [
1131
+ "tensor(0)"
1132
+ ]
1133
+ },
1134
+ "execution_count": 33,
1135
+ "metadata": {},
1136
+ "output_type": "execute_result"
1137
+ }
1138
+ ],
1139
+ "source": [
1140
+ "torch.argmax(outputs['logits'])"
1141
+ ]
1142
+ }
1143
+ ],
1144
+ "metadata": {
1145
+ "kernelspec": {
1146
+ "display_name": "dlenv",
1147
+ "language": "python",
1148
+ "name": "python3"
1149
+ },
1150
+ "language_info": {
1151
+ "codemirror_mode": {
1152
+ "name": "ipython",
1153
+ "version": 3
1154
+ },
1155
+ "file_extension": ".py",
1156
+ "mimetype": "text/x-python",
1157
+ "name": "python",
1158
+ "nbconvert_exporter": "python",
1159
+ "pygments_lexer": "ipython3",
1160
+ "version": "3.9.17"
1161
+ }
1162
+ },
1163
+ "nbformat": 4,
1164
+ "nbformat_minor": 2
1165
+ }
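
For reference, the manual encode_plus / forward / argmax steps in the last cells of trainer.ipynb can be collapsed into a single call with the transformers pipeline API. A minimal sketch, assuming the checkpoint directory loaded above still exists on disk:

# Minimal inference sketch; the checkpoint path is the one loaded in the
# cells above. pipeline() bundles tokenization, the forward pass and argmax.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("Overfit-GM/distilbert-base-turkish-cased-offensive")
model = AutoModelForSequenceClassification.from_pretrained("/DATA/sin-kaf/test_trainer/checkpoint-16000")

clf = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(clf("güzel kızz"))  # expected: the class with index 0, matching torch.argmax above
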